author      Sidath Senanayake <sidaths@google.com>    2021-06-15 13:39:30 +0100
committer   Sidath Senanayake <sidaths@google.com>    2021-06-15 14:11:16 +0100
commit      fca8613cfcf585bf9113dca96a05daea9fd89794
tree        f2baa14910f83edf00450bc30d3703eb255a0bba
parent      8037b534570814775d79aeddd06b76e5ee941f59
download    gpu-fca8613cfcf585bf9113dca96a05daea9fd89794.tar.gz
Mali Valhall DDK r31p0 KMD
Provenance: 2ea0ef9bd (collaborate/EAC/v_r31p0)
VX504X08X-BU-00000-r31p0-01eac0 - Valhall Android DDK
VX504X08X-BU-60000-r31p0-01eac0 - Valhall Android Document Bundle
VX504X08X-DC-11001-r31p0-01eac0 - Valhall Android DDK Software Errata
VX504X08X-SW-99006-r31p0-01eac0 - Valhall Android Renderscript AOSP parts
Signed-off-by: Sidath Senanayake <sidaths@google.com>
Change-Id: Ide9d5fdc6d9c95fa66a3546b01f619b43c09496d
132 files changed, 4422 insertions, 3277 deletions
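The first hunk below renames the pcm_scheduler_priority_check() argument from mgm_dev to pcm_dev and adds a data pointer to struct priority_control_manager_device. As a rough illustration of how an integrator's module might use the new member to carry private state into that callback, here is a minimal sketch; the include path, the "example_*" identifiers and the CAP_SYS_NICE policy are assumptions for illustration, not part of this drop:

    #include <linux/module.h>
    #include <linux/capability.h>
    #include <linux/sched.h>
    #include <linux/priority_control_manager.h>  /* header added under common/include in this drop */

    /* Hypothetical platform-private state, reachable through the new ->data member. */
    struct example_pcm_state {
        int default_priority;
    };

    static struct example_pcm_state example_state = { .default_priority = 0 };

    static int example_priority_check(struct priority_control_manager_device *pcm_dev,
                                      struct task_struct *task, int requested_priority)
    {
        struct example_pcm_state *state = pcm_dev->data;

        /* Privileged tasks get what they asked for; everyone else falls back to
         * the platform default held in the private data (policy assumed here).
         */
        if (has_capability(task, CAP_SYS_NICE))
            return requested_priority;
        return state->default_priority;
    }

    /* Device instance wired up with the embedded ops, the new data pointer and
     * the module owner; registration with the platform device is omitted.
     */
    static struct priority_control_manager_device example_pcm_dev = {
        .ops   = { .pcm_scheduler_priority_check = example_priority_check },
        .data  = &example_state,
        .owner = THIS_MODULE,
    };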
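The new uapi header mali_kbase_model_dummy.h (added below) sizes the dummy model's performance-counter dump entirely through macros. As a quick cross-check of those definitions: each block carries 4 header plus 60 counter values (64 x 32-bit words, 256 bytes), and the worst case allows 1 + 1 + 8 memory-system + 32 shader-core blocks, i.e. 42 blocks or 10752 bytes. The small user-space snippet here only recomputes that arithmetic and is not part of the drop:

    #include <stdio.h>

    /* Values mirrored from mali_kbase_model_dummy.h purely as a sanity check. */
    #define VALUES_PER_BLOCK   (4 + 60)                          /* header + counter words */
    #define BLOCK_SIZE_BYTES   (VALUES_PER_BLOCK * sizeof(unsigned int))
    #define MAX_PERF_BLOCKS    (1 + 1 + 8 + 32)                  /* 1 + 1 + memsys + shader blocks */

    int main(void)
    {
        printf("worst-case dump: %zu bytes\n",
               (size_t)MAX_PERF_BLOCKS * BLOCK_SIZE_BYTES);      /* 42 * 256 = 10752 */
        return 0;
    }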
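Further down, union kbase_ioctl_cs_get_glb_iface replaces its out.padding word with out.instr_features, documented as holding the maximum event size in bits 7:4 and the offset update rate in bits 3:0. A caller could unpack the word as sketched here (the helper names are illustrative, not part of the interface):

    #include <linux/types.h>

    /* Bit layout taken from the instr_features documentation in the hunk below. */
    static inline __u32 glb_instr_max_event_size(__u32 instr_features)
    {
        return (instr_features >> 4) & 0xF;  /* bits 7:4 */
    }

    static inline __u32 glb_instr_offset_update_rate(__u32 instr_features)
    {
        return instr_features & 0xF;         /* bits 3:0 */
    }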
diff --git a/common/include/linux/priority_control_manager.h b/common/include/linux/priority_control_manager.h index d3e22f5..df3b3cd 100644 --- a/common/include/linux/priority_control_manager.h +++ b/common/include/linux/priority_control_manager.h @@ -53,7 +53,7 @@ struct priority_control_manager_ops { * Return: The priority that would actually be given, could be lower than requested_priority */ int (*pcm_scheduler_priority_check)( - struct priority_control_manager_device *mgm_dev, + struct priority_control_manager_device *pcm_dev, struct task_struct *task, int requested_priority); }; @@ -62,6 +62,7 @@ struct priority_control_manager_ops { * control manager * * @ops: Callbacks associated with this device + * @data: Pointer to device private data * @owner: Pointer to the module owner * * This structure should be registered with the platform device using @@ -69,6 +70,7 @@ struct priority_control_manager_ops { */ struct priority_control_manager_device { struct priority_control_manager_ops ops; + void *data; struct module *owner; }; diff --git a/common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h b/common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h new file mode 100644 index 0000000..61da071 --- /dev/null +++ b/common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + */ + +/* + * Dummy Model interface + */ + +#ifndef _UAPI_KBASE_MODEL_DUMMY_H_ +#define _UAPI_KBASE_MODEL_DUMMY_H_ + +#include <linux/types.h> + +#define KBASE_DUMMY_MODEL_COUNTER_HEADER_DWORDS (4) +#define KBASE_DUMMY_MODEL_COUNTER_PER_CORE (60) +#define KBASE_DUMMY_MODEL_COUNTER_PER_CORE_TYPE \ + (64*KBASE_DUMMY_MODEL_COUNTER_PER_CORE) +#define KBASE_DUMMY_MODEL_COUNTERS_PER_BIT (4) +#define KBASE_DUMMY_MODEL_COUNTER_ENABLED(enable_mask, ctr_idx) \ + (enable_mask & (1 << (ctr_idx / KBASE_DUMMY_MODEL_COUNTERS_PER_BIT))) + +#define KBASE_DUMMY_MODEL_HEADERS_PER_BLOCK 4 +#define KBASE_DUMMY_MODEL_COUNTERS_PER_BLOCK 60 +#define KBASE_DUMMY_MODEL_VALUES_PER_BLOCK \ + (KBASE_DUMMY_MODEL_COUNTERS_PER_BLOCK + \ + KBASE_DUMMY_MODEL_HEADERS_PER_BLOCK) +#define KBASE_DUMMY_MODEL_BLOCK_SIZE \ + (KBASE_DUMMY_MODEL_VALUES_PER_BLOCK * sizeof(__u32)) +#define KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS 8 +#define KBASE_DUMMY_MODEL_MAX_SHADER_CORES 32 +#define KBASE_DUMMY_MODEL_MAX_NUM_PERF_BLOCKS \ + (1 + 1 + KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS + KBASE_DUMMY_MODEL_MAX_SHADER_CORES) +#define KBASE_DUMMY_MODEL_COUNTER_TOTAL \ + (KBASE_DUMMY_MODEL_MAX_NUM_PERF_BLOCKS * KBASE_DUMMY_MODEL_COUNTER_PER_CORE_TYPE) + +#endif /* _UAPI_KBASE_MODEL_DUMMY_H_ */ diff --git a/mali_kbase/csf/mali_base_csf_kernel.h b/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h index 9a13760..7fa874b 100644 --- a/mali_kbase/csf/mali_base_csf_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h @@ -19,8 +19,10 @@ * */ -#ifndef _BASE_CSF_KERNEL_H_ -#define _BASE_CSF_KERNEL_H_ +#ifndef _UAPI_BASE_CSF_KERNEL_H_ +#define _UAPI_BASE_CSF_KERNEL_H_ + +#include <linux/types.h> /* Memory allocation, access/hint flags. * @@ -203,7 +205,7 @@ /** * Valid set of just-in-time memory allocation flags */ -#define BASE_JIT_ALLOC_VALID_FLAGS ((u8)0) +#define BASE_JIT_ALLOC_VALID_FLAGS ((__u8)0) /* Flags to pass to ::base_context_init. * Flags can be ORed together to enable multiple things. @@ -211,7 +213,7 @@ * These share the same space as BASEP_CONTEXT_FLAG_*, and so must * not collide with them. */ -typedef u32 base_context_create_flags; +typedef __u32 base_context_create_flags; /* No flags set */ #define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) @@ -228,11 +230,10 @@ typedef u32 base_context_create_flags; #define BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED \ ((base_context_create_flags)1 << 1) -/* Create CSF event thread. +/* Base context creates a CSF event notification thread. * - * The creation of a CSF event thread is conditional and only allowed in - * unit tests for the moment, in order to avoid clashes with the existing - * Base unit tests. + * The creation of a CSF event notification thread is conditional but + * mandatory for the handling of CSF events. */ #define BASE_CONTEXT_CSF_EVENT_THREAD ((base_context_create_flags)1 << 2) @@ -289,23 +290,26 @@ typedef u32 base_context_create_flags; #define BASE_QUEUE_MAX_PRIORITY (15U) -/* CQS Sync object is an array of u32 event_mem[2], error field index is 1 */ +/* CQS Sync object is an array of __u32 event_mem[2], error field index is 1 */ #define BASEP_EVENT_VAL_INDEX (0U) #define BASEP_EVENT_ERR_INDEX (1U) /* The upper limit for number of objects that could be waited/set per command. * This limit is now enforced as internally the error inherit inputs are - * converted to 32-bit flags in a u32 variable occupying a previously padding + * converted to 32-bit flags in a __u32 variable occupying a previously padding * field. 
*/ #define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) +#if MALI_UNIT_TEST /** * enum base_kcpu_command_type - Kernel CPU queue command type. * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, @@ -320,6 +324,8 @@ enum base_kcpu_command_type { BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, BASE_KCPU_COMMAND_TYPE_CQS_WAIT, BASE_KCPU_COMMAND_TYPE_CQS_SET, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, @@ -327,10 +333,41 @@ enum base_kcpu_command_type { BASE_KCPU_COMMAND_TYPE_JIT_FREE, BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER, -#if MALI_UNIT_TEST BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME, -#endif /* MALI_UNIT_TEST */ }; +#else +/** + * enum base_kcpu_command_type - Kernel CPU queue command type. + * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, + * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, + * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, + * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, + * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, + * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, + * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, + * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, + * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, + * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, + */ +enum base_kcpu_command_type { + BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, + BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT, + BASE_KCPU_COMMAND_TYPE_CQS_SET, + BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, + BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, + BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, + BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, + BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, + BASE_KCPU_COMMAND_TYPE_JIT_FREE, + BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER, +}; +#endif /* MALI_UNIT_TEST */ /** * enum base_queue_group_priority - Priority of a GPU Command Queue Group. 
@@ -363,29 +400,118 @@ enum base_queue_group_priority { }; struct base_kcpu_command_fence_info { - u64 fence; + __u64 fence; }; struct base_cqs_wait_info { - u64 addr; - u32 val; - u32 padding; + __u64 addr; + __u32 val; + __u32 padding; }; struct base_kcpu_command_cqs_wait_info { - u64 objs; - u32 nr_objs; - u32 inherit_err_flags; + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; }; struct base_cqs_set { - u64 addr; + __u64 addr; }; struct base_kcpu_command_cqs_set_info { - u64 objs; - u32 nr_objs; - u32 propagate_flags; + __u64 objs; + __u32 nr_objs; + __u32 padding; +}; + +/** + * basep_cqs_data_type - Enumeration of CQS Data Types + * + * @BASEP_CQS_DATA_TYPE_U32: The Data Type of a CQS Object's value + * is an unsigned 32-bit integer + * @BASEP_CQS_DATA_TYPE_U64: The Data Type of a CQS Object's value + * is an unsigned 64-bit integer + */ +typedef enum PACKED { + BASEP_CQS_DATA_TYPE_U32 = 0, + BASEP_CQS_DATA_TYPE_U64 = 1, +} basep_cqs_data_type; + +/** + * basep_cqs_wait_operation_op - Enumeration of CQS Object Wait + * Operation conditions + * + * @BASEP_CQS_WAIT_OPERATION_LE: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Less than or Equal to + * the Wait Operation value + * @BASEP_CQS_WAIT_OPERATION_GT: CQS Wait Operation indicating that a + * wait will be satisfied when a CQS Object's + * value is Greater than the Wait Operation value + */ +typedef enum { + BASEP_CQS_WAIT_OPERATION_LE = 0, + BASEP_CQS_WAIT_OPERATION_GT = 1, +} basep_cqs_wait_operation_op; + +struct base_cqs_wait_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_wait_operation_info - structure which contains information + * about the Timeline CQS wait objects + * + * @objs: An array of Timeline CQS waits. + * @nr_objs: Number of Timeline CQS waits in the array. + * @inherit_err_flags: Bit-pattern for the CQSs in the array who's error field + * to be served as the source for importing into the + * queue's error-state. + */ +struct base_kcpu_command_cqs_wait_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 inherit_err_flags; +}; + +/** + * basep_cqs_set_operation_op - Enumeration of CQS Set Operations + * + * @BASEP_CQS_SET_OPERATION_ADD: CQS Set operation for adding a value + * to a synchronization object + * @BASEP_CQS_SET_OPERATION_SET: CQS Set operation for setting the value + * of a synchronization object + */ +typedef enum { + BASEP_CQS_SET_OPERATION_ADD = 0, + BASEP_CQS_SET_OPERATION_SET = 1, +} basep_cqs_set_operation_op; + +struct base_cqs_set_operation_info { + __u64 addr; + __u64 val; + __u8 operation; + __u8 data_type; + __u8 padding[6]; +}; + +/** + * struct base_kcpu_command_cqs_set_operation_info - structure which contains information + * about the Timeline CQS set objects + * + * @objs: An array of Timeline CQS sets. + * @nr_objs: Number of Timeline CQS sets in the array. + * @padding: Structure padding, unused bytes. + */ +struct base_kcpu_command_cqs_set_operation_info { + __u64 objs; + __u32 nr_objs; + __u32 padding; }; /** @@ -395,7 +521,7 @@ struct base_kcpu_command_cqs_set_info { * @handle: Address of imported user buffer. */ struct base_kcpu_command_import_info { - u64 handle; + __u64 handle; }; /** @@ -408,9 +534,9 @@ struct base_kcpu_command_import_info { * @padding: Padding to a multiple of 64 bits. 
*/ struct base_kcpu_command_jit_alloc_info { - u64 info; - u8 count; - u8 padding[7]; + __u64 info; + __u8 count; + __u8 padding[7]; }; /** @@ -422,9 +548,9 @@ struct base_kcpu_command_jit_alloc_info { * @padding: Padding to a multiple of 64 bits. */ struct base_kcpu_command_jit_free_info { - u64 ids; - u8 count; - u8 padding[7]; + __u64 ids; + __u8 count; + __u8 padding[7]; }; /** @@ -437,15 +563,15 @@ struct base_kcpu_command_jit_free_info { * @padding: padding to a multiple of 64 bits. */ struct base_kcpu_command_group_suspend_info { - u64 buffer; - u32 size; - u8 group_handle; - u8 padding[3]; + __u64 buffer; + __u32 size; + __u8 group_handle; + __u8 padding[3]; }; #if MALI_UNIT_TEST struct base_kcpu_command_sample_time_info { - u64 time; + __u64 time; }; #endif /* MALI_UNIT_TEST */ @@ -466,12 +592,14 @@ struct base_kcpu_command_sample_time_info { * @info.padding: padding */ struct base_kcpu_command { - u8 type; - u8 padding[sizeof(u64) - sizeof(u8)]; + __u8 type; + __u8 padding[sizeof(__u64) - sizeof(__u8)]; union { struct base_kcpu_command_fence_info fence; struct base_kcpu_command_cqs_wait_info cqs_wait; struct base_kcpu_command_cqs_set_info cqs_set; + struct base_kcpu_command_cqs_wait_operation_info cqs_wait_operation; + struct base_kcpu_command_cqs_set_operation_info cqs_set_operation; struct base_kcpu_command_import_info import; struct base_kcpu_command_jit_alloc_info jit_alloc; struct base_kcpu_command_jit_free_info jit_free; @@ -479,7 +607,7 @@ struct base_kcpu_command { #if MALI_UNIT_TEST struct base_kcpu_command_sample_time_info sample_time; #endif /* MALI_UNIT_TEST */ - u64 padding[2]; /* No sub-struct should be larger */ + __u64 padding[2]; /* No sub-struct should be larger */ } info; }; @@ -490,8 +618,8 @@ struct base_kcpu_command { * @padding: Padding to a multiple of 64 bits. */ struct basep_cs_stream_control { - u32 features; - u32 padding; + __u32 features; + __u32 padding; }; /** @@ -503,10 +631,10 @@ struct basep_cs_stream_control { * @padding: Padding to a multiple of 64 bits. 
*/ struct basep_cs_group_control { - u32 features; - u32 stream_num; - u32 suspend_size; - u32 padding; + __u32 features; + __u32 stream_num; + __u32 suspend_size; + __u32 padding; }; /** @@ -521,9 +649,9 @@ struct basep_cs_group_control { * @padding: Padding to make multiple of 64bits */ struct base_gpu_queue_group_error_fatal_payload { - u64 sideband; - u32 status; - u32 padding; + __u64 sideband; + __u32 status; + __u32 padding; }; /** @@ -539,10 +667,10 @@ struct base_gpu_queue_group_error_fatal_payload { * @padding: Padding to make multiple of 64bits */ struct base_gpu_queue_error_fatal_payload { - u64 sideband; - u32 status; - u8 csi_index; - u8 padding[3]; + __u64 sideband; + __u32 status; + __u8 csi_index; + __u8 padding[3]; }; /** @@ -579,8 +707,8 @@ enum base_gpu_queue_group_error_type { * @payload.fatal_queue: Unrecoverable fault error associated with command queue */ struct base_gpu_queue_group_error { - u8 error_type; - u8 padding[7]; + __u8 error_type; + __u8 padding[7]; union { struct base_gpu_queue_group_error_fatal_payload fatal_group; struct base_gpu_queue_error_fatal_payload fatal_queue; @@ -621,17 +749,17 @@ enum base_csf_notification_type { * */ struct base_csf_notification { - u8 type; - u8 padding[7]; + __u8 type; + __u8 padding[7]; union { struct { - u8 handle; - u8 padding[7]; + __u8 handle; + __u8 padding[7]; struct base_gpu_queue_group_error error; } csg_error; - u8 align[56]; + __u8 align[56]; } payload; }; -#endif /* _BASE_CSF_KERNEL_H_ */ +#endif /* _UAPI_BASE_CSF_KERNEL_H_ */ diff --git a/mali_kbase/csf/mali_gpu_csf_control_registers.h b/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_control_registers.h index 8c4fc82..570cba8 100644 --- a/mali_kbase/csf/mali_gpu_csf_control_registers.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_control_registers.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -23,10 +23,10 @@ * This header was autogenerated, it should not be edited. */ -#ifndef _GPU_CSF_CONTROL_REGISTERS_H_ -#define _GPU_CSF_CONTROL_REGISTERS_H_ +#ifndef _UAPI_GPU_CSF_CONTROL_REGISTERS_H_ +#define _UAPI_GPU_CSF_CONTROL_REGISTERS_H_ /* GPU_REGISTERS register offsets */ #define GPU_CONTROL_MCU 0x3000 /* () MCU control registers */ -#endif /* _GPU_CSF_CONTROL_REGISTERS_H_ */ +#endif /* _UAPI_GPU_CSF_CONTROL_REGISTERS_H_ */ diff --git a/mali_kbase/csf/mali_gpu_csf_registers.h b/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h index d37b9cc..f233a0d 100644 --- a/mali_kbase/csf/mali_gpu_csf_registers.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h @@ -23,8 +23,8 @@ * This header was autogenerated, it should not be edited. 
*/ -#ifndef _GPU_CSF_REGISTERS_H_ -#define _GPU_CSF_REGISTERS_H_ +#ifndef _UAPI_GPU_CSF_REGISTERS_H_ +#define _UAPI_GPU_CSF_REGISTERS_H_ /* * Begin register sets @@ -155,6 +155,7 @@ #define CSG_PROTM_SUSPEND_BUF_LO 0x0048 /* () Protected mode suspend buffer, low word */ #define CSG_PROTM_SUSPEND_BUF_HI 0x004C /* () Protected mode suspend buffer, high word */ #define CSG_CONFIG 0x0050 /* () CSG configuration options */ +#define CSG_ITER_TRACE_CONFIG 0x0054 /* () CSG trace configuration */ /* CSG_OUTPUT_BLOCK register offsets */ #define CSG_ACK 0x0000 /* () CSG acknowledge flags */ @@ -172,6 +173,7 @@ #define GLB_GROUP_NUM 0x0010 /* () Number of CSG interfaces */ #define GLB_GROUP_STRIDE 0x0014 /* () Stride between CSG interfaces */ #define GLB_PRFCNT_SIZE 0x0018 /* () Size of CSF performance counters */ +#define GLB_INSTR_FEATURES 0x001C /* () TRACE_POINT instrumentation features */ #define GROUP_CONTROL_0 0x1000 /* () CSG control and capabilities */ #define GROUP_CONTROL(n) (GROUP_CONTROL_0 + (n)*256) #define GROUP_CONTROL_REG(n, r) (GROUP_CONTROL(n) + GROUP_CONTROL_BLOCK_REG(r)) @@ -1132,16 +1134,21 @@ #define GLB_REQ_PRFCNT_THRESHOLD_SHIFT 24 #define GLB_REQ_PRFCNT_THRESHOLD_MASK (0x1 << GLB_REQ_PRFCNT_THRESHOLD_SHIFT) #define GLB_REQ_PRFCNT_THRESHOLD_GET(reg_val) \ - (((reg_val)&GLB_REQ_PRFCNT_THRESHOLD_MASK) >> GLB_REQ_PRFCNT_THRESHOLD_SHIFT) + (((reg_val)&GLB_REQ_PRFCNT_THRESHOLD_MASK) >> \ + GLB_REQ_PRFCNT_THRESHOLD_SHIFT) #define GLB_REQ_PRFCNT_THRESHOLD_SET(reg_val, value) \ - (((reg_val) & ~GLB_REQ_PRFCNT_THRESHOLD_MASK) | \ - (((value) << GLB_REQ_PRFCNT_THRESHOLD_SHIFT) & GLB_REQ_PRFCNT_THRESHOLD_MASK)) + (((reg_val) & ~GLB_REQ_PRFCNT_THRESHOLD_MASK) | \ + (((value) << GLB_REQ_PRFCNT_THRESHOLD_SHIFT) & \ + GLB_REQ_PRFCNT_THRESHOLD_MASK)) #define GLB_REQ_PRFCNT_OVERFLOW_SHIFT 25 #define GLB_REQ_PRFCNT_OVERFLOW_MASK (0x1 << GLB_REQ_PRFCNT_OVERFLOW_SHIFT) -#define GLB_REQ_PRFCNT_OVERFLOW_GET(reg_val) (((reg_val)&GLB_REQ_PRFCNT_OVERFLOW_MASK) >> GLB_REQ_PRFCNT_OVERFLOW_SHIFT) +#define GLB_REQ_PRFCNT_OVERFLOW_GET(reg_val) \ + (((reg_val)&GLB_REQ_PRFCNT_OVERFLOW_MASK) >> \ + GLB_REQ_PRFCNT_OVERFLOW_SHIFT) #define GLB_REQ_PRFCNT_OVERFLOW_SET(reg_val, value) \ - (((reg_val) & ~GLB_REQ_PRFCNT_OVERFLOW_MASK) | \ - (((value) << GLB_REQ_PRFCNT_OVERFLOW_SHIFT) & GLB_REQ_PRFCNT_OVERFLOW_MASK)) + (((reg_val) & ~GLB_REQ_PRFCNT_OVERFLOW_MASK) | \ + (((value) << GLB_REQ_PRFCNT_OVERFLOW_SHIFT) & \ + GLB_REQ_PRFCNT_OVERFLOW_MASK)) #define GLB_REQ_DEBUG_CSF_REQ_SHIFT 30 #define GLB_REQ_DEBUG_CSF_REQ_MASK (0x1 << GLB_REQ_DEBUG_CSF_REQ_SHIFT) #define GLB_REQ_DEBUG_CSF_REQ_GET(reg_val) (((reg_val)&GLB_REQ_DEBUG_CSF_REQ_MASK) >> GLB_REQ_DEBUG_CSF_REQ_SHIFT) @@ -1256,19 +1263,25 @@ (((reg_val) & ~GLB_ACK_IRQ_MASK_PROTM_EXIT_MASK) | \ (((value) << GLB_ACK_IRQ_MASK_PROTM_EXIT_SHIFT) & GLB_ACK_IRQ_MASK_PROTM_EXIT_MASK)) #define GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_SHIFT 24 -#define GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK (0x1 << GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_SHIFT) +#define GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK \ + (0x1 << GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_SHIFT) #define GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_GET(reg_val) \ - (((reg_val)&GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK) >> GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_SHIFT) + (((reg_val)&GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK) >> \ + GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_SHIFT) #define GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_SET(reg_val, value) \ - (((reg_val) & ~GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK) | \ - (((value) << GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_SHIFT) & 
GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK)) + (((reg_val) & ~GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK) | \ + (((value) << GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_SHIFT) & \ + GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK)) #define GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_SHIFT 25 -#define GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK (0x1 << GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_SHIFT) +#define GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK \ + (0x1 << GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_SHIFT) #define GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_GET(reg_val) \ - (((reg_val)&GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK) >> GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_SHIFT) + (((reg_val)&GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK) >> \ + GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_SHIFT) #define GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_SET(reg_val, value) \ - (((reg_val) & ~GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK) | \ - (((value) << GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_SHIFT) & GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK)) + (((reg_val) & ~GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK) | \ + (((value) << GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_SHIFT) & \ + GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK)) #define GLB_ACK_IRQ_MASK_DEBUG_CSF_REQ_SHIFT 30 #define GLB_ACK_IRQ_MASK_DEBUG_CSF_REQ_MASK (0x1 << GLB_ACK_IRQ_MASK_DEBUG_CSF_REQ_SHIFT) #define GLB_ACK_IRQ_MASK_DEBUG_CSF_REQ_GET(reg_val) \ @@ -1398,4 +1411,4 @@ (((reg_val) & ~CSG_STATUS_STATE_IDLE_MASK) | \ (((value) << CSG_STATUS_STATE_IDLE_SHIFT) & CSG_STATUS_STATE_IDLE_MASK)) -#endif /* _GPU_CSF_REGISTERS_H_ */ +#endif /* _UAPI_GPU_CSF_REGISTERS_H_ */ diff --git a/mali_kbase/csf/mali_kbase_csf_ioctl.h b/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h index 8c63e1c..237cc2e 100644 --- a/mali_kbase/csf/mali_kbase_csf_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h @@ -19,8 +19,8 @@ * */ -#ifndef _KBASE_CSF_IOCTL_H_ -#define _KBASE_CSF_IOCTL_H_ +#ifndef _UAPI_KBASE_CSF_IOCTL_H_ +#define _UAPI_KBASE_CSF_IOCTL_H_ #include <asm-generic/ioctl.h> #include <linux/types.h> @@ -34,10 +34,16 @@ * 1.2: * - Add new CSF GPU_FEATURES register into the property structure * returned by KBASE_IOCTL_GET_GPUPROPS + * 1.3: + * - Add __u32 group_uid member to + * &struct_kbase_ioctl_cs_queue_group_create.out + * 1.4: + * - Replace padding in kbase_ioctl_cs_get_glb_iface with + * instr_features member of same size */ #define BASE_UK_VERSION_MAJOR 1 -#define BASE_UK_VERSION_MINOR 2 +#define BASE_UK_VERSION_MINOR 4 /** * struct kbase_ioctl_version_check - Check version compatibility between @@ -146,6 +152,7 @@ struct kbase_ioctl_cs_queue_terminate { * @out: Output parameters * @out.group_handle: Handle of a newly created queue group. * @out.padding: Currently unused, must be zero + * @out.group_uid: UID of the queue group available to base. */ union kbase_ioctl_cs_queue_group_create { struct { @@ -162,7 +169,8 @@ union kbase_ioctl_cs_queue_group_create { } in; struct { __u8 group_handle; - __u8 padding[7]; + __u8 padding[3]; + __u32 group_uid; } out; }; @@ -287,25 +295,25 @@ struct kbase_ioctl_cs_tiler_heap_term { * union kbase_ioctl_cs_get_glb_iface - Request the global control block * of CSF interface capabilities * - * @in: Input parameters - * @in.max_group_num: The maximum number of groups to be read. Can be 0, in - * which case groups_ptr is unused. - * @in.max_total_stream_num: The maximum number of CSs to be read. Can be 0, in - * which case streams_ptr is unused. - * @in.groups_ptr: Pointer where to store all the group data (sequentially). - * @in.streams_ptr: Pointer where to store all the CS data (sequentially). 
- * @out: Output parameters - * @out.glb_version: Global interface version. - * @out.features: Bit mask of features (e.g. whether certain types of job - * can be suspended). - * @out.group_num: Number of CSGs supported. - * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 - * hold the size of firmware performance counter data - * and 15:0 hold the size of hardware performance counter - * data. - * @out.total_stream_num: Total number of CSs, summed across all groups. - * @out.padding: Will be zeroed. - * + * @in: Input parameters + * @in.max_group_num: The maximum number of groups to be read. Can be 0, in + * which case groups_ptr is unused. + * @in.max_total_stream _num: The maximum number of CSs to be read. Can be 0, in + * which case streams_ptr is unused. + * @in.groups_ptr: Pointer where to store all the group data (sequentially). + * @in.streams_ptr: Pointer where to store all the CS data (sequentially). + * @out: Output parameters + * @out.glb_version: Global interface version. + * @out.features: Bit mask of features (e.g. whether certain types of job + * can be suspended). + * @out.group_num: Number of CSGs supported. + * @out.prfcnt_size: Size of CSF performance counters, in bytes. Bits 31:16 + * hold the size of firmware performance counter data + * and 15:0 hold the size of hardware performance counter + * data. + * @out.total_stream_num: Total number of CSs, summed across all groups. + * @out.instr_features: Instrumentation features. Bits 7:4 hold the maximum + * size of events. Bits 3:0 hold the offset update rate. * */ union kbase_ioctl_cs_get_glb_iface { @@ -321,7 +329,7 @@ union kbase_ioctl_cs_get_glb_iface { __u32 group_num; __u32 prfcnt_size; __u32 total_stream_num; - __u32 padding; + __u32 instr_features; } out; }; @@ -379,4 +387,4 @@ union kbase_ioctl_cs_event_memory_read { #endif /* MALI_UNIT_TEST */ -#endif /* _KBASE_CSF_IOCTL_H_ */ +#endif /* _UAPI_KBASE_CSF_IOCTL_H_ */ diff --git a/mali_kbase/gpu/backend/mali_kbase_gpu_regmap_csf.h b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h index 65a06d2..c87154f 100644 --- a/mali_kbase/gpu/backend/mali_kbase_gpu_regmap_csf.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h @@ -19,10 +19,12 @@ * */ -#ifndef _KBASE_GPU_REGMAP_CSF_H_ -#define _KBASE_GPU_REGMAP_CSF_H_ +#ifndef _UAPI_KBASE_GPU_REGMAP_CSF_H_ +#define _UAPI_KBASE_GPU_REGMAP_CSF_H_ -#if !MALI_USE_CSF +#include <linux/types.h> + +#if !MALI_USE_CSF && defined(__KERNEL__) #error "Cannot be compiled with JM" #endif @@ -61,8 +63,7 @@ #define VALUE_SHADER_REG_LO(n) (VALUE_SHADER_BASE + ((n) << 3)) /* (RO) Counter value #n, low word */ #define VALUE_SHADER_REG_HI(n) (VALUE_SHADER_BASE + ((n) << 3) + 4) /* (RO) Counter value #n, high word */ - -#include "csf/mali_gpu_csf_control_registers.h" +#include "../../csf/mali_gpu_csf_control_registers.h" /* Set to implementation defined, outer caching */ #define AS_MEMATTR_AARCH64_OUTER_IMPL_DEF 0x88ull @@ -188,7 +189,7 @@ /* GPU_COMMAND command + payload */ #define GPU_COMMAND_CODE_PAYLOAD(opcode, payload) \ - ((u32)opcode | ((u32)payload << 8)) + ((__u32)opcode | ((__u32)payload << 8)) /* Final GPU_COMMAND form */ /* No operation, nothing happens */ @@ -282,9 +283,9 @@ /* Implementation-dependent exception codes used to indicate CSG * and CS errors that are not specified in the specs. 
*/ -#define GPU_EXCEPTION_TYPE_SW_FAULT_0 ((u8)0x70) -#define GPU_EXCEPTION_TYPE_SW_FAULT_1 ((u8)0x71) -#define GPU_EXCEPTION_TYPE_SW_FAULT_2 ((u8)0x72) +#define GPU_EXCEPTION_TYPE_SW_FAULT_0 ((__u8)0x70) +#define GPU_EXCEPTION_TYPE_SW_FAULT_1 ((__u8)0x71) +#define GPU_EXCEPTION_TYPE_SW_FAULT_2 ((__u8)0x72) /* GPU_FAULTSTATUS_EXCEPTION_TYPE values */ #define GPU_FAULTSTATUS_EXCEPTION_TYPE_OK 0x00 @@ -331,4 +332,4 @@ /* GPU_CONTROL_MCU.GPU_IRQ_RAWSTAT */ #define PRFCNT_SAMPLE_COMPLETED (1 << 16) /* Set when performance count sample has completed */ -#endif /* _KBASE_GPU_REGMAP_CSF_H_ */ +#endif /* _UAPI_KBASE_GPU_REGMAP_CSF_H_ */ diff --git a/mali_kbase/gpu/backend/mali_kbase_gpu_regmap_jm.h b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h index 1669d5a..1982668 100644 --- a/mali_kbase/gpu/backend/mali_kbase_gpu_regmap_jm.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -19,10 +19,10 @@ * */ -#ifndef _KBASE_GPU_REGMAP_JM_H_ -#define _KBASE_GPU_REGMAP_JM_H_ +#ifndef _UAPI_KBASE_GPU_REGMAP_JM_H_ +#define _UAPI_KBASE_GPU_REGMAP_JM_H_ -#if MALI_USE_CSF +#if MALI_USE_CSF && defined(__KERNEL__) #error "Cannot be compiled with CSF" #endif @@ -284,4 +284,4 @@ #define GPU_IRQ_REG_COMMON (GPU_FAULT | MULTIPLE_GPU_FAULTS | RESET_COMPLETED \ | POWER_CHANGED_ALL | PRFCNT_SAMPLE_COMPLETED) -#endif /* _KBASE_GPU_REGMAP_JM_H_ */ +#endif /* _UAPI_KBASE_GPU_REGMAP_JM_H_ */ diff --git a/mali_kbase/gpu/mali_kbase_gpu_coherency.h b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_coherency.h index a075ed0..98186d2 100644 --- a/mali_kbase/gpu/mali_kbase_gpu_coherency.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_coherency.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2015-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2015-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -19,12 +19,12 @@ * */ -#ifndef _KBASE_GPU_COHERENCY_H_ -#define _KBASE_GPU_COHERENCY_H_ +#ifndef _UAPI_KBASE_GPU_COHERENCY_H_ +#define _UAPI_KBASE_GPU_COHERENCY_H_ #define COHERENCY_ACE_LITE 0 #define COHERENCY_ACE 1 #define COHERENCY_NONE 31 #define COHERENCY_FEATURE_BIT(x) (1 << (x)) -#endif /* _KBASE_GPU_COHERENCY_H_ */ +#endif /* _UAPI_KBASE_GPU_COHERENCY_H_ */ diff --git a/mali_kbase/gpu/mali_kbase_gpu_id.h b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h index 8d687c4..0145920 100644 --- a/mali_kbase/gpu/mali_kbase_gpu_id.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2015-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2015-2021 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -19,8 +19,10 @@ * */ -#ifndef _KBASE_GPU_ID_H_ -#define _KBASE_GPU_ID_H_ +#ifndef _UAPI_KBASE_GPU_ID_H_ +#define _UAPI_KBASE_GPU_ID_H_ + +#include <linux/types.h> /* GPU_ID register */ #define GPU_ID_VERSION_STATUS_SHIFT 0 @@ -55,18 +57,18 @@ * a product ignoring its version. */ #define GPU_ID2_PRODUCT_MAKE(arch_major, arch_minor, arch_rev, product_major) \ - ((((u32)arch_major) << GPU_ID2_ARCH_MAJOR_SHIFT) | \ - (((u32)arch_minor) << GPU_ID2_ARCH_MINOR_SHIFT) | \ - (((u32)arch_rev) << GPU_ID2_ARCH_REV_SHIFT) | \ - (((u32)product_major) << GPU_ID2_PRODUCT_MAJOR_SHIFT)) + ((((__u32)arch_major) << GPU_ID2_ARCH_MAJOR_SHIFT) | \ + (((__u32)arch_minor) << GPU_ID2_ARCH_MINOR_SHIFT) | \ + (((__u32)arch_rev) << GPU_ID2_ARCH_REV_SHIFT) | \ + (((__u32)product_major) << GPU_ID2_PRODUCT_MAJOR_SHIFT)) /* Helper macro to create a partial GPU_ID (new format) that specifies the * revision (major, minor, status) of a product */ #define GPU_ID2_VERSION_MAKE(version_major, version_minor, version_status) \ - ((((u32)version_major) << GPU_ID2_VERSION_MAJOR_SHIFT) | \ - (((u32)version_minor) << GPU_ID2_VERSION_MINOR_SHIFT) | \ - (((u32)version_status) << GPU_ID2_VERSION_STATUS_SHIFT)) + ((((__u32)version_major) << GPU_ID2_VERSION_MAJOR_SHIFT) | \ + (((__u32)version_minor) << GPU_ID2_VERSION_MINOR_SHIFT) | \ + (((__u32)version_status) << GPU_ID2_VERSION_STATUS_SHIFT)) /* Helper macro to create a complete GPU_ID (new format) */ #define GPU_ID2_MAKE(arch_major, arch_minor, arch_rev, product_major, \ @@ -80,15 +82,15 @@ * a particular GPU model by its arch_major and product_major. */ #define GPU_ID2_MODEL_MAKE(arch_major, product_major) \ - ((((u32)arch_major) << GPU_ID2_ARCH_MAJOR_SHIFT) | \ - (((u32)product_major) << GPU_ID2_PRODUCT_MAJOR_SHIFT)) + ((((__u32)arch_major) << GPU_ID2_ARCH_MAJOR_SHIFT) | \ + (((__u32)product_major) << GPU_ID2_PRODUCT_MAJOR_SHIFT)) /* Strip off the non-relevant bits from a product_id value and make it suitable * for comparison against the GPU_ID2_PRODUCT_xxx values which identify a GPU * model. */ #define GPU_ID2_MODEL_MATCH_VALUE(product_id) \ - ((((u32)product_id) << GPU_ID2_PRODUCT_MAJOR_SHIFT) & \ + ((((__u32)product_id) << GPU_ID2_PRODUCT_MAJOR_SHIFT) & \ GPU_ID2_PRODUCT_MODEL) #define GPU_ID2_PRODUCT_TMIX GPU_ID2_MODEL_MAKE(6, 0) @@ -110,9 +112,9 @@ * minor, status */ #define GPU_ID_MAKE(id, major, minor, status) \ - ((((u32)id) << GPU_ID_VERSION_PRODUCT_ID_SHIFT) | \ - (((u32)major) << GPU_ID_VERSION_MAJOR_SHIFT) | \ - (((u32)minor) << GPU_ID_VERSION_MINOR_SHIFT) | \ - (((u32)status) << GPU_ID_VERSION_STATUS_SHIFT)) + ((((__u32)id) << GPU_ID_VERSION_PRODUCT_ID_SHIFT) | \ + (((__u32)major) << GPU_ID_VERSION_MAJOR_SHIFT) | \ + (((__u32)minor) << GPU_ID_VERSION_MINOR_SHIFT) | \ + (((__u32)status) << GPU_ID_VERSION_STATUS_SHIFT)) -#endif /* _KBASE_GPU_ID_H_ */ +#endif /* _UAPI_KBASE_GPU_ID_H_ */ diff --git a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h new file mode 100644 index 0000000..9977212 --- /dev/null +++ b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h @@ -0,0 +1,424 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. 
+ * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_GPU_REGMAP_H_ +#define _UAPI_KBASE_GPU_REGMAP_H_ + +#include "mali_kbase_gpu_coherency.h" +#include "mali_kbase_gpu_id.h" +#if MALI_USE_CSF +#include "backend/mali_kbase_gpu_regmap_csf.h" +#else +#include "backend/mali_kbase_gpu_regmap_jm.h" +#endif + +/* Begin Register Offsets */ +/* GPU control registers */ + +#define GPU_CONTROL_BASE 0x0000 +#define GPU_CONTROL_REG(r) (GPU_CONTROL_BASE + (r)) +#define GPU_ID 0x000 /* (RO) GPU and revision identifier */ +#define L2_FEATURES 0x004 /* (RO) Level 2 cache features */ +#define TILER_FEATURES 0x00C /* (RO) Tiler Features */ +#define MEM_FEATURES 0x010 /* (RO) Memory system features */ +#define MMU_FEATURES 0x014 /* (RO) MMU features */ +#define AS_PRESENT 0x018 /* (RO) Address space slots present */ +#define GPU_IRQ_RAWSTAT 0x020 /* (RW) */ +#define GPU_IRQ_CLEAR 0x024 /* (WO) */ +#define GPU_IRQ_MASK 0x028 /* (RW) */ +#define GPU_IRQ_STATUS 0x02C /* (RO) */ + +#define GPU_COMMAND 0x030 /* (WO) */ +#define GPU_STATUS 0x034 /* (RO) */ + +#define GPU_DBGEN (1 << 8) /* DBGEN wire status */ + +#define GPU_FAULTSTATUS 0x03C /* (RO) GPU exception type and fault status */ +#define GPU_FAULTADDRESS_LO 0x040 /* (RO) GPU exception fault address, low word */ +#define GPU_FAULTADDRESS_HI 0x044 /* (RO) GPU exception fault address, high word */ + +#define L2_CONFIG 0x048 /* (RW) Level 2 cache configuration */ + +#define GROUPS_L2_COHERENT (1 << 0) /* Cores groups are l2 coherent */ +#define SUPER_L2_COHERENT (1 << 1) /* Shader cores within a core + * supergroup are l2 coherent + */ + +#define PWR_KEY 0x050 /* (WO) Power manager key register */ +#define PWR_OVERRIDE0 0x054 /* (RW) Power manager override settings */ +#define PWR_OVERRIDE1 0x058 /* (RW) Power manager override settings */ +#define GPU_FEATURES_LO 0x060 /* (RO) GPU features, low word */ +#define GPU_FEATURES_HI 0x064 /* (RO) GPU features, high word */ +#define CYCLE_COUNT_LO 0x090 /* (RO) Cycle counter, low word */ +#define CYCLE_COUNT_HI 0x094 /* (RO) Cycle counter, high word */ +#define TIMESTAMP_LO 0x098 /* (RO) Global time stamp counter, low word */ +#define TIMESTAMP_HI 0x09C /* (RO) Global time stamp counter, high word */ + +#define THREAD_MAX_THREADS 0x0A0 /* (RO) Maximum number of threads per core */ +#define THREAD_MAX_WORKGROUP_SIZE 0x0A4 /* (RO) Maximum workgroup size */ +#define THREAD_MAX_BARRIER_SIZE 0x0A8 /* (RO) Maximum threads waiting at a barrier */ +#define THREAD_FEATURES 0x0AC /* (RO) Thread features */ +#define THREAD_TLS_ALLOC 0x310 /* (RO) Number of threads per core that TLS must be allocated for */ + +#define TEXTURE_FEATURES_0 0x0B0 /* (RO) Support flags for indexed texture formats 0..31 */ +#define TEXTURE_FEATURES_1 0x0B4 /* (RO) Support flags for indexed texture formats 32..63 */ +#define TEXTURE_FEATURES_2 0x0B8 /* (RO) Support flags for indexed 
texture formats 64..95 */ +#define TEXTURE_FEATURES_3 0x0BC /* (RO) Support flags for texture order */ + +#define TEXTURE_FEATURES_REG(n) GPU_CONTROL_REG(TEXTURE_FEATURES_0 + ((n) << 2)) + +#define SHADER_PRESENT_LO 0x100 /* (RO) Shader core present bitmap, low word */ +#define SHADER_PRESENT_HI 0x104 /* (RO) Shader core present bitmap, high word */ + +#define TILER_PRESENT_LO 0x110 /* (RO) Tiler core present bitmap, low word */ +#define TILER_PRESENT_HI 0x114 /* (RO) Tiler core present bitmap, high word */ + +#define L2_PRESENT_LO 0x120 /* (RO) Level 2 cache present bitmap, low word */ +#define L2_PRESENT_HI 0x124 /* (RO) Level 2 cache present bitmap, high word */ + +#define STACK_PRESENT_LO 0xE00 /* (RO) Core stack present bitmap, low word */ +#define STACK_PRESENT_HI 0xE04 /* (RO) Core stack present bitmap, high word */ + +#define SHADER_READY_LO 0x140 /* (RO) Shader core ready bitmap, low word */ +#define SHADER_READY_HI 0x144 /* (RO) Shader core ready bitmap, high word */ + +#define TILER_READY_LO 0x150 /* (RO) Tiler core ready bitmap, low word */ +#define TILER_READY_HI 0x154 /* (RO) Tiler core ready bitmap, high word */ + +#define L2_READY_LO 0x160 /* (RO) Level 2 cache ready bitmap, low word */ +#define L2_READY_HI 0x164 /* (RO) Level 2 cache ready bitmap, high word */ + +#define STACK_READY_LO 0xE10 /* (RO) Core stack ready bitmap, low word */ +#define STACK_READY_HI 0xE14 /* (RO) Core stack ready bitmap, high word */ + +#define SHADER_PWRON_LO 0x180 /* (WO) Shader core power on bitmap, low word */ +#define SHADER_PWRON_HI 0x184 /* (WO) Shader core power on bitmap, high word */ + +#define TILER_PWRON_LO 0x190 /* (WO) Tiler core power on bitmap, low word */ +#define TILER_PWRON_HI 0x194 /* (WO) Tiler core power on bitmap, high word */ + +#define L2_PWRON_LO 0x1A0 /* (WO) Level 2 cache power on bitmap, low word */ +#define L2_PWRON_HI 0x1A4 /* (WO) Level 2 cache power on bitmap, high word */ + +#define STACK_PWRON_LO 0xE20 /* (RO) Core stack power on bitmap, low word */ +#define STACK_PWRON_HI 0xE24 /* (RO) Core stack power on bitmap, high word */ + +#define SHADER_PWROFF_LO 0x1C0 /* (WO) Shader core power off bitmap, low word */ +#define SHADER_PWROFF_HI 0x1C4 /* (WO) Shader core power off bitmap, high word */ + +#define TILER_PWROFF_LO 0x1D0 /* (WO) Tiler core power off bitmap, low word */ +#define TILER_PWROFF_HI 0x1D4 /* (WO) Tiler core power off bitmap, high word */ + +#define L2_PWROFF_LO 0x1E0 /* (WO) Level 2 cache power off bitmap, low word */ +#define L2_PWROFF_HI 0x1E4 /* (WO) Level 2 cache power off bitmap, high word */ + +#define STACK_PWROFF_LO 0xE30 /* (RO) Core stack power off bitmap, low word */ +#define STACK_PWROFF_HI 0xE34 /* (RO) Core stack power off bitmap, high word */ + +#define SHADER_PWRTRANS_LO 0x200 /* (RO) Shader core power transition bitmap, low word */ +#define SHADER_PWRTRANS_HI 0x204 /* (RO) Shader core power transition bitmap, high word */ + +#define TILER_PWRTRANS_LO 0x210 /* (RO) Tiler core power transition bitmap, low word */ +#define TILER_PWRTRANS_HI 0x214 /* (RO) Tiler core power transition bitmap, high word */ + +#define L2_PWRTRANS_LO 0x220 /* (RO) Level 2 cache power transition bitmap, low word */ +#define L2_PWRTRANS_HI 0x224 /* (RO) Level 2 cache power transition bitmap, high word */ + +#define ASN_HASH_0 0x02C0 /* (RW) ASN hash function argument 0 */ +#define ASN_HASH(n) (ASN_HASH_0 + (n)*4) +#define ASN_HASH_COUNT 3 + +#define STACK_PWRTRANS_LO 0xE40 /* (RO) Core stack power transition bitmap, low word */ +#define STACK_PWRTRANS_HI 0xE44 
/* (RO) Core stack power transition bitmap, high word */ + +#define SHADER_PWRACTIVE_LO 0x240 /* (RO) Shader core active bitmap, low word */ +#define SHADER_PWRACTIVE_HI 0x244 /* (RO) Shader core active bitmap, high word */ + +#define TILER_PWRACTIVE_LO 0x250 /* (RO) Tiler core active bitmap, low word */ +#define TILER_PWRACTIVE_HI 0x254 /* (RO) Tiler core active bitmap, high word */ + +#define L2_PWRACTIVE_LO 0x260 /* (RO) Level 2 cache active bitmap, low word */ +#define L2_PWRACTIVE_HI 0x264 /* (RO) Level 2 cache active bitmap, high word */ + +#define COHERENCY_FEATURES 0x300 /* (RO) Coherency features present */ +#define COHERENCY_ENABLE 0x304 /* (RW) Coherency enable */ + +#define SHADER_CONFIG 0xF04 /* (RW) Shader core configuration (implementation-specific) */ +#define TILER_CONFIG 0xF08 /* (RW) Tiler core configuration (implementation-specific) */ +#define L2_MMU_CONFIG 0xF0C /* (RW) L2 cache and MMU configuration (implementation-specific) */ + +/* Job control registers */ + +#define JOB_CONTROL_BASE 0x1000 + +#define JOB_CONTROL_REG(r) (JOB_CONTROL_BASE + (r)) + +#define JOB_IRQ_RAWSTAT 0x000 /* Raw interrupt status register */ +#define JOB_IRQ_CLEAR 0x004 /* Interrupt clear register */ +#define JOB_IRQ_MASK 0x008 /* Interrupt mask register */ +#define JOB_IRQ_STATUS 0x00C /* Interrupt status register */ + +/* MMU control registers */ + +#define MEMORY_MANAGEMENT_BASE 0x2000 +#define MMU_REG(r) (MEMORY_MANAGEMENT_BASE + (r)) + +#define MMU_IRQ_RAWSTAT 0x000 /* (RW) Raw interrupt status register */ +#define MMU_IRQ_CLEAR 0x004 /* (WO) Interrupt clear register */ +#define MMU_IRQ_MASK 0x008 /* (RW) Interrupt mask register */ +#define MMU_IRQ_STATUS 0x00C /* (RO) Interrupt status register */ + +#define MMU_AS0 0x400 /* Configuration registers for address space 0 */ +#define MMU_AS1 0x440 /* Configuration registers for address space 1 */ +#define MMU_AS2 0x480 /* Configuration registers for address space 2 */ +#define MMU_AS3 0x4C0 /* Configuration registers for address space 3 */ +#define MMU_AS4 0x500 /* Configuration registers for address space 4 */ +#define MMU_AS5 0x540 /* Configuration registers for address space 5 */ +#define MMU_AS6 0x580 /* Configuration registers for address space 6 */ +#define MMU_AS7 0x5C0 /* Configuration registers for address space 7 */ +#define MMU_AS8 0x600 /* Configuration registers for address space 8 */ +#define MMU_AS9 0x640 /* Configuration registers for address space 9 */ +#define MMU_AS10 0x680 /* Configuration registers for address space 10 */ +#define MMU_AS11 0x6C0 /* Configuration registers for address space 11 */ +#define MMU_AS12 0x700 /* Configuration registers for address space 12 */ +#define MMU_AS13 0x740 /* Configuration registers for address space 13 */ +#define MMU_AS14 0x780 /* Configuration registers for address space 14 */ +#define MMU_AS15 0x7C0 /* Configuration registers for address space 15 */ + +/* MMU address space control registers */ + +#define MMU_AS_REG(n, r) (MMU_REG(MMU_AS0 + ((n) << 6)) + (r)) + +#define AS_TRANSTAB_LO 0x00 /* (RW) Translation Table Base Address for address space n, low word */ +#define AS_TRANSTAB_HI 0x04 /* (RW) Translation Table Base Address for address space n, high word */ +#define AS_MEMATTR_LO 0x08 /* (RW) Memory attributes for address space n, low word. */ +#define AS_MEMATTR_HI 0x0C /* (RW) Memory attributes for address space n, high word. 
*/ +#define AS_LOCKADDR_LO 0x10 /* (RW) Lock region address for address space n, low word */ +#define AS_LOCKADDR_HI 0x14 /* (RW) Lock region address for address space n, high word */ +#define AS_COMMAND 0x18 /* (WO) MMU command register for address space n */ +#define AS_FAULTSTATUS 0x1C /* (RO) MMU fault status register for address space n */ +#define AS_FAULTADDRESS_LO 0x20 /* (RO) Fault Address for address space n, low word */ +#define AS_FAULTADDRESS_HI 0x24 /* (RO) Fault Address for address space n, high word */ +#define AS_STATUS 0x28 /* (RO) Status flags for address space n */ + +/* (RW) Translation table configuration for address space n, low word */ +#define AS_TRANSCFG_LO 0x30 +/* (RW) Translation table configuration for address space n, high word */ +#define AS_TRANSCFG_HI 0x34 +/* (RO) Secondary fault address for address space n, low word */ +#define AS_FAULTEXTRA_LO 0x38 +/* (RO) Secondary fault address for address space n, high word */ +#define AS_FAULTEXTRA_HI 0x3C + +/* End Register Offsets */ + +#define GPU_IRQ_REG_ALL (GPU_IRQ_REG_COMMON) + +/* + * MMU_IRQ_RAWSTAT register values. Values are valid also for + * MMU_IRQ_CLEAR, MMU_IRQ_MASK, MMU_IRQ_STATUS registers. + */ + +#define MMU_PAGE_FAULT_FLAGS 16 + +/* Macros returning a bitmask to retrieve page fault or bus error flags from + * MMU registers + */ +#define MMU_PAGE_FAULT(n) (1UL << (n)) +#define MMU_BUS_ERROR(n) (1UL << ((n) + MMU_PAGE_FAULT_FLAGS)) + +/* + * Begin AARCH64 MMU TRANSTAB register values + */ +#define MMU_HW_OUTA_BITS 40 +#define AS_TRANSTAB_BASE_MASK ((1ULL << MMU_HW_OUTA_BITS) - (1ULL << 4)) + +/* + * Begin MMU STATUS register values + */ +#define AS_STATUS_AS_ACTIVE 0x01 + +#define AS_FAULTSTATUS_EXCEPTION_CODE_MASK (0x7<<3) +#define AS_FAULTSTATUS_EXCEPTION_CODE_TRANSLATION_FAULT (0x0<<3) +#define AS_FAULTSTATUS_EXCEPTION_CODE_PERMISSION_FAULT (0x1<<3) +#define AS_FAULTSTATUS_EXCEPTION_CODE_TRANSTAB_BUS_FAULT (0x2<<3) +#define AS_FAULTSTATUS_EXCEPTION_CODE_ACCESS_FLAG (0x3<<3) +#define AS_FAULTSTATUS_EXCEPTION_CODE_ADDRESS_SIZE_FAULT (0x4<<3) +#define AS_FAULTSTATUS_EXCEPTION_CODE_MEMORY_ATTRIBUTES_FAULT (0x5<<3) + +#define AS_FAULTSTATUS_EXCEPTION_TYPE_SHIFT 0 +#define AS_FAULTSTATUS_EXCEPTION_TYPE_MASK (0xFF << AS_FAULTSTATUS_EXCEPTION_TYPE_SHIFT) +#define AS_FAULTSTATUS_EXCEPTION_TYPE_GET(reg_val) \ + (((reg_val)&AS_FAULTSTATUS_EXCEPTION_TYPE_MASK) >> AS_FAULTSTATUS_EXCEPTION_TYPE_SHIFT) +#define AS_FAULTSTATUS_EXCEPTION_TYPE_TRANSLATION_FAULT_0 0xC0 + +#define AS_FAULTSTATUS_ACCESS_TYPE_SHIFT 8 +#define AS_FAULTSTATUS_ACCESS_TYPE_MASK (0x3 << AS_FAULTSTATUS_ACCESS_TYPE_SHIFT) +#define AS_FAULTSTATUS_ACCESS_TYPE_GET(reg_val) \ + (((reg_val)&AS_FAULTSTATUS_ACCESS_TYPE_MASK) >> AS_FAULTSTATUS_ACCESS_TYPE_SHIFT) + +#define AS_FAULTSTATUS_ACCESS_TYPE_ATOMIC (0x0) +#define AS_FAULTSTATUS_ACCESS_TYPE_EX (0x1) +#define AS_FAULTSTATUS_ACCESS_TYPE_READ (0x2) +#define AS_FAULTSTATUS_ACCESS_TYPE_WRITE (0x3) + +#define AS_FAULTSTATUS_SOURCE_ID_SHIFT 16 +#define AS_FAULTSTATUS_SOURCE_ID_MASK (0xFFFF << AS_FAULTSTATUS_SOURCE_ID_SHIFT) +#define AS_FAULTSTATUS_SOURCE_ID_GET(reg_val) \ + (((reg_val)&AS_FAULTSTATUS_SOURCE_ID_MASK) >> AS_FAULTSTATUS_SOURCE_ID_SHIFT) + +/* + * Begin MMU TRANSCFG register values + */ +#define AS_TRANSCFG_ADRMODE_LEGACY 0 +#define AS_TRANSCFG_ADRMODE_UNMAPPED 1 +#define AS_TRANSCFG_ADRMODE_IDENTITY 2 +#define AS_TRANSCFG_ADRMODE_AARCH64_4K 6 +#define AS_TRANSCFG_ADRMODE_AARCH64_64K 8 + +#define AS_TRANSCFG_ADRMODE_MASK 0xF + +/* + * Begin TRANSCFG register values + */ +#define 
AS_TRANSCFG_PTW_MEMATTR_MASK (3ull << 24) +#define AS_TRANSCFG_PTW_MEMATTR_NON_CACHEABLE (1ull << 24) +#define AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK (2ull << 24) + +#define AS_TRANSCFG_PTW_SH_MASK ((3ull << 28)) +#define AS_TRANSCFG_PTW_SH_OS (2ull << 28) +#define AS_TRANSCFG_PTW_SH_IS (3ull << 28) +#define AS_TRANSCFG_R_ALLOCATE (1ull << 30) + +/* + * Begin Command Values + */ + +/* AS_COMMAND register commands */ +#define AS_COMMAND_NOP 0x00 /* NOP Operation */ +#define AS_COMMAND_UPDATE 0x01 /* Broadcasts the values in AS_TRANSTAB and ASn_MEMATTR to all MMUs */ +#define AS_COMMAND_LOCK 0x02 /* Issue a lock region command to all MMUs */ +#define AS_COMMAND_UNLOCK 0x03 /* Issue a flush region command to all MMUs */ +/* Flush all L2 caches then issue a flush region command to all MMUs + * (deprecated - only for use with T60x) + */ +#define AS_COMMAND_FLUSH 0x04 +/* Flush all L2 caches then issue a flush region command to all MMUs */ +#define AS_COMMAND_FLUSH_PT 0x04 +/* Wait for memory accesses to complete, flush all the L1s cache then flush all + * L2 caches then issue a flush region command to all MMUs + */ +#define AS_COMMAND_FLUSH_MEM 0x05 + +/* GPU_STATUS values */ +#define GPU_STATUS_PRFCNT_ACTIVE (1 << 2) /* Set if the performance counters are active. */ +#define GPU_STATUS_CYCLE_COUNT_ACTIVE (1 << 6) /* Set if the cycle counter is active. */ +#define GPU_STATUS_PROTECTED_MODE_ACTIVE (1 << 7) /* Set if protected mode is active */ + +/* PRFCNT_CONFIG register values */ +#define PRFCNT_CONFIG_MODE_SHIFT 0 /* Counter mode position. */ +#define PRFCNT_CONFIG_AS_SHIFT 4 /* Address space bitmap position. */ +#define PRFCNT_CONFIG_SETSELECT_SHIFT 8 /* Set select position. */ + +/* The performance counters are disabled. */ +#define PRFCNT_CONFIG_MODE_OFF 0 +/* The performance counters are enabled, but are only written out when a + * PRFCNT_SAMPLE command is issued using the GPU_COMMAND register. + */ +#define PRFCNT_CONFIG_MODE_MANUAL 1 +/* The performance counters are enabled, and are written out each time a tile + * finishes rendering. + */ +#define PRFCNT_CONFIG_MODE_TILE 2 + +/* AS<n>_MEMATTR values from MMU_MEMATTR_STAGE1: */ +/* Use GPU implementation-defined caching policy. */ +#define AS_MEMATTR_IMPL_DEF_CACHE_POLICY 0x88ull +/* The attribute set to force all resources to be cached. */ +#define AS_MEMATTR_FORCE_TO_CACHE_ALL 0x8Full +/* Inner write-alloc cache setup, no outer caching */ +#define AS_MEMATTR_WRITE_ALLOC 0x8Dull + +/* Use GPU implementation-defined caching policy. */ +#define AS_MEMATTR_LPAE_IMPL_DEF_CACHE_POLICY 0x48ull +/* The attribute set to force all resources to be cached. */ +#define AS_MEMATTR_LPAE_FORCE_TO_CACHE_ALL 0x4Full +/* Inner write-alloc cache setup, no outer caching */ +#define AS_MEMATTR_LPAE_WRITE_ALLOC 0x4Dull +/* Set to implementation defined, outer caching */ +#define AS_MEMATTR_LPAE_OUTER_IMPL_DEF 0x88ull +/* Set to write back memory, outer caching */ +#define AS_MEMATTR_LPAE_OUTER_WA 0x8Dull +/* There is no LPAE support for non-cacheable, since the memory type is always + * write-back. 
+ * Marking this setting as reserved for LPAE + */ +#define AS_MEMATTR_LPAE_NON_CACHEABLE_RESERVED + +/* L2_MMU_CONFIG register */ +#define L2_MMU_CONFIG_ALLOW_SNOOP_DISPARITY_SHIFT (23) +#define L2_MMU_CONFIG_ALLOW_SNOOP_DISPARITY (0x1 << L2_MMU_CONFIG_ALLOW_SNOOP_DISPARITY_SHIFT) + +/* End L2_MMU_CONFIG register */ + +/* THREAD_* registers */ + +/* THREAD_FEATURES IMPLEMENTATION_TECHNOLOGY values */ +#define IMPLEMENTATION_UNSPECIFIED 0 +#define IMPLEMENTATION_SILICON 1 +#define IMPLEMENTATION_FPGA 2 +#define IMPLEMENTATION_MODEL 3 + +/* Default values when registers are not supported by the implemented hardware */ +#define THREAD_MT_DEFAULT 256 +#define THREAD_MWS_DEFAULT 256 +#define THREAD_MBS_DEFAULT 256 +#define THREAD_MR_DEFAULT 1024 +#define THREAD_MTQ_DEFAULT 4 +#define THREAD_MTGS_DEFAULT 10 + +/* End THREAD_* registers */ + +/* SHADER_CONFIG register */ +#define SC_LS_ALLOW_ATTR_TYPES (1ul << 16) +#define SC_TLS_HASH_ENABLE (1ul << 17) +#define SC_LS_ATTR_CHECK_DISABLE (1ul << 18) +#define SC_VAR_ALGORITHM (1ul << 29) +/* End SHADER_CONFIG register */ + +/* TILER_CONFIG register */ +#define TC_CLOCK_GATE_OVERRIDE (1ul << 0) +/* End TILER_CONFIG register */ + +/* L2_CONFIG register */ +#define L2_CONFIG_SIZE_SHIFT 16 +#define L2_CONFIG_SIZE_MASK (0xFFul << L2_CONFIG_SIZE_SHIFT) +#define L2_CONFIG_HASH_SHIFT 24 +#define L2_CONFIG_HASH_MASK (0xFFul << L2_CONFIG_HASH_SHIFT) +#define L2_CONFIG_ASN_HASH_ENABLE_SHIFT 24 +#define L2_CONFIG_ASN_HASH_ENABLE_MASK (1ul << L2_CONFIG_ASN_HASH_ENABLE_SHIFT) +/* End L2_CONFIG register */ + +/* IDVS_GROUP register */ +#define IDVS_GROUP_SIZE_SHIFT (16) +#define IDVS_GROUP_MAX_SIZE (0x3F) + +#endif /* _UAPI_KBASE_GPU_REGMAP_H_ */ diff --git a/mali_kbase/jm/mali_base_jm_kernel.h b/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h index a72819e..cd81421 100644 --- a/mali_kbase/jm/mali_base_jm_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h @@ -19,8 +19,10 @@ * */ -#ifndef _BASE_JM_KERNEL_H_ -#define _BASE_JM_KERNEL_H_ +#ifndef _UAPI_BASE_JM_KERNEL_H_ +#define _UAPI_BASE_JM_KERNEL_H_ + +#include <linux/types.h> /* Memory allocation, access/hint flags. * @@ -207,8 +209,8 @@ #define BASE_JIT_ALLOC_MEM_TILER_ALIGN_TOP (1 << 0) /** - * If set, the heap info address points to a u32 holding the used size in bytes; - * otherwise it points to a u64 holding the lowest address of unused memory. + * If set, the heap info address points to a __u32 holding the used size in bytes; + * otherwise it points to a __u64 holding the lowest address of unused memory. */ #define BASE_JIT_ALLOC_HEAP_INFO_IS_SIZE (1 << 1) @@ -230,7 +232,7 @@ * These share the same space as BASEP_CONTEXT_FLAG_*, and so must * not collide with them. */ -typedef u32 base_context_create_flags; +typedef __u32 base_context_create_flags; /* No flags set */ #define BASE_CONTEXT_CREATE_FLAG_NONE ((base_context_create_flags)0) @@ -320,7 +322,7 @@ typedef u32 base_context_create_flags; * @blob: per-job data array */ struct base_jd_udata { - u64 blob[2]; + __u64 blob[2]; }; /** @@ -333,7 +335,7 @@ struct base_jd_udata { * When the flag is set for a particular dependency to signal that it is an * ordering only dependency then errors will not be propagated. 
*/ -typedef u8 base_jd_dep_type; +typedef __u8 base_jd_dep_type; #define BASE_JD_DEP_TYPE_INVALID (0) /**< Invalid dependency */ #define BASE_JD_DEP_TYPE_DATA (1U << 0) /**< Data dependency */ @@ -349,7 +351,7 @@ typedef u8 base_jd_dep_type; * Special case is ::BASE_JD_REQ_DEP, which is used to express complex * dependencies, and that doesn't execute anything on the hardware. */ -typedef u32 base_jd_core_req; +typedef __u32 base_jd_core_req; /* Requirements that come from the HW */ @@ -581,6 +583,13 @@ typedef u32 base_jd_core_req; */ #define BASE_JD_REQ_END_RENDERPASS ((base_jd_core_req)1 << 19) +/* SW-only requirement: The atom needs to run on a limited core mask affinity. + * + * If this bit is set then the kbase_context.limited_core_mask will be applied + * to the affinity. + */ +#define BASE_JD_REQ_LIMITED_CORE_MASK ((base_jd_core_req)1 << 20) + /* These requirement bits are currently unused in base_jd_core_req */ #define BASEP_JD_REQ_RESERVED \ @@ -591,7 +600,7 @@ typedef u32 base_jd_core_req; BASE_JD_REQ_FS_AFBC | BASE_JD_REQ_PERMON | \ BASE_JD_REQ_SKIP_CACHE_START | BASE_JD_REQ_SKIP_CACHE_END | \ BASE_JD_REQ_JOB_SLOT | BASE_JD_REQ_START_RENDERPASS | \ - BASE_JD_REQ_END_RENDERPASS)) + BASE_JD_REQ_END_RENDERPASS | BASE_JD_REQ_LIMITED_CORE_MASK)) /* Mask of all bits in base_jd_core_req that control the type of the atom. * @@ -636,7 +645,7 @@ enum kbase_jd_atom_state { /** * typedef base_atom_id - Type big enough to store an atom number in. */ -typedef u8 base_atom_id; +typedef __u8 base_atom_id; /** * struct base_dependency - @@ -699,10 +708,10 @@ struct base_dependency { * BASE_JD_REQ_END_RENDERPASS is set in the base_jd_core_req. */ struct base_jd_fragment { - u64 norm_read_norm_write; - u64 norm_read_forced_write; - u64 forced_read_forced_write; - u64 forced_read_norm_write; + __u64 norm_read_norm_write; + __u64 norm_read_forced_write; + __u64 forced_read_forced_write; + __u64 forced_read_norm_write; }; /** @@ -742,7 +751,7 @@ struct base_jd_fragment { * the same context. See KBASE_JS_SYSTEM_PRIORITY_MODE and * KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE for more details. */ -typedef u8 base_jd_prio; +typedef __u8 base_jd_prio; /* Medium atom priority. This is a priority higher than BASE_JD_PRIO_LOW */ #define BASE_JD_PRIO_MEDIUM ((base_jd_prio)0) @@ -793,32 +802,32 @@ typedef u8 base_jd_prio; * @padding: Unused. Must be zero. * * This structure has changed since UK 10.2 for which base_jd_core_req was a - * u16 value. + * __u16 value. * - * In UK 10.3 a core_req field of a u32 type was added to the end of the - * structure, and the place in the structure previously occupied by u16 + * In UK 10.3 a core_req field of a __u32 type was added to the end of the + * structure, and the place in the structure previously occupied by __u16 * core_req was kept but renamed to compat_core_req. * - * From UK 11.20 - compat_core_req is now occupied by u8 jit_id[2]. + * From UK 11.20 - compat_core_req is now occupied by __u8 jit_id[2]. * Compatibility with UK 10.x from UK 11.y is not handled because * the major version increase prevents this. * * For UK 11.20 jit_id[2] must be initialized to zero. 
*/ struct base_jd_atom_v2 { - u64 jc; + __u64 jc; struct base_jd_udata udata; - u64 extres_list; - u16 nr_extres; - u8 jit_id[2]; + __u64 extres_list; + __u16 nr_extres; + __u8 jit_id[2]; struct base_dependency pre_dep[2]; base_atom_id atom_number; base_jd_prio prio; - u8 device_nr; - u8 jobslot; + __u8 device_nr; + __u8 jobslot; base_jd_core_req core_req; - u8 renderpass_id; - u8 padding[7]; + __u8 renderpass_id; + __u8 padding[7]; }; /** @@ -853,20 +862,20 @@ struct base_jd_atom_v2 { * @padding: Unused. Must be zero. */ typedef struct base_jd_atom { - u64 seq_nr; - u64 jc; + __u64 seq_nr; + __u64 jc; struct base_jd_udata udata; - u64 extres_list; - u16 nr_extres; - u8 jit_id[2]; + __u64 extres_list; + __u16 nr_extres; + __u8 jit_id[2]; struct base_dependency pre_dep[2]; base_atom_id atom_number; base_jd_prio prio; - u8 device_nr; - u8 jobslot; + __u8 device_nr; + __u8 jobslot; base_jd_core_req core_req; - u8 renderpass_id; - u8 padding[7]; + __u8 renderpass_id; + __u8 padding[7]; } base_jd_atom; /* Job chain event code bits @@ -1181,11 +1190,11 @@ struct base_jd_event_v2 { */ struct base_dump_cpu_gpu_counters { - u64 system_time; - u64 cycle_counter; - u64 sec; - u32 usec; - u8 padding[36]; + __u64 system_time; + __u64 cycle_counter; + __u64 sec; + __u32 usec; + __u8 padding[36]; }; -#endif /* _BASE_JM_KERNEL_H_ */ +#endif /* _UAPI_BASE_JM_KERNEL_H_ */ diff --git a/mali_kbase/jm/mali_kbase_jm_ioctl.h b/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h index 93c9c44..1eb6bcb 100644 --- a/mali_kbase/jm/mali_kbase_jm_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -19,8 +19,8 @@ * */ -#ifndef _KBASE_JM_IOCTL_H_ -#define _KBASE_JM_IOCTL_H_ +#ifndef _UAPI_KBASE_JM_IOCTL_H_ +#define _UAPI_KBASE_JM_IOCTL_H_ #include <asm-generic/ioctl.h> #include <linux/types.h> @@ -116,9 +116,12 @@ * 11.30: * - Add a new priority level BASE_JD_PRIO_REALTIME * - Add ioctl 54: This controls the priority setting. + * 11.31: + * - Added BASE_JD_REQ_LIMITED_CORE_MASK. + * - Added ioctl 55: set_limited_core_count. */ #define BASE_UK_VERSION_MAJOR 11 -#define BASE_UK_VERSION_MINOR 30 +#define BASE_UK_VERSION_MINOR 31 /** * struct kbase_ioctl_version_check - Check version compatibility between @@ -217,4 +220,4 @@ union kbase_kinstr_jm_fd { #define KBASE_IOCTL_VERSION_CHECK_RESERVED \ _IOWR(KBASE_IOCTL_TYPE, 52, struct kbase_ioctl_version_check) -#endif /* _KBASE_JM_IOCTL_H_ */ +#endif /* _UAPI_KBASE_JM_IOCTL_H_ */ diff --git a/mali_kbase/mali_base_kernel.h b/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h index 5c173eb..554c5a3 100644 --- a/mali_kbase/mali_base_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h @@ -23,38 +23,46 @@ * Base structures shared with the kernel. 
*/ -#ifndef _BASE_KERNEL_H_ -#define _BASE_KERNEL_H_ +#ifndef _UAPI_BASE_KERNEL_H_ +#define _UAPI_BASE_KERNEL_H_ + +#include <linux/types.h> struct base_mem_handle { struct { - u64 handle; + __u64 handle; } basep; }; #include "mali_base_mem_priv.h" -#include "gpu/mali_kbase_gpu_coherency.h" #include "gpu/mali_kbase_gpu_id.h" +#include "gpu/mali_kbase_gpu_coherency.h" #define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 4 #define BASE_MAX_COHERENT_GROUPS 16 -#if defined CDBG_ASSERT +#if defined(CDBG_ASSERT) #define LOCAL_ASSERT CDBG_ASSERT -#elif defined KBASE_DEBUG_ASSERT +#elif defined(KBASE_DEBUG_ASSERT) #define LOCAL_ASSERT KBASE_DEBUG_ASSERT #else +#if defined(__KERNEL__) #error assert macro not defined! +#else +#define LOCAL_ASSERT(...) ((void)#__VA_ARGS__) +#endif #endif #if defined(PAGE_MASK) && defined(PAGE_SHIFT) #define LOCAL_PAGE_SHIFT PAGE_SHIFT #define LOCAL_PAGE_LSB ~PAGE_MASK #else -#include <osu/mali_osu.h> +#ifndef OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#define OSU_CONFIG_CPU_PAGE_SIZE_LOG2 12 +#endif -#if defined OSU_CONFIG_CPU_PAGE_SIZE_LOG2 +#if defined(OSU_CONFIG_CPU_PAGE_SIZE_LOG2) #define LOCAL_PAGE_SHIFT OSU_CONFIG_CPU_PAGE_SIZE_LOG2 #define LOCAL_PAGE_LSB ((1ul << OSU_CONFIG_CPU_PAGE_SIZE_LOG2) - 1) #else @@ -82,7 +90,7 @@ struct base_mem_handle { * More flags can be added to this list, as long as they don't clash * (see BASE_MEM_FLAGS_NR_BITS for the number of the first free bit). */ -typedef u32 base_mem_alloc_flags; +typedef __u32 base_mem_alloc_flags; /* A mask for all the flags which are modifiable via the base_mem_set_flags * interface. @@ -135,8 +143,8 @@ enum base_mem_import_type { */ struct base_mem_import_user_buffer { - u64 ptr; - u64 length; + __u64 ptr; + __u64 length; }; /* Mask to detect 4GB boundary alignment */ @@ -197,8 +205,8 @@ struct base_fence { */ struct base_mem_aliasing_info { struct base_mem_handle handle; - u64 offset; - u64 length; + __u64 offset; + __u64 length; }; /* Maximum percentage of just-in-time memory allocation trimming to perform @@ -220,11 +228,11 @@ struct base_mem_aliasing_info { * An array of structures was not supported */ struct base_jit_alloc_info_10_2 { - u64 gpu_alloc_addr; - u64 va_pages; - u64 commit_pages; - u64 extension; - u8 id; + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; }; /* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up @@ -247,16 +255,16 @@ struct base_jit_alloc_info_10_2 { * 11.10: Arrays of this structure are supported */ struct base_jit_alloc_info_11_5 { - u64 gpu_alloc_addr; - u64 va_pages; - u64 commit_pages; - u64 extension; - u8 id; - u8 bin_id; - u8 max_allocations; - u8 flags; - u8 padding[2]; - u16 usage_id; + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; }; /** @@ -302,17 +310,17 @@ struct base_jit_alloc_info_11_5 { * 11.20: added @heap_info_gpu_addr */ struct base_jit_alloc_info { - u64 gpu_alloc_addr; - u64 va_pages; - u64 commit_pages; - u64 extension; - u8 id; - u8 bin_id; - u8 max_allocations; - u8 flags; - u8 padding[2]; - u16 usage_id; - u64 heap_info_gpu_addr; + __u64 gpu_alloc_addr; + __u64 va_pages; + __u64 commit_pages; + __u64 extension; + __u8 id; + __u8 bin_id; + __u8 max_allocations; + __u8 flags; + __u8 padding[2]; + __u16 usage_id; + __u64 heap_info_gpu_addr; }; enum base_external_resource_access { @@ -321,7 +329,7 @@ enum base_external_resource_access { }; struct 
base_external_resource { - u64 ext_resource; + __u64 ext_resource; }; @@ -339,13 +347,13 @@ struct base_external_resource { * sized at allocation time. */ struct base_external_resource_list { - u64 count; + __u64 count; struct base_external_resource ext_res[1]; }; struct base_jd_debug_copy_buffer { - u64 address; - u64 size; + __u64 address; + __u64 size; struct base_external_resource extres; }; @@ -457,7 +465,7 @@ struct base_jd_debug_copy_buffer { * population count, since faulty cores may be disabled during production, * producing a non-contiguous mask. * - * The memory requirements for this algorithm can be determined either by a u64 + * The memory requirements for this algorithm can be determined either by a __u64 * population count on the L2_PRESENT mask (a LUT helper already is * required for the above), or simple assumption that there can be no more than * 16 coherent groups, since core groups are typically 4 cores. @@ -496,16 +504,16 @@ struct base_jd_debug_copy_buffer { * @num_exec_engines: The number of execution engines. */ struct mali_base_gpu_core_props { - u32 product_id; - u16 version_status; - u16 minor_revision; - u16 major_revision; - u16 padding; - u32 gpu_freq_khz_max; - u32 log2_program_counter_size; - u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; - u64 gpu_available_memory_size; - u8 num_exec_engines; + __u32 product_id; + __u16 version_status; + __u16 minor_revision; + __u16 major_revision; + __u16 padding; + __u32 gpu_freq_khz_max; + __u32 log2_program_counter_size; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u64 gpu_available_memory_size; + __u8 num_exec_engines; }; /* @@ -513,15 +521,15 @@ struct mali_base_gpu_core_props { * required by upper-level apis. */ struct mali_base_gpu_l2_cache_props { - u8 log2_line_size; - u8 log2_cache_size; - u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ - u8 padding[5]; + __u8 log2_line_size; + __u8 log2_cache_size; + __u8 num_l2_slices; /* Number of L2C slices. 1 or higher */ + __u8 padding[5]; }; struct mali_base_gpu_tiler_props { - u32 bin_size_bytes; /* Max is 4*2^15 */ - u32 max_active_levels; /* Max is 2^15 */ + __u32 bin_size_bytes; /* Max is 4*2^15 */ + __u32 max_active_levels; /* Max is 2^15 */ }; /** @@ -543,15 +551,15 @@ struct mali_base_gpu_tiler_props { * allocated for */ struct mali_base_gpu_thread_props { - u32 max_threads; - u32 max_workgroup_size; - u32 max_barrier_size; - u16 max_registers; - u8 max_task_queue; - u8 max_thread_group_split; - u8 impl_tech; - u8 padding[3]; - u32 tls_alloc; + __u32 max_threads; + __u32 max_workgroup_size; + __u32 max_barrier_size; + __u16 max_registers; + __u8 max_task_queue; + __u8 max_thread_group_split; + __u8 impl_tech; + __u8 padding[3]; + __u32 tls_alloc; }; /** @@ -570,9 +578,9 @@ struct mali_base_gpu_thread_props { * wastage. */ struct mali_base_gpu_coherent_group { - u64 core_mask; - u16 num_cores; - u16 padding[3]; + __u64 core_mask; + __u16 num_cores; + __u16 padding[3]; }; /** @@ -591,17 +599,17 @@ struct mali_base_gpu_coherent_group { * @group: Descriptors of coherent groups * * Note that the sizes of the members could be reduced. However, the \c group - * member might be 8-byte aligned to ensure the u64 core_mask is 8-byte + * member might be 8-byte aligned to ensure the __u64 core_mask is 8-byte * aligned, thus leading to wastage if the other members sizes were reduced. * * The groups are sorted by core mask. The core masks are non-repeating and do * not intersect. 
*/ struct mali_base_gpu_coherent_group_info { - u32 num_groups; - u32 num_core_groups; - u32 coherency; - u32 padding; + __u32 num_groups; + __u32 num_core_groups; + __u32 coherency; + __u32 padding; struct mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS]; }; @@ -644,37 +652,37 @@ struct mali_base_gpu_coherent_group_info { * */ struct gpu_raw_gpu_props { - u64 shader_present; - u64 tiler_present; - u64 l2_present; - u64 stack_present; - u32 l2_features; - u32 core_features; - u32 mem_features; - u32 mmu_features; + __u64 shader_present; + __u64 tiler_present; + __u64 l2_present; + __u64 stack_present; + __u32 l2_features; + __u32 core_features; + __u32 mem_features; + __u32 mmu_features; - u32 as_present; + __u32 as_present; - u32 js_present; - u32 js_features[GPU_MAX_JOB_SLOTS]; - u32 tiler_features; - u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u32 js_present; + __u32 js_features[GPU_MAX_JOB_SLOTS]; + __u32 tiler_features; + __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; - u32 gpu_id; + __u32 gpu_id; - u32 thread_max_threads; - u32 thread_max_workgroup_size; - u32 thread_max_barrier_size; - u32 thread_features; + __u32 thread_max_threads; + __u32 thread_max_workgroup_size; + __u32 thread_max_barrier_size; + __u32 thread_features; /* * Note: This is the _selected_ coherency mode rather than the * available modes as exposed in the coherency_features register. */ - u32 coherency_mode; + __u32 coherency_mode; - u32 thread_tls_alloc; - u64 gpu_features; + __u32 thread_tls_alloc; + __u64 gpu_features; }; /** @@ -695,7 +703,7 @@ struct gpu_raw_gpu_props { struct base_gpu_props { struct mali_base_gpu_core_props core_props; struct mali_base_gpu_l2_cache_props l2_props; - u64 unused_1; + __u64 unused_1; struct mali_base_gpu_tiler_props tiler_props; struct mali_base_gpu_thread_props thread_props; struct gpu_raw_gpu_props raw_props; @@ -717,7 +725,7 @@ struct base_gpu_props { * * Return: group ID(0~15) extracted from the parameter */ -static inline int base_mem_group_id_get(base_mem_alloc_flags flags) +static __inline__ int base_mem_group_id_get(base_mem_alloc_flags flags) { LOCAL_ASSERT((flags & ~BASE_MEM_FLAGS_INPUT_MASK) == 0); return (int)((flags & BASE_MEM_GROUP_ID_MASK) >> @@ -736,7 +744,7 @@ static inline int base_mem_group_id_get(base_mem_alloc_flags flags) * The return value can be combined with other flags against base_mem_alloc * to identify a specific memory group. */ -static inline base_mem_alloc_flags base_mem_group_id_set(int id) +static __inline__ base_mem_alloc_flags base_mem_group_id_set(int id) { if ((id < 0) || (id >= BASE_MEM_GROUP_COUNT)) { /* Set to default value when id is out of range. */ @@ -757,7 +765,7 @@ static inline base_mem_alloc_flags base_mem_group_id_set(int id) * * Return: Bitmask of flags to pass to base_context_init. */ -static inline base_context_create_flags base_context_mmu_group_id_set( +static __inline__ base_context_create_flags base_context_mmu_group_id_set( int const group_id) { LOCAL_ASSERT(group_id >= 0); @@ -777,7 +785,7 @@ static inline base_context_create_flags base_context_mmu_group_id_set( * * Return: Physical memory group ID. Valid range is 0..(BASE_MEM_GROUP_COUNT-1). 
*/ -static inline int base_context_mmu_group_id_get( +static __inline__ int base_context_mmu_group_id_get( base_context_create_flags const flags) { LOCAL_ASSERT(flags == (flags & BASEP_CONTEXT_CREATE_ALLOWED_FLAGS)); @@ -809,4 +817,10 @@ static inline int base_context_mmu_group_id_get( BASE_TIMEINFO_KERNEL_SOURCE_FLAG | \ BASE_TIMEINFO_USER_SOURCE_FLAG) -#endif /* _BASE_KERNEL_H_ */ +/* Maximum number of source allocations allowed to create an alias allocation. + * This needs to be 4096 * 6 to allow cube map arrays with up to 4096 array + * layers, since each cube map in the array will have 6 faces. + */ +#define BASE_MEM_ALIAS_MAX_ENTS ((size_t)24576) + +#endif /* _UAPI_BASE_KERNEL_H_ */ diff --git a/mali_kbase/mali_base_mem_priv.h b/common/include/uapi/gpu/arm/midgard/mali_base_mem_priv.h index 9f59a4f..982bd3d 100644 --- a/mali_kbase/mali_base_mem_priv.h +++ b/common/include/uapi/gpu/arm/midgard/mali_base_mem_priv.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2010-2015, 2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2015, 2020-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -19,8 +19,12 @@ * */ -#ifndef _BASE_MEM_PRIV_H_ -#define _BASE_MEM_PRIV_H_ +#ifndef _UAPI_BASE_MEM_PRIV_H_ +#define _UAPI_BASE_MEM_PRIV_H_ + +#include <linux/types.h> + +#include "mali_base_kernel.h" #define BASE_SYNCSET_OP_MSYNC (1U << 0) #define BASE_SYNCSET_OP_CSYNC (1U << 1) @@ -45,10 +49,10 @@ */ struct basep_syncset { struct base_mem_handle mem_handle; - u64 user_addr; - u64 size; - u8 type; - u8 padding[7]; + __u64 user_addr; + __u64 size; + __u8 type; + __u8 padding[7]; }; -#endif +#endif /* _UAPI_BASE_MEM_PRIV_H_ */ diff --git a/mali_kbase/mali_kbase_hwcnt_reader.h b/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h index 9f2172b..615dbb0 100644 --- a/mali_kbase/mali_kbase_hwcnt_reader.h +++ b/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h @@ -19,17 +19,18 @@ * */ -#ifndef _KBASE_HWCNT_READER_H_ -#define _KBASE_HWCNT_READER_H_ +#ifndef _UAPI_KBASE_HWCNT_READER_H_ +#define _UAPI_KBASE_HWCNT_READER_H_ #include <stddef.h> +#include <linux/types.h> /* The ids of ioctl commands. 
*/ #define KBASE_HWCNT_READER 0xBE -#define KBASE_HWCNT_READER_GET_HWVER _IOR(KBASE_HWCNT_READER, 0x00, u32) -#define KBASE_HWCNT_READER_GET_BUFFER_SIZE _IOR(KBASE_HWCNT_READER, 0x01, u32) -#define KBASE_HWCNT_READER_DUMP _IOW(KBASE_HWCNT_READER, 0x10, u32) -#define KBASE_HWCNT_READER_CLEAR _IOW(KBASE_HWCNT_READER, 0x11, u32) +#define KBASE_HWCNT_READER_GET_HWVER _IOR(KBASE_HWCNT_READER, 0x00, __u32) +#define KBASE_HWCNT_READER_GET_BUFFER_SIZE _IOR(KBASE_HWCNT_READER, 0x01, __u32) +#define KBASE_HWCNT_READER_DUMP _IOW(KBASE_HWCNT_READER, 0x10, __u32) +#define KBASE_HWCNT_READER_CLEAR _IOW(KBASE_HWCNT_READER, 0x11, __u32) #define KBASE_HWCNT_READER_GET_BUFFER _IOC(_IOC_READ, KBASE_HWCNT_READER, 0x20,\ offsetof(struct kbase_hwcnt_reader_metadata, cycles)) #define KBASE_HWCNT_READER_GET_BUFFER_WITH_CYCLES _IOR(KBASE_HWCNT_READER, 0x20,\ @@ -38,10 +39,10 @@ offsetof(struct kbase_hwcnt_reader_metadata, cycles)) #define KBASE_HWCNT_READER_PUT_BUFFER_WITH_CYCLES _IOW(KBASE_HWCNT_READER, 0x21,\ struct kbase_hwcnt_reader_metadata) -#define KBASE_HWCNT_READER_SET_INTERVAL _IOW(KBASE_HWCNT_READER, 0x30, u32) -#define KBASE_HWCNT_READER_ENABLE_EVENT _IOW(KBASE_HWCNT_READER, 0x40, u32) -#define KBASE_HWCNT_READER_DISABLE_EVENT _IOW(KBASE_HWCNT_READER, 0x41, u32) -#define KBASE_HWCNT_READER_GET_API_VERSION _IOW(KBASE_HWCNT_READER, 0xFF, u32) +#define KBASE_HWCNT_READER_SET_INTERVAL _IOW(KBASE_HWCNT_READER, 0x30, __u32) +#define KBASE_HWCNT_READER_ENABLE_EVENT _IOW(KBASE_HWCNT_READER, 0x40, __u32) +#define KBASE_HWCNT_READER_DISABLE_EVENT _IOW(KBASE_HWCNT_READER, 0x41, __u32) +#define KBASE_HWCNT_READER_GET_API_VERSION _IOW(KBASE_HWCNT_READER, 0xFF, __u32) #define KBASE_HWCNT_READER_GET_API_VERSION_WITH_FEATURES \ _IOW(KBASE_HWCNT_READER, 0xFF, \ struct kbase_hwcnt_reader_api_version) @@ -53,8 +54,8 @@ * @shader_cores: the cycles that have elapsed on the GPU shader cores */ struct kbase_hwcnt_reader_metadata_cycles { - u64 top; - u64 shader_cores; + __u64 top; + __u64 shader_cores; }; /** @@ -65,9 +66,9 @@ struct kbase_hwcnt_reader_metadata_cycles { * @cycles: the GPU cycles that occurred since the last sample */ struct kbase_hwcnt_reader_metadata { - u64 timestamp; - u32 event_id; - u32 buffer_idx; + __u64 timestamp; + __u32 event_id; + __u32 buffer_idx; struct kbase_hwcnt_reader_metadata_cycles cycles; }; @@ -84,7 +85,6 @@ enum base_hwcnt_reader_event { BASE_HWCNT_READER_EVENT_PERIODIC, BASE_HWCNT_READER_EVENT_PREJOB, BASE_HWCNT_READER_EVENT_POSTJOB, - BASE_HWCNT_READER_EVENT_COUNT }; @@ -97,9 +97,9 @@ enum base_hwcnt_reader_event { * @features: available features in this API version */ struct kbase_hwcnt_reader_api_version { - u32 version; - u32 features; + __u32 version; + __u32 features; }; -#endif /* _KBASE_HWCNT_READER_H_ */ +#endif /* _UAPI_KBASE_HWCNT_READER_H_ */ diff --git a/mali_kbase/mali_kbase_ioctl.h b/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h index 36dfc34..5ca528a 100644 --- a/mali_kbase/mali_kbase_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h @@ -19,8 +19,8 @@ * */ -#ifndef _KBASE_IOCTL_H_ -#define _KBASE_IOCTL_H_ +#ifndef _UAPI_KBASE_IOCTL_H_ +#define _UAPI_KBASE_IOCTL_H_ #ifdef __cpluscplus extern "C" { @@ -64,16 +64,16 @@ struct kbase_ioctl_set_flags { * @flags may be used in the future to request a different format for the * buffer. With @flags == 0 the following format is used. 
* - * The buffer will be filled with pairs of values, a u32 key identifying the + * The buffer will be filled with pairs of values, a __u32 key identifying the * property followed by the value. The size of the value is identified using * the bottom bits of the key. The value then immediately followed the key and * is tightly packed (there is no padding). All keys and values are * little-endian. * - * 00 = u8 - * 01 = u16 - * 10 = u32 - * 11 = u64 + * 00 = __u8 + * 01 = __u16 + * 10 = __u32 + * 11 = __u64 */ struct kbase_ioctl_get_gpuprops { __u64 buffer; @@ -134,9 +134,9 @@ union kbase_ioctl_mem_query { #define KBASE_IOCTL_MEM_QUERY \ _IOWR(KBASE_IOCTL_TYPE, 6, union kbase_ioctl_mem_query) -#define KBASE_MEM_QUERY_COMMIT_SIZE ((u64)1) -#define KBASE_MEM_QUERY_VA_SIZE ((u64)2) -#define KBASE_MEM_QUERY_FLAGS ((u64)3) +#define KBASE_MEM_QUERY_COMMIT_SIZE ((__u64)1) +#define KBASE_MEM_QUERY_VA_SIZE ((__u64)2) +#define KBASE_MEM_QUERY_FLAGS ((__u64)3) /** * struct kbase_ioctl_mem_free - Free a memory region @@ -529,7 +529,7 @@ struct kbase_ioctl_mem_profile_add { /** * struct kbase_ioctl_sticky_resource_map - Permanently map an external resource * @count: Number of resources - * @address: Array of u64 GPU addresses of the external resources to map + * @address: Array of __u64 GPU addresses of the external resources to map */ struct kbase_ioctl_sticky_resource_map { __u64 count; @@ -543,7 +543,7 @@ struct kbase_ioctl_sticky_resource_map { * struct kbase_ioctl_sticky_resource_map - Unmap a resource mapped which was * previously permanently mapped * @count: Number of resources - * @address: Array of u64 GPU addresses of the external resources to unmap + * @address: Array of __u64 GPU addresses of the external resources to unmap */ struct kbase_ioctl_sticky_resource_unmap { __u64 count; @@ -581,7 +581,6 @@ union kbase_ioctl_mem_find_gpu_start_and_offset { #define KBASE_IOCTL_MEM_FIND_GPU_START_AND_OFFSET \ _IOWR(KBASE_IOCTL_TYPE, 31, union kbase_ioctl_mem_find_gpu_start_and_offset) - #define KBASE_IOCTL_CINSTR_GWT_START \ _IO(KBASE_IOCTL_TYPE, 33) @@ -642,7 +641,7 @@ struct kbase_ioctl_mem_exec_init { * @out: Output parameters * @out.sec: Integer field of the monotonic time, unit in seconds. * @out.nsec: Fractional sec of the monotonic time, in nano-seconds. - * @out.padding: Unused, for u64 alignment + * @out.padding: Unused, for __u64 alignment * @out.timestamp: System wide timestamp (counter) value. * @out.cycle_counter: GPU cycle counter value. */ @@ -675,6 +674,19 @@ struct kbase_ioctl_context_priority_check { #define KBASE_IOCTL_CONTEXT_PRIORITY_CHECK \ _IOWR(KBASE_IOCTL_TYPE, 54, struct kbase_ioctl_context_priority_check) +/** + * struct kbase_ioctl_set_limited_core_count - Set the limited core count. 
+ * + * @max_core_count: Maximum core count + */ +struct kbase_ioctl_set_limited_core_count { + __u8 max_core_count; +}; + +#define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ + _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) + + /*************** * test ioctls * ***************/ @@ -685,23 +697,6 @@ struct kbase_ioctl_context_priority_check { #define KBASE_IOCTL_TEST_TYPE (KBASE_IOCTL_TYPE + 1) -/** - * struct kbase_ioctl_tlstream_test - Start a timeline stream test - * - * @tpw_count: number of trace point writers in each context - * @msg_delay: time delay between tracepoints from one writer in milliseconds - * @msg_count: number of trace points written by one writer - * @aux_msg: if non-zero aux messages will be included - */ -struct kbase_ioctl_tlstream_test { - __u32 tpw_count; - __u32 msg_delay; - __u32 msg_count; - __u32 aux_msg; -}; - -#define KBASE_IOCTL_TLSTREAM_TEST \ - _IOW(KBASE_IOCTL_TEST_TYPE, 1, struct kbase_ioctl_tlstream_test) /** * struct kbase_ioctl_tlstream_stats - Read tlstream stats for test purposes @@ -838,4 +833,4 @@ struct kbase_ioctl_tlstream_stats { } #endif -#endif +#endif /* _UAPI_KBASE_IOCTL_H_ */ diff --git a/mali_kbase/mali_kbase_kinstr_jm_reader.h b/common/include/uapi/gpu/arm/midgard/mali_kbase_kinstr_jm_reader.h index cbd495f..cb782bd 100644 --- a/mali_kbase/mali_kbase_kinstr_jm_reader.h +++ b/common/include/uapi/gpu/arm/midgard/mali_kbase_kinstr_jm_reader.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -34,8 +34,8 @@ * 8. Close the file descriptor */ -#ifndef _KBASE_KINSTR_JM_READER_H_ -#define _KBASE_KINSTR_JM_READER_H_ +#ifndef _UAPI_KBASE_KINSTR_JM_READER_H_ +#define _UAPI_KBASE_KINSTR_JM_READER_H_ /** * enum kbase_kinstr_jm_reader_atom_state - Determines the work state of an atom @@ -66,4 +66,4 @@ enum kbase_kinstr_jm_reader_atom_state { KBASE_KINSTR_JM_READER_ATOM_STATE_COUNT }; -#endif /* _KBASE_KINSTR_JM_READER_H_ */ +#endif /* _UAPI_KBASE_KINSTR_JM_READER_H_ */ diff --git a/mali_kbase/mali_uk.h b/common/include/uapi/gpu/arm/midgard/mali_uk.h index a499e02..81cbb9e 100644 --- a/mali_kbase/mali_uk.h +++ b/common/include/uapi/gpu/arm/midgard/mali_uk.h @@ -24,12 +24,12 @@ * and kernel side of the User-Kernel interface. 
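The GPU properties buffer format described above for KBASE_IOCTL_GET_GPUPROPS, and the new KBASE_IOCTL_SET_LIMITED_CORE_COUNT added at ioctl number 55, can be exercised from userspace along the following lines. This is a minimal sketch rather than driver code: it assumes the buffer has already been filled by KBASE_IOCTL_GET_GPUPROPS with flags == 0, that mali_fd is an already set-up kbase context file descriptor (the device node name is platform specific and not defined in these headers), and that the UAPI header include path matches however the headers are installed.

/* Minimal userspace sketch; the include path and mali_fd setup are assumptions. */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include "mali_kbase_ioctl.h"	/* kbase UAPI header; path is an assumption */

static uint64_t read_le(const uint8_t *p, size_t n)
{
	uint64_t v = 0;
	size_t i;

	for (i = 0; i < n; i++)
		v |= (uint64_t)p[i] << (8 * i);
	return v;
}

/* Walk the key/value pairs: a little-endian __u32 key whose bottom two bits
 * encode the value size (00 = __u8, 01 = __u16, 10 = __u32, 11 = __u64),
 * immediately followed by the tightly packed little-endian value.
 */
static void parse_gpuprops(const uint8_t *buf, size_t len)
{
	size_t pos = 0;

	while (len - pos >= sizeof(uint32_t)) {
		uint32_t key = (uint32_t)read_le(buf + pos, sizeof(uint32_t));
		size_t vsize = (size_t)1 << (key & 3);
		uint64_t value;

		pos += sizeof(uint32_t);
		if (len - pos < vsize)
			break;
		value = read_le(buf + pos, vsize);
		pos += vsize;

		printf("key 0x%08x (%zu-byte value) = 0x%llx\n",
		       (unsigned int)key, vsize, (unsigned long long)value);
	}
}

/* The new ioctl 55 simply passes the requested maximum core count. */
static int set_limited_core_count(int mali_fd, uint8_t max_core_count)
{
	struct kbase_ioctl_set_limited_core_count arg = {
		.max_core_count = max_core_count,
	};

	return ioctl(mali_fd, KBASE_IOCTL_SET_LIMITED_CORE_COUNT, &arg);
}

The key is treated as opaque here: the driver's property identifiers occupy the remaining bits of the key, with the bottom two bits reserved for the size code described above.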
*/ -#ifndef _UK_H_ -#define _UK_H_ +#ifndef _UAPI_UK_H_ +#define _UAPI_UK_H_ #ifdef __cplusplus extern "C" { -#endif /* __cplusplus */ +#endif /* __cplusplus */ /** * DOC: uk_api User-Kernel Interface API @@ -66,5 +66,5 @@ enum uk_client_id { #ifdef __cplusplus } -#endif /* __cplusplus */ -#endif /* _UK_H_ */ +#endif /* __cplusplus */ +#endif /* _UAPI_UK_H_ */ diff --git a/mali_kbase/Kbuild b/mali_kbase/Kbuild index 1c9e109..5463a24 100644 --- a/mali_kbase/Kbuild +++ b/mali_kbase/Kbuild @@ -20,11 +20,11 @@ # Driver version string which is returned to userspace via an ioctl -MALI_RELEASE_NAME ?= "r30p0-01eac0" +MALI_RELEASE_NAME ?= '"r31p0-01eac0"' # Paths required for build -# make $(src) as absolute path if it isn't already, by prefixing $(srctree) +# make $(src) as absolute path if it is not already, by prefixing $(srctree) src:=$(if $(patsubst /%,,$(src)),$(srctree)/$(src),$(src)) KBASE_PATH = $(src) KBASE_PLATFORM_PATH = $(KBASE_PATH)/platform_dummy @@ -64,7 +64,7 @@ DEFINES = \ -DMALI_KERNEL_TEST_API=$(MALI_KERNEL_TEST_API) \ -DMALI_UNIT_TEST=$(MALI_UNIT_TEST) \ -DMALI_COVERAGE=$(MALI_COVERAGE) \ - -DMALI_RELEASE_NAME=\"$(MALI_RELEASE_NAME)\" \ + -DMALI_RELEASE_NAME=$(MALI_RELEASE_NAME) \ -DMALI_JIT_PRESSURE_LIMIT_BASE=$(MALI_JIT_PRESSURE_LIMIT_BASE) \ -DMALI_INCREMENTAL_RENDERING=$(MALI_INCREMENTAL_RENDERING) @@ -114,7 +114,6 @@ SRC := \ mali_kbase_mem_profile_debugfs.c \ mmu/mali_kbase_mmu.c \ mmu/mali_kbase_mmu_hw_direct.c \ - mmu/mali_kbase_mmu_mode_lpae.c \ mmu/mali_kbase_mmu_mode_aarch64.c \ mali_kbase_disjoint_events.c \ mali_kbase_debug_mem_view.c \ @@ -170,9 +169,6 @@ ifeq ($(CONFIG_MALI_CINSTR_GWT),y) SRC += mali_kbase_gwt.c endif -ifeq ($(MALI_UNIT_TEST),1) - SRC += tl/mali_kbase_timeline_test.c -endif ifeq ($(MALI_CUSTOMER_RELEASE),0) SRC += mali_kbase_regs_dump_debugfs.c diff --git a/mali_kbase/Makefile b/mali_kbase/Makefile index 2ba2d77..84103af 100644 --- a/mali_kbase/Makefile +++ b/mali_kbase/Makefile @@ -27,7 +27,7 @@ ifeq ($(KBUILD_EXTMOD),) export CONFIG_MALI_MIDGARD?=m ifneq ($(CONFIG_MALI_MIDGARD),n) -export CONFIF_MALI_CSF_SUPPORT?=n +export CONFIG_MALI_CSF_SUPPORT?=n export CONFIG_MALI_KUTF?=m export CONFIG_MALI_REAL_HW?=y @@ -39,7 +39,7 @@ export CONFIG_MALI_DEVFREQ?=y endif DEFINES += -DCONFIG_MALI_MIDGARD=$(CONFIG_MALI_MIDGARD) \ - -DCONFIF_MALI_CSF_SUPPORT=$(CONFIF_MALI_CSF_SUPPORT) \ + -DCONFIG_MALI_CSF_SUPPORT=$(CONFIG_MALI_CSF_SUPPORT) \ -DCONFIG_MALI_KUTF=$(CONFIG_MALI_KUTF) \ -DCONFIG_MALI_REAL_HW=$(CONFIG_MALI_REAL_HW) \ -DCONFIG_MALI_GATOR_SUPPORT=$(CONFIG_MALI_GATOR_SUPPORT) \ @@ -50,13 +50,8 @@ export DEFINES endif endif -BUSLOG_PATH_RELATIVE = $(CURDIR)/../../../.. KBASE_PATH_RELATIVE = $(CURDIR) -ifeq ($(CONFIG_MALI_BUSLOG),y) -#Add bus logger symbols -EXTRA_SYMBOLS += $(BUSLOG_PATH_RELATIVE)/drivers/base/bus_logger/Module.symvers -endif # we get the symbols from modules using KBUILD_EXTRA_SYMBOLS to prevent warnings about unknown functions all: diff --git a/mali_kbase/arbiter/mali_kbase_arbif.c b/mali_kbase/arbiter/mali_kbase_arbif.c index 5ed5f80..7d6ab0c 100644 --- a/mali_kbase/arbiter/mali_kbase_arbif.c +++ b/mali_kbase/arbiter/mali_kbase_arbif.c @@ -30,6 +30,66 @@ #include <linux/of_platform.h> #include "mali_kbase_arbiter_interface.h" +/* Arbiter interface version against which was implemented this module */ +#define MALI_REQUIRED_KBASE_ARBITER_INTERFACE_VERSION 5 +#if MALI_REQUIRED_KBASE_ARBITER_INTERFACE_VERSION != \ + MALI_KBASE_ARBITER_INTERFACE_VERSION +#error "Unsupported Mali Arbiter interface version." 
+#endif + +static void on_max_config(struct device *dev, uint32_t max_l2_slices, + uint32_t max_core_mask) +{ + struct kbase_device *kbdev; + + if (!dev) { + pr_err("%s(): dev is NULL", __func__); + return; + } + + kbdev = dev_get_drvdata(dev); + if (!kbdev) { + dev_err(dev, "%s(): kbdev is NULL", __func__); + return; + } + + if (!max_l2_slices || !max_core_mask) { + dev_dbg(dev, + "%s(): max_config ignored as one of the fields is zero", + __func__); + return; + } + + /* set the max config info in the kbase device */ + kbase_arbiter_set_max_config(kbdev, max_l2_slices, max_core_mask); +} + +/** + * on_update_freq() - Updates GPU clock frequency + * @dev: arbiter interface device handle + * @freq: GPU clock frequency value reported from arbiter + * + * call back function to update GPU clock frequency with + * new value from arbiter + */ +static void on_update_freq(struct device *dev, uint32_t freq) +{ + struct kbase_device *kbdev; + + if (!dev) { + pr_err("%s(): dev is NULL", __func__); + return; + } + + kbdev = dev_get_drvdata(dev); + if (!kbdev) { + dev_err(dev, "%s(): kbdev is NULL", __func__); + return; + } + + kbase_arbiter_pm_update_gpu_freq(&kbdev->arb.arb_freq, freq); +} + /** * on_gpu_stop() - sends KBASE_VM_GPU_STOP_EVT event on VM stop * @dev: arbiter interface device handle @@ -38,7 +98,18 @@ */ static void on_gpu_stop(struct device *dev) { - struct kbase_device *kbdev = dev_get_drvdata(dev); + struct kbase_device *kbdev; + + if (!dev) { + pr_err("%s(): dev is NULL", __func__); + return; + } + + kbdev = dev_get_drvdata(dev); + if (!kbdev) { + dev_err(dev, "%s(): kbdev is NULL", __func__); + return; + } KBASE_TLSTREAM_TL_ARBITER_STOP_REQUESTED(kbdev, kbdev); kbase_arbiter_pm_vm_event(kbdev, KBASE_VM_GPU_STOP_EVT); @@ -52,7 +123,18 @@ static void on_gpu_stop(struct device *dev) */ static void on_gpu_granted(struct device *dev) { - struct kbase_device *kbdev = dev_get_drvdata(dev); + struct kbase_device *kbdev; + + if (!dev) { + pr_err("%s(): dev is NULL", __func__); + return; + } + + kbdev = dev_get_drvdata(dev); + if (!kbdev) { + dev_err(dev, "%s(): kbdev is NULL", __func__); + return; + } KBASE_TLSTREAM_TL_ARBITER_GRANTED(kbdev, kbdev); kbase_arbiter_pm_vm_event(kbdev, KBASE_VM_GPU_GRANTED_EVT); @@ -66,7 +148,18 @@ static void on_gpu_granted(struct device *dev) */ static void on_gpu_lost(struct device *dev) { - struct kbase_device *kbdev = dev_get_drvdata(dev); + struct kbase_device *kbdev; + + if (!dev) { + pr_err("%s(): dev is NULL", __func__); + return; + } + + kbdev = dev_get_drvdata(dev); + if (!kbdev) { + dev_err(dev, "%s(): kbdev is NULL", __func__); + return; + } kbase_arbiter_pm_vm_event(kbdev, KBASE_VM_GPU_LOST_EVT); } @@ -122,6 +215,12 @@ int kbase_arbif_init(struct kbase_device *kbdev) ops.arb_vm_gpu_stop = on_gpu_stop; ops.arb_vm_gpu_granted = on_gpu_granted; ops.arb_vm_gpu_lost = on_gpu_lost; + ops.arb_vm_max_config = on_max_config; + ops.arb_vm_update_freq = on_update_freq; + + + kbdev->arb.arb_freq.arb_freq = 0; + mutex_init(&kbdev->arb.arb_freq.arb_freq_lock); /* register kbase arbiter_if callbacks */ if (arb_if->vm_ops.vm_arb_register_dev) { @@ -133,6 +232,7 @@ int kbase_arbif_init(struct kbase_device *kbdev) return err; } } + #else /* CONFIG_OF */ dev_dbg(kbdev->dev, "No arbiter without Device Tree support\n"); kbdev->arb.arb_dev = NULL; @@ -162,6 +262,22 @@ void kbase_arbif_destroy(struct kbase_device *kbdev) } /** + * kbase_arbif_get_max_config() - Request max config info + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * 
call back function from arb interface to arbiter requesting max config info + */ +void kbase_arbif_get_max_config(struct kbase_device *kbdev) +{ + struct arbiter_if_dev *arb_if = kbdev->arb.arb_if; + + if (arb_if && arb_if->vm_ops.vm_arb_get_max_config) { + dev_dbg(kbdev->dev, "%s\n", __func__); + arb_if->vm_ops.vm_arb_get_max_config(arb_if); + } +} + +/** * kbase_arbif_gpu_request() - Request GPU from * @kbdev: The kbase device structure for the device (must be a valid pointer) * @@ -173,6 +289,7 @@ void kbase_arbif_gpu_request(struct kbase_device *kbdev) if (arb_if && arb_if->vm_ops.vm_arb_gpu_request) { dev_dbg(kbdev->dev, "%s\n", __func__); + KBASE_TLSTREAM_TL_ARBITER_REQUESTED(kbdev, kbdev); arb_if->vm_ops.vm_arb_gpu_request(arb_if); } } diff --git a/mali_kbase/arbiter/mali_kbase_arbif.h b/mali_kbase/arbiter/mali_kbase_arbif.h index c6a2031..710559c 100644 --- a/mali_kbase/arbiter/mali_kbase_arbif.h +++ b/mali_kbase/arbiter/mali_kbase_arbif.h @@ -72,6 +72,14 @@ int kbase_arbif_init(struct kbase_device *kbdev); void kbase_arbif_destroy(struct kbase_device *kbdev); /** + * kbase_arbif_get_max_config() - Request max config info + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * call back function from arb interface to arbiter requesting max config info + */ +void kbase_arbif_get_max_config(struct kbase_device *kbdev); + +/** * kbase_arbif_gpu_request() - Send GPU request message to the arbiter * @kbdev: The kbase device structure for the device (must be a valid pointer) * diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_defs.h b/mali_kbase/arbiter/mali_kbase_arbiter_defs.h index c754b6e..586c5d4 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_defs.h +++ b/mali_kbase/arbiter/mali_kbase_arbiter_defs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -44,6 +44,8 @@ * @vm_resume_work: Work item for vm_arb_wq to resume current work on GPU * @vm_arb_starting: Work queue resume in progress * @vm_arb_stopping: Work queue suspend in progress + * @interrupts_installed: Flag set when interrupts are installed + * @vm_request_timer: Timer to monitor GPU request */ struct kbase_arbiter_vm_state { struct kbase_device *kbdev; @@ -55,6 +57,8 @@ struct kbase_arbiter_vm_state { struct work_struct vm_resume_work; bool vm_arb_starting; bool vm_arb_stopping; + bool interrupts_installed; + struct hrtimer vm_request_timer; }; /** @@ -62,10 +66,12 @@ struct kbase_arbiter_vm_state { * allocated from the probe method of Mali driver * @arb_if: Pointer to the arbiter interface device * @arb_dev: Pointer to the arbiter device + * @arb_freq: GPU clock frequency retrieved from arbiter. */ struct kbase_arbiter_device { struct arbiter_if_dev *arb_if; struct device *arb_dev; + struct kbase_arbiter_freq arb_freq; }; #endif /* _MALI_KBASE_ARBITER_DEFS_H_ */ diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_interface.h b/mali_kbase/arbiter/mali_kbase_arbiter_interface.h index 958b0a1..84389e8 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_interface.h +++ b/mali_kbase/arbiter/mali_kbase_arbiter_interface.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. 
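For context on how the new max-config path is meant to be driven from the other side of the interface: kbase only issues the request above (kbase_arbif_get_max_config() forwarding to vm_arb_get_max_config), and expects the arbiter module to answer through the arb_vm_max_config callback documented in mali_kbase_arbiter_interface.h below. The following is a purely hypothetical arbiter-side fragment; the vm_dev/vm_ops bookkeeping and the limit values are illustrative assumptions, not anything defined by this patch.

/* Hypothetical arbiter-side sketch; not part of this patch. */
#include <linux/device.h>
#include "mali_kbase_arbiter_interface.h"	/* struct arbiter_if_arb_vm_ops */

static void example_arbiter_report_max_config(struct device *vm_dev,
					      struct arbiter_if_arb_vm_ops *vm_ops)
{
	/* Illustrative limits; a real arbiter derives them from the
	 * resources assigned to the requesting partition.
	 */
	uint32_t max_l2_slices = 2;
	uint32_t max_core_mask = 0xFFFF;

	if (vm_ops && vm_ops->arb_vm_max_config)
		vm_ops->arb_vm_max_config(vm_dev, max_l2_slices, max_core_mask);
}

On the kbase side such a report lands in on_max_config() above, which ignores it if either field is zero and otherwise stores it via kbase_arbiter_set_max_config().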
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -28,7 +28,7 @@ #define _MALI_KBASE_ARBITER_INTERFACE_H_ /** - * @brief Mali arbiter interface version + * Mali arbiter interface version * * This specifies the current version of the configuration interface. Whenever * the arbiter interface changes, so that integration effort is required, the @@ -39,8 +39,15 @@ * 1 - Added the Mali arbiter configuration interface. * 2 - Strip out reference code from header * 3 - Removed DVFS utilization interface (DVFS moved to arbiter side) + * 4 - Added max_config support + * 5 - Added GPU clock frequency reporting support from arbiter */ -#define MALI_KBASE_ARBITER_INTERFACE_VERSION 3 +#define MALI_KBASE_ARBITER_INTERFACE_VERSION 5 + +/** + * NO_FREQ is used in case platform doesn't support reporting frequency + */ +#define NO_FREQ 0 struct arbiter_if_dev; @@ -86,6 +93,27 @@ struct arbiter_if_arb_vm_ops { * If successful, will respond with a vm_arb_gpu_stopped message. */ void (*arb_vm_gpu_lost)(struct device *dev); + + /** + * arb_vm_max_config() - Send max config info to the VM + * @dev: The arbif kernel module device. + * @max_l2_slices: The maximum number of L2 slices. + * @max_core_mask: The largest core mask. + * + * Informs KBase the maximum resources that can be allocated to the + * partition in use. + */ + void (*arb_vm_max_config)(struct device *dev, uint32_t max_l2_slices, + uint32_t max_core_mask); + + /** + * arb_vm_update_freq() - GPU clock frequency has been updated + * @dev: The arbif kernel module device. + * @freq: GPU clock frequency value reported from arbiter + * + * Informs KBase that the GPU clock frequency has been updated. + */ + void (*arb_vm_update_freq)(struct device *dev, uint32_t freq); }; /** @@ -115,6 +143,13 @@ struct arbiter_if_vm_arb_ops { void (*vm_arb_unregister_dev)(struct arbiter_if_dev *arbif_dev); /** + * vm_arb_gpu_get_max_config() - Request the max config from the + * Arbiter. + * @arbif_dev: The arbiter interface we want to issue the request. + */ + void (*vm_arb_get_max_config)(struct arbiter_if_dev *arbif_dev); + + /** * vm_arb_gpu_request() - Ask the arbiter interface for GPU access. * @arbif_dev: The arbiter interface we want to issue the request. */ diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_pm.c b/mali_kbase/arbiter/mali_kbase_arbiter_pm.c index 08a6872..456cc70 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_pm.c +++ b/mali_kbase/arbiter/mali_kbase_arbiter_pm.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -20,14 +20,33 @@ */ /** - * @file mali_kbase_arbiter_pm.c + * @file * Mali arbiter power manager state machine and APIs */ #include <mali_kbase.h> #include <mali_kbase_pm.h> +#include <mali_kbase_hwaccess_jm.h> #include <mali_kbase_irq_internal.h> +#include <mali_kbase_hwcnt_context.h> +#include <mali_kbase_pm_internal.h> #include <tl/mali_kbase_tracepoints.h> +#include <mali_kbase_gpuprops.h> + +/* A dmesg warning will occur if the GPU is not granted + * after the following time (in milliseconds) has ellapsed. 
+ */ +#define GPU_REQUEST_TIMEOUT 1000 + +#define MAX_L2_SLICES_MASK 0xFF + +/* Maximum time in ms, before deferring probe incase + * GPU_GRANTED message is not received + */ +static int gpu_req_timeout = 1; +module_param(gpu_req_timeout, int, 0644); +MODULE_PARM_DESC(gpu_req_timeout, + "On a virtualized platform, if the GPU is not granted within this time(ms) kbase will defer the probe"); static void kbase_arbiter_pm_vm_wait_gpu_assignment(struct kbase_device *kbdev); static inline bool kbase_arbiter_pm_vm_gpu_assigned_lockheld( @@ -195,6 +214,60 @@ static void kbase_arbiter_pm_resume_wq(struct work_struct *data) } /** + * request_timer_callback() - Issue warning on request timer expiration + * @timer: Request hr timer data + * + * Called when the Arbiter takes too long to grant the GPU after a + * request has been made. Issues a warning in dmesg. + * + * Return: Always returns HRTIMER_NORESTART + */ +static enum hrtimer_restart request_timer_callback(struct hrtimer *timer) +{ + struct kbase_arbiter_vm_state *arb_vm_state = container_of(timer, + struct kbase_arbiter_vm_state, vm_request_timer); + + KBASE_DEBUG_ASSERT(arb_vm_state); + KBASE_DEBUG_ASSERT(arb_vm_state->kbdev); + + dev_warn(arb_vm_state->kbdev->dev, + "Still waiting for GPU to be granted from Arbiter after %d ms\n", + GPU_REQUEST_TIMEOUT); + return HRTIMER_NORESTART; +} + +/** + * start_request_timer() - Start a timer after requesting GPU + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * Start a timer to track when kbase is waiting for the GPU from the + * Arbiter. If the timer expires before GPU is granted, a warning in + * dmesg will be issued. + */ +static void start_request_timer(struct kbase_device *kbdev) +{ + struct kbase_arbiter_vm_state *arb_vm_state = kbdev->pm.arb_vm_state; + + hrtimer_start(&arb_vm_state->vm_request_timer, + HR_TIMER_DELAY_MSEC(GPU_REQUEST_TIMEOUT), + HRTIMER_MODE_REL); +} + +/** + * cancel_request_timer() - Stop the request timer + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * Stops the request timer once GPU has been granted. Safe to call + * even if timer is no longer running. + */ +static void cancel_request_timer(struct kbase_device *kbdev) +{ + struct kbase_arbiter_vm_state *arb_vm_state = kbdev->pm.arb_vm_state; + + hrtimer_cancel(&arb_vm_state->vm_request_timer); +} + +/** * kbase_arbiter_pm_early_init() - Initialize arbiter for VM * Paravirtualized use. 
* @kbdev: The kbase device structure for the device (must be a valid pointer) @@ -230,6 +303,10 @@ int kbase_arbiter_pm_early_init(struct kbase_device *kbdev) INIT_WORK(&arb_vm_state->vm_resume_work, kbase_arbiter_pm_resume_wq); arb_vm_state->vm_arb_starting = false; atomic_set(&kbdev->pm.gpu_users_waiting, 0); + hrtimer_init(&arb_vm_state->vm_request_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + arb_vm_state->vm_request_timer.function = + request_timer_callback; kbdev->pm.arb_vm_state = arb_vm_state; err = kbase_arbif_init(kbdev); @@ -237,17 +314,31 @@ int kbase_arbiter_pm_early_init(struct kbase_device *kbdev) dev_err(kbdev->dev, "Failed to initialise arbif module\n"); goto arbif_init_fail; } + if (kbdev->arb.arb_if) { kbase_arbif_gpu_request(kbdev); dev_dbg(kbdev->dev, "Waiting for initial GPU assignment...\n"); - wait_event(arb_vm_state->vm_state_wait, + err = wait_event_timeout(arb_vm_state->vm_state_wait, arb_vm_state->vm_state == - KBASE_VM_STATE_INITIALIZING_WITH_GPU); + KBASE_VM_STATE_INITIALIZING_WITH_GPU, + msecs_to_jiffies(gpu_req_timeout)); + + if (!err) { + dev_dbg(kbdev->dev, + "Kbase probe Deferred after waiting %d ms to receive GPU_GRANT\n", + gpu_req_timeout); + err = -EPROBE_DEFER; + goto arbif_eprobe_defer; + } + dev_dbg(kbdev->dev, "Waiting for initial GPU assignment - done\n"); } return 0; +arbif_eprobe_defer: + kbase_arbiter_pm_early_term(kbdev); + return err; arbif_init_fail: destroy_workqueue(arb_vm_state->vm_arb_wq); kfree(arb_vm_state); @@ -265,14 +356,15 @@ void kbase_arbiter_pm_early_term(struct kbase_device *kbdev) { struct kbase_arbiter_vm_state *arb_vm_state = kbdev->pm.arb_vm_state; + cancel_request_timer(kbdev); mutex_lock(&arb_vm_state->vm_state_lock); if (arb_vm_state->vm_state > KBASE_VM_STATE_STOPPED_GPU_REQUESTED) { kbase_pm_set_gpu_lost(kbdev, false); kbase_arbif_gpu_stopped(kbdev, false); } mutex_unlock(&arb_vm_state->vm_state_lock); - kbase_arbif_destroy(kbdev); destroy_workqueue(arb_vm_state->vm_arb_wq); + kbase_arbif_destroy(kbdev); arb_vm_state->vm_arb_wq = NULL; kfree(kbdev->pm.arb_vm_state); kbdev->pm.arb_vm_state = NULL; @@ -282,19 +374,36 @@ void kbase_arbiter_pm_early_term(struct kbase_device *kbdev) * kbase_arbiter_pm_release_interrupts() - Release the GPU interrupts * @kbdev: The kbase device structure for the device (must be a valid pointer) * - * Releases interrupts if needed (GPU is available) otherwise does nothing + * Releases interrupts and set the interrupt flag to false */ void kbase_arbiter_pm_release_interrupts(struct kbase_device *kbdev) { struct kbase_arbiter_vm_state *arb_vm_state = kbdev->pm.arb_vm_state; mutex_lock(&arb_vm_state->vm_state_lock); - if (!kbdev->arb.arb_if || - arb_vm_state->vm_state > - KBASE_VM_STATE_STOPPED_GPU_REQUESTED) + if (arb_vm_state->interrupts_installed == true) { + arb_vm_state->interrupts_installed = false; kbase_release_interrupts(kbdev); + } + mutex_unlock(&arb_vm_state->vm_state_lock); +} +/** + * kbase_arbiter_pm_install_interrupts() - Install the GPU interrupts + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * Install interrupts and set the interrupt_install flag to true. 
+ */ +int kbase_arbiter_pm_install_interrupts(struct kbase_device *kbdev) +{ + struct kbase_arbiter_vm_state *arb_vm_state = kbdev->pm.arb_vm_state; + int err; + + mutex_lock(&arb_vm_state->vm_state_lock); + arb_vm_state->interrupts_installed = true; + err = kbase_install_interrupts(kbdev); mutex_unlock(&arb_vm_state->vm_state_lock); + return err; } /** @@ -317,7 +426,12 @@ void kbase_arbiter_pm_vm_stopped(struct kbase_device *kbdev) dev_dbg(kbdev->dev, "%s %s\n", __func__, kbase_arbiter_pm_vm_state_str(arb_vm_state->vm_state)); - kbase_release_interrupts(kbdev); + + if (arb_vm_state->interrupts_installed) { + arb_vm_state->interrupts_installed = false; + kbase_release_interrupts(kbdev); + } + switch (arb_vm_state->vm_state) { case KBASE_VM_STATE_STOPPING_ACTIVE: request_gpu = true; @@ -338,6 +452,71 @@ void kbase_arbiter_pm_vm_stopped(struct kbase_device *kbdev) kbase_pm_set_gpu_lost(kbdev, false); kbase_arbif_gpu_stopped(kbdev, request_gpu); + if (request_gpu) + start_request_timer(kbdev); +} + +void kbase_arbiter_set_max_config(struct kbase_device *kbdev, + uint32_t max_l2_slices, + uint32_t max_core_mask) +{ + struct kbase_arbiter_vm_state *arb_vm_state; + struct max_config_props max_config; + + if (!kbdev) + return; + + /* Mask the max_l2_slices as it is stored as 8 bits into kbase */ + max_config.l2_slices = max_l2_slices & MAX_L2_SLICES_MASK; + max_config.core_mask = max_core_mask; + arb_vm_state = kbdev->pm.arb_vm_state; + + mutex_lock(&arb_vm_state->vm_state_lock); + /* Just set the max_props in kbase during initialization. */ + if (arb_vm_state->vm_state == KBASE_VM_STATE_INITIALIZING) + kbase_gpuprops_set_max_config(kbdev, &max_config); + else + dev_dbg(kbdev->dev, "Unexpected max_config on VM state %s", + kbase_arbiter_pm_vm_state_str(arb_vm_state->vm_state)); + + mutex_unlock(&arb_vm_state->vm_state_lock); +} + +int kbase_arbiter_pm_gpu_assigned(struct kbase_device *kbdev) +{ + struct kbase_arbiter_vm_state *arb_vm_state; + int result = -EINVAL; + + if (!kbdev) + return result; + + /* First check the GPU_LOST state */ + kbase_pm_lock(kbdev); + if (kbase_pm_is_gpu_lost(kbdev)) { + kbase_pm_unlock(kbdev); + return 0; + } + kbase_pm_unlock(kbdev); + + /* Then the arbitration state machine */ + arb_vm_state = kbdev->pm.arb_vm_state; + + mutex_lock(&arb_vm_state->vm_state_lock); + switch (arb_vm_state->vm_state) { + case KBASE_VM_STATE_INITIALIZING: + case KBASE_VM_STATE_SUSPENDED: + case KBASE_VM_STATE_STOPPED: + case KBASE_VM_STATE_STOPPED_GPU_REQUESTED: + case KBASE_VM_STATE_SUSPEND_WAIT_FOR_GRANT: + result = 0; + break; + default: + result = 1; + break; + } + mutex_unlock(&arb_vm_state->vm_state_lock); + + return result; } /** @@ -351,6 +530,7 @@ static void kbase_arbiter_pm_vm_gpu_start(struct kbase_device *kbdev) struct kbase_arbiter_vm_state *arb_vm_state = kbdev->pm.arb_vm_state; lockdep_assert_held(&arb_vm_state->vm_state_lock); + cancel_request_timer(kbdev); switch (arb_vm_state->vm_state) { case KBASE_VM_STATE_INITIALIZING: kbase_arbiter_pm_vm_set_state(kbdev, @@ -358,7 +538,14 @@ static void kbase_arbiter_pm_vm_gpu_start(struct kbase_device *kbdev) break; case KBASE_VM_STATE_STOPPED_GPU_REQUESTED: kbase_arbiter_pm_vm_set_state(kbdev, KBASE_VM_STATE_STARTING); + arb_vm_state->interrupts_installed = true; kbase_install_interrupts(kbdev); + /* + * GPU GRANTED received while in stop can be a result of a + * repartitioning. + */ + kbase_gpuprops_req_curr_config_update(kbdev); + /* curr_config will be updated while resuming the PM. 
*/ queue_work(arb_vm_state->vm_arb_wq, &arb_vm_state->vm_resume_work); break; @@ -591,6 +778,7 @@ static void kbase_arbiter_pm_vm_os_resume(struct kbase_device *kbdev) kbase_arbiter_pm_vm_set_state(kbdev, KBASE_VM_STATE_STOPPED_GPU_REQUESTED); kbase_arbif_gpu_request(kbdev); + start_request_timer(kbdev); /* Release lock and block resume OS function until we have * asynchronously received the GRANT message from the Arbiter and @@ -764,6 +952,7 @@ int kbase_arbiter_pm_ctx_active_handle_suspend(struct kbase_device *kbdev, kbase_arbiter_pm_vm_set_state(kbdev, KBASE_VM_STATE_STOPPED_GPU_REQUESTED); kbase_arbif_gpu_request(kbdev); + start_request_timer(kbdev); } else if (arb_vm_state->vm_state == KBASE_VM_STATE_INITIALIZING_WITH_GPU) break; @@ -811,3 +1000,60 @@ int kbase_arbiter_pm_ctx_active_handle_suspend(struct kbase_device *kbdev, } return res; } + +/** + * kbase_arbiter_pm_update_gpu_freq() - Updates GPU clock frequency received + * from arbiter. + * @arb_freq - Pointer to struchture holding GPU clock frequenecy data + * @freq - New frequency value + */ +void kbase_arbiter_pm_update_gpu_freq(struct kbase_arbiter_freq *arb_freq, + uint32_t freq) +{ + mutex_lock(&arb_freq->arb_freq_lock); + arb_freq->arb_freq = freq; + mutex_unlock(&arb_freq->arb_freq_lock); +} + +/** + * enumerate_arb_gpu_clk() - Enumerate a GPU clock on the given index + * @kbdev - kbase_device pointer + * @index - GPU clock index + * + * Returns pointer to structure holding GPU clock frequency data reported from + * arbiter, only index 0 is valid. + */ +static void *enumerate_arb_gpu_clk(struct kbase_device *kbdev, + unsigned int index) +{ + if (index == 0) + return &kbdev->arb.arb_freq; + return NULL; +} + +/** + * get_arb_gpu_clk_rate() - Get the current rate of GPU clock frequency value + * @kbdev - kbase_device pointer + * @index - GPU clock index + * + * Returns the GPU clock frequency value saved when gpu is granted from arbiter + */ +static unsigned long get_arb_gpu_clk_rate(struct kbase_device *kbdev, + void *gpu_clk_handle) +{ + uint32_t freq; + struct kbase_arbiter_freq *arb_dev_freq = + (struct kbase_arbiter_freq *) gpu_clk_handle; + + mutex_lock(&arb_dev_freq->arb_freq_lock); + freq = arb_dev_freq->arb_freq; + mutex_unlock(&arb_dev_freq->arb_freq_lock); + return freq; +} + +struct kbase_clk_rate_trace_op_conf arb_clk_rate_trace_ops = { + .get_gpu_clk_rate = get_arb_gpu_clk_rate, + .enumerate_gpu_clk = enumerate_arb_gpu_clk, + .gpu_clk_notifier_register = NULL, + .gpu_clk_notifier_unregister = NULL +}; diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_pm.h b/mali_kbase/arbiter/mali_kbase_arbiter_pm.h index ef82271..0f74b63 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_pm.h +++ b/mali_kbase/arbiter/mali_kbase_arbiter_pm.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. 
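The arb_clk_rate_trace_ops table defined above is what the clock rate trace manager picks up when the device tree exposes an arbiter_if node (see the get_clk_rate_trace_callbacks() change further down in mali_kbase_clk_rate_trace_mgr.c). A small sketch of how a consumer could read the arbiter-reported clock through that table; only index 0 enumerates a clock, the returned value is whatever the arbiter last reported (0, i.e. NO_FREQ, means nothing has been reported), and the helper name is illustrative rather than part of the driver.

/* Illustrative consumer of arb_clk_rate_trace_ops; not part of this patch. */
static unsigned long example_read_arb_gpu_clk(struct kbase_device *kbdev)
{
	void *clk = arb_clk_rate_trace_ops.enumerate_gpu_clk(kbdev, 0);

	if (!clk)
		return 0;	/* only index 0 is a valid GPU clock here */

	/* Returns the last frequency value received from the arbiter,
	 * or 0 (NO_FREQ) if none has been reported yet.
	 */
	return arb_clk_rate_trace_ops.get_gpu_clk_rate(kbdev, clk);
}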
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -93,11 +93,19 @@ void kbase_arbiter_pm_early_term(struct kbase_device *kbdev); * kbase_arbiter_pm_release_interrupts() - Release the GPU interrupts * @kbdev: The kbase device structure for the device (must be a valid pointer) * - * Releases interrupts if needed (GPU is available) otherwise does nothing + * Releases interrupts and set the interrupt flag to false */ void kbase_arbiter_pm_release_interrupts(struct kbase_device *kbdev); /** + * kbase_arbiter_pm_install_interrupts() - Install the GPU interrupts + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * Install interrupts and set the interrupt_install flag to true. + */ +int kbase_arbiter_pm_install_interrupts(struct kbase_device *kbdev); + +/** * kbase_arbiter_pm_vm_event() - Dispatch VM event to the state machine * @kbdev: The kbase device structure for the device (must be a valid pointer) * @@ -133,4 +141,42 @@ int kbase_arbiter_pm_ctx_active_handle_suspend(struct kbase_device *kbdev, */ void kbase_arbiter_pm_vm_stopped(struct kbase_device *kbdev); +/** + * kbase_arbiter_set_max_config() - Set the max config data in kbase device. + * @kbdev: The kbase device structure for the device (must be a valid pointer). + * @max_l2_slices: The maximum number of L2 slices. + * @max_core_mask: The largest core mask. + * + * This function handles a stop event for the VM. + * It will update the VM state and forward the stop event to the driver. + */ +void kbase_arbiter_set_max_config(struct kbase_device *kbdev, + uint32_t max_l2_slices, + uint32_t max_core_mask); + +/** + * kbase_arbiter_pm_gpu_assigned() - Determine if this VM has access to the GPU + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * Return: 0 if the VM does not have access, 1 if it does, and a negative number + * if an error occurred + */ +int kbase_arbiter_pm_gpu_assigned(struct kbase_device *kbdev); + +extern struct kbase_clk_rate_trace_op_conf arb_clk_rate_trace_ops; + +/** + * struct kbase_arbiter_freq - Holding the GPU clock frequency data retrieved + * from arbiter + * @arb_freq: GPU clock frequency value + * @arb_freq_lock: Mutex protecting access to arbfreq value + */ +struct kbase_arbiter_freq { + uint32_t arb_freq; + struct mutex arb_freq_lock; +}; + +void kbase_arbiter_pm_update_gpu_freq(struct kbase_arbiter_freq *arb_freq, + uint32_t freq); + #endif /*_MALI_KBASE_ARBITER_PM_H_ */ diff --git a/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h b/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h index 84fb1fc..fcf4e5b 100644 --- a/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h +++ b/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2014-2016, 2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2016, 2020-2021 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -23,7 +23,7 @@ #define _KBASE_CACHE_POLICY_BACKEND_H_ #include "mali_kbase.h" -#include "mali_base_kernel.h" +#include <uapi/gpu/arm/midgard/mali_base_kernel.h> /** * kbase_cache_set_coherency_mode() - Sets the system coherency mode diff --git a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c index dcd1b02..7076ab4 100644 --- a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c +++ b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -39,6 +39,38 @@ #define CLK_RATE_TRACE_OPS (NULL) #endif +/** + * get_clk_rate_trace_callbacks() - Returns pointer to clk trace ops. + * @kbdev: Pointer to kbase device, used to check if arbitration is enabled + * when compiled with arbiter support. + * Return: Pointer to clk trace ops if supported or NULL. + */ +static struct kbase_clk_rate_trace_op_conf * +get_clk_rate_trace_callbacks(struct kbase_device *kbdev __maybe_unused) +{ + /* base case */ + struct kbase_clk_rate_trace_op_conf *callbacks = + (struct kbase_clk_rate_trace_op_conf *)CLK_RATE_TRACE_OPS; +#if defined(CONFIG_MALI_ARBITER_SUPPORT) && defined(CONFIG_OF) + const void *arbiter_if_node; + + if (WARN_ON(!kbdev) || WARN_ON(!kbdev->dev)) + return callbacks; + + arbiter_if_node = + of_get_property(kbdev->dev->of_node, "arbiter_if", NULL); + /* Arbitration enabled, override the callback pointer.*/ + if (arbiter_if_node) + callbacks = &arb_clk_rate_trace_ops; + else + dev_dbg(kbdev->dev, + "Arbitration supported but disabled by platform. 
Leaving clk rate callbacks as default.\n"); + +#endif + + return callbacks; +} + static int gpu_clk_rate_change_notifier(struct notifier_block *nb, unsigned long event, void *data) { @@ -69,12 +101,13 @@ static int gpu_clk_rate_change_notifier(struct notifier_block *nb, static int gpu_clk_data_init(struct kbase_device *kbdev, void *gpu_clk_handle, unsigned int index) { - struct kbase_clk_rate_trace_op_conf *callbacks = - (struct kbase_clk_rate_trace_op_conf *)CLK_RATE_TRACE_OPS; + struct kbase_clk_rate_trace_op_conf *callbacks; struct kbase_clk_data *clk_data; struct kbase_clk_rate_trace_manager *clk_rtm = &kbdev->pm.clk_rtm; int ret = 0; + callbacks = get_clk_rate_trace_callbacks(kbdev); + if (WARN_ON(!callbacks) || WARN_ON(!gpu_clk_handle) || WARN_ON(index >= BASE_MAX_NR_CLOCKS_REGULATORS)) @@ -108,8 +141,9 @@ static int gpu_clk_data_init(struct kbase_device *kbdev, clk_data->clk_rate_change_nb.notifier_call = gpu_clk_rate_change_notifier; - ret = callbacks->gpu_clk_notifier_register(kbdev, gpu_clk_handle, - &clk_data->clk_rate_change_nb); + if (callbacks->gpu_clk_notifier_register) + ret = callbacks->gpu_clk_notifier_register(kbdev, + gpu_clk_handle, &clk_data->clk_rate_change_nb); if (ret) { dev_err(kbdev->dev, "Failed to register notifier for clock enumerated at index %u", index); kfree(clk_data); @@ -120,12 +154,13 @@ static int gpu_clk_data_init(struct kbase_device *kbdev, int kbase_clk_rate_trace_manager_init(struct kbase_device *kbdev) { - struct kbase_clk_rate_trace_op_conf *callbacks = - (struct kbase_clk_rate_trace_op_conf *)CLK_RATE_TRACE_OPS; + struct kbase_clk_rate_trace_op_conf *callbacks; struct kbase_clk_rate_trace_manager *clk_rtm = &kbdev->pm.clk_rtm; unsigned int i; int ret = 0; + callbacks = get_clk_rate_trace_callbacks(kbdev); + spin_lock_init(&clk_rtm->lock); INIT_LIST_HEAD(&clk_rtm->listeners); @@ -186,9 +221,10 @@ void kbase_clk_rate_trace_manager_term(struct kbase_device *kbdev) if (!clk_rtm->clks[i]) break; - clk_rtm->clk_rate_trace_ops->gpu_clk_notifier_unregister( - kbdev, clk_rtm->clks[i]->gpu_clk_handle, - &clk_rtm->clks[i]->clk_rate_change_nb); + if (clk_rtm->clk_rate_trace_ops->gpu_clk_notifier_unregister) + clk_rtm->clk_rate_trace_ops->gpu_clk_notifier_unregister + (kbdev, clk_rtm->clks[i]->gpu_clk_handle, + &clk_rtm->clks[i]->clk_rate_change_nb); kfree(clk_rtm->clks[i]); } diff --git a/mali_kbase/backend/gpu/mali_kbase_devfreq.c b/mali_kbase/backend/gpu/mali_kbase_devfreq.c index 07767c2..9b82184 100644 --- a/mali_kbase/backend/gpu/mali_kbase_devfreq.c +++ b/mali_kbase/backend/gpu/mali_kbase_devfreq.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2014-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -643,7 +643,7 @@ int kbase_devfreq_init(struct kbase_device *kbdev) /* Record the maximum frequency possible */ kbdev->gpu_props.props.core_props.gpu_freq_khz_max = dp->freq_table[0] / 1000; - }; + } err = kbase_devfreq_init_core_mask_table(kbdev); if (err) { diff --git a/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c b/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c index 4254a64..7542209 100644 --- a/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2014-2020 ARM Limited. 
All rights reserved. + * (C) COPYRIGHT 2014-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -121,6 +121,32 @@ int kbase_backend_gpuprops_get(struct kbase_device *kbdev, return -EIO; } +int kbase_backend_gpuprops_get_curr_config(struct kbase_device *kbdev, + struct kbase_current_config_regdump *curr_config_regdump) +{ + if (WARN_ON(!kbdev) || WARN_ON(!curr_config_regdump)) + return -EINVAL; + + curr_config_regdump->mem_features = kbase_reg_read(kbdev, + GPU_CONTROL_REG(MEM_FEATURES)); + + curr_config_regdump->shader_present_lo = kbase_reg_read(kbdev, + GPU_CONTROL_REG(SHADER_PRESENT_LO)); + curr_config_regdump->shader_present_hi = kbase_reg_read(kbdev, + GPU_CONTROL_REG(SHADER_PRESENT_HI)); + + curr_config_regdump->l2_present_lo = kbase_reg_read(kbdev, + GPU_CONTROL_REG(L2_PRESENT_LO)); + curr_config_regdump->l2_present_hi = kbase_reg_read(kbdev, + GPU_CONTROL_REG(L2_PRESENT_HI)); + + if (WARN_ON(kbase_is_gpu_removed(kbdev))) + return -EIO; + + return 0; + +} + int kbase_backend_gpuprops_get_features(struct kbase_device *kbdev, struct kbase_gpuprops_regdump *regdump) { @@ -156,11 +182,15 @@ int kbase_backend_gpuprops_get_l2_features(struct kbase_device *kbdev, if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_L2_CONFIG)) { u32 l2_features = kbase_reg_read(kbdev, GPU_CONTROL_REG(L2_FEATURES)); + u32 l2_config = + kbase_reg_read(kbdev, GPU_CONTROL_REG(L2_CONFIG)); + if (kbase_is_gpu_removed(kbdev)) return -EIO; regdump->l2_features = l2_features; + regdump->l2_config = l2_config; } return 0; diff --git a/mali_kbase/backend/gpu/mali_kbase_instr_backend.c b/mali_kbase/backend/gpu/mali_kbase_instr_backend.c index 9cc425e..6868dc3 100644 --- a/mali_kbase/backend/gpu/mali_kbase_instr_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_instr_backend.c @@ -107,7 +107,7 @@ int kbase_instr_hwcnt_enable_internal(struct kbase_device *kbdev, err = 0; - dev_dbg(kbdev->dev, "HW counters dumping set-up for context %p", kctx); + dev_dbg(kbdev->dev, "HW counters dumping set-up for context %pK", kctx); return err; out_err: return err; @@ -167,7 +167,7 @@ int kbase_instr_hwcnt_disable_internal(struct kbase_context *kctx) spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); - dev_dbg(kbdev->dev, "HW counters dumping disabled for context %p", + dev_dbg(kbdev->dev, "HW counters dumping disabled for context %pK", kctx); err = 0; @@ -214,7 +214,7 @@ int kbase_instr_hwcnt_request_dump(struct kbase_context *kctx) kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), GPU_COMMAND_PRFCNT_SAMPLE); - dev_dbg(kbdev->dev, "HW counters dumping done for context %p", kctx); + dev_dbg(kbdev->dev, "HW counters dumping done for context %pK", kctx); err = 0; @@ -325,7 +325,7 @@ KBASE_EXPORT_SYMBOL(kbase_instr_hwcnt_clear); int kbase_instr_backend_init(struct kbase_device *kbdev) { - int ret = 0; + spin_lock_init(&kbdev->hwcnt.lock); kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_DISABLED; @@ -344,12 +344,12 @@ int kbase_instr_backend_init(struct kbase_device *kbdev) kbdev->hwcnt.backend.override_counter_set = KBASE_HWCNT_SET_PRIMARY; #endif #endif - return ret; + return 0; } void kbase_instr_backend_term(struct kbase_device *kbdev) { - (void)kbdev; + CSTD_UNUSED(kbdev); } #ifdef CONFIG_MALI_PRFCNT_SET_SELECT_VIA_DEBUG_FS diff --git a/mali_kbase/backend/gpu/mali_kbase_instr_defs.h b/mali_kbase/backend/gpu/mali_kbase_instr_defs.h index 
39b009d..05d5193 100644 --- a/mali_kbase/backend/gpu/mali_kbase_instr_defs.h +++ b/mali_kbase/backend/gpu/mali_kbase_instr_defs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2014, 2016, 2018, 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014, 2016, 2018-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_hw.c b/mali_kbase/backend/gpu/mali_kbase_jm_hw.c index 7cfca97..e84f3a9 100644 --- a/mali_kbase/backend/gpu/mali_kbase_jm_hw.c +++ b/mali_kbase/backend/gpu/mali_kbase_jm_hw.c @@ -40,10 +40,12 @@ #include <mali_kbase_regs_history_debugfs.h> static void kbasep_try_reset_gpu_early_locked(struct kbase_device *kbdev); +static u64 kbasep_apply_limited_core_mask(const struct kbase_device *kbdev, + const u64 affinity, const u64 limited_core_mask); static u64 kbase_job_write_affinity(struct kbase_device *kbdev, base_jd_core_req core_req, - int js) + int js, const u64 limited_core_mask) { u64 affinity; @@ -72,14 +74,21 @@ static u64 kbase_job_write_affinity(struct kbase_device *kbdev, */ if (js == 2 && num_core_groups > 1) affinity &= coherency_info->group[1].core_mask; - else + else if (num_core_groups > 1) affinity &= coherency_info->group[0].core_mask; + else + affinity &= kbdev->gpu_props.curr_config.shader_present; } else { /* Use all cores */ affinity = kbdev->pm.backend.shaders_avail & kbdev->pm.debug_core_mask[js]; } + if (core_req & BASE_JD_REQ_LIMITED_CORE_MASK) { + /* Limiting affinity due to BASE_JD_REQ_LIMITED_CORE_MASK by applying the limited core mask. */ + affinity = kbasep_apply_limited_core_mask(kbdev, affinity, limited_core_mask); + } + if (unlikely(!affinity)) { #ifdef CONFIG_MALI_DEBUG u64 shaders_ready = @@ -89,6 +98,16 @@ static u64 kbase_job_write_affinity(struct kbase_device *kbdev, #endif affinity = kbdev->pm.backend.shaders_avail; + + if (core_req & BASE_JD_REQ_LIMITED_CORE_MASK) { + /* Limiting affinity again to make sure it only enables shader cores with backed TLS memory. 
*/ + affinity = kbasep_apply_limited_core_mask(kbdev, affinity, limited_core_mask); + +#ifdef CONFIG_MALI_DEBUG + /* affinity should never be 0 */ + WARN_ON(!affinity); +#endif + } } kbase_reg_write(kbdev, JOB_SLOT_REG(js, JS_AFFINITY_NEXT_LO), @@ -169,7 +188,7 @@ static u64 select_job_chain(struct kbase_jd_atom *katom) } dev_dbg(kctx->kbdev->dev, - "Selected job chain 0x%llx for end atom %p in state %d\n", + "Selected job chain 0x%llx for end atom %pK in state %d\n", jc, (void *)katom, (int)rp->state); katom->jc = jc; @@ -193,7 +212,7 @@ void kbase_job_hw_submit(struct kbase_device *kbdev, /* Command register must be available */ KBASE_DEBUG_ASSERT(kbasep_jm_is_js_free(kbdev, js, kctx)); - dev_dbg(kctx->kbdev->dev, "Write JS_HEAD_NEXT 0x%llx for atom %p\n", + dev_dbg(kctx->kbdev->dev, "Write JS_HEAD_NEXT 0x%llx for atom %pK\n", jc_head, (void *)katom); kbase_reg_write(kbdev, JOB_SLOT_REG(js, JS_HEAD_NEXT_LO), @@ -201,7 +220,8 @@ void kbase_job_hw_submit(struct kbase_device *kbdev, kbase_reg_write(kbdev, JOB_SLOT_REG(js, JS_HEAD_NEXT_HI), jc_head >> 32); - affinity = kbase_job_write_affinity(kbdev, katom->core_req, js); + affinity = kbase_job_write_affinity(kbdev, katom->core_req, js, + kctx->limited_core_mask); /* start MMU, medium priority, cache clean/flush on end, clean/flush on * start @@ -257,7 +277,7 @@ void kbase_job_hw_submit(struct kbase_device *kbdev, katom->start_timestamp = ktime_get(); /* GO ! */ - dev_dbg(kbdev->dev, "JS: Submitting atom %p from ctx %p to js[%d] with head=0x%llx", + dev_dbg(kbdev->dev, "JS: Submitting atom %pK from ctx %pK to js[%d] with head=0x%llx", katom, kctx, js, jc_head); KBASE_KTRACE_ADD_JM_SLOT_INFO(kbdev, JM_SUBMIT, kctx, katom, jc_head, js, @@ -431,7 +451,9 @@ void kbase_job_done(struct kbase_device *kbdev, u32 done) */ if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_TTRX_3076)) { if (completion_code == BASE_JD_EVENT_JOB_BUS_FAULT) { - if (kbase_prepare_to_reset_gpu_locked(kbdev)) + if (kbase_prepare_to_reset_gpu_locked( + kbdev, + RESET_FLAGS_NONE)) kbase_reset_gpu_locked(kbdev); } } @@ -789,7 +811,7 @@ static int softstop_start_rp_nolock( if (!(katom->core_req & BASE_JD_REQ_START_RENDERPASS)) { dev_dbg(kctx->kbdev->dev, - "Atom %p on job slot is not start RP\n", (void *)katom); + "Atom %pK on job slot is not start RP\n", (void *)katom); return -EPERM; } @@ -802,13 +824,13 @@ static int softstop_start_rp_nolock( rp->state != KBASE_JD_RP_RETRY)) return -EINVAL; - dev_dbg(kctx->kbdev->dev, "OOM in state %d with region %p\n", + dev_dbg(kctx->kbdev->dev, "OOM in state %d with region %pK\n", (int)rp->state, (void *)reg); if (WARN_ON(katom != rp->start_katom)) return -EINVAL; - dev_dbg(kctx->kbdev->dev, "Adding region %p to list %p\n", + dev_dbg(kctx->kbdev->dev, "Adding region %pK to list %pK\n", (void *)reg, (void *)&rp->oom_reg_list); list_move_tail(®->link, &rp->oom_reg_list); dev_dbg(kctx->kbdev->dev, "Added region to list\n"); @@ -853,7 +875,7 @@ void kbase_jm_wait_for_zero_jobs(struct kbase_context *kctx) if (timeout != 0) goto exit; - if (kbase_prepare_to_reset_gpu(kbdev)) { + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) { dev_err(kbdev->dev, "Issuing GPU soft-reset because jobs failed to be killed (within %d ms) as part of context termination (e.g. 
process exit)\n", ZAP_TIMEOUT); @@ -863,7 +885,7 @@ void kbase_jm_wait_for_zero_jobs(struct kbase_context *kctx) /* Wait for the reset to complete */ kbase_reset_gpu_wait(kbdev); exit: - dev_dbg(kbdev->dev, "Zap: Finished Context %p", kctx); + dev_dbg(kbdev->dev, "Zap: Finished Context %pK", kctx); /* Ensure that the signallers of the waitqs have finished */ mutex_lock(&kctx->jctx.lock); @@ -924,7 +946,7 @@ KBASE_EXPORT_TEST_API(kbase_job_slot_term); void kbase_job_slot_softstop_swflags(struct kbase_device *kbdev, int js, struct kbase_jd_atom *target_katom, u32 sw_flags) { - dev_dbg(kbdev->dev, "Soft-stop atom %p with flags 0x%x (s:%d)\n", + dev_dbg(kbdev->dev, "Soft-stop atom %pK with flags 0x%x (s:%d)\n", target_katom, sw_flags, js); KBASE_DEBUG_ASSERT(!(sw_flags & JS_COMMAND_MASK)); @@ -1337,6 +1359,7 @@ static void kbasep_try_reset_gpu_early(struct kbase_device *kbdev) /** * kbase_prepare_to_reset_gpu_locked - Prepare for resetting the GPU * @kbdev: kbase device + * @flags: Bitfield indicating impact of reset (see flag defines) * * This function just soft-stops all the slots to ensure that as many jobs as * possible are saved. @@ -1347,10 +1370,12 @@ static void kbasep_try_reset_gpu_early(struct kbase_device *kbdev) * false - Another thread is performing a reset, kbase_reset_gpu should * not be called. */ -bool kbase_prepare_to_reset_gpu_locked(struct kbase_device *kbdev) +bool kbase_prepare_to_reset_gpu_locked(struct kbase_device *kbdev, + unsigned int flags) { int i; + CSTD_UNUSED(flags); KBASE_DEBUG_ASSERT(kbdev); #ifdef CONFIG_MALI_ARBITER_SUPPORT @@ -1378,14 +1403,14 @@ bool kbase_prepare_to_reset_gpu_locked(struct kbase_device *kbdev) return true; } -bool kbase_prepare_to_reset_gpu(struct kbase_device *kbdev) +bool kbase_prepare_to_reset_gpu(struct kbase_device *kbdev, unsigned int flags) { - unsigned long flags; + unsigned long lock_flags; bool ret; - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - ret = kbase_prepare_to_reset_gpu_locked(kbdev); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + spin_lock_irqsave(&kbdev->hwaccess_lock, lock_flags); + ret = kbase_prepare_to_reset_gpu_locked(kbdev, flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, lock_flags); return ret; } @@ -1506,3 +1531,21 @@ void kbase_reset_gpu_term(struct kbase_device *kbdev) { destroy_workqueue(kbdev->hwaccess.backend.reset_workq); } + +static u64 kbasep_apply_limited_core_mask(const struct kbase_device *kbdev, + const u64 affinity, const u64 limited_core_mask) +{ + const u64 result = affinity & limited_core_mask; + +#ifdef CONFIG_MALI_DEBUG + dev_dbg(kbdev->dev, + "Limiting affinity due to BASE_JD_REQ_LIMITED_CORE_MASK from 0x%lx to 0x%lx (mask is 0x%lx)\n", + (unsigned long int)affinity, + (unsigned long int)result, + (unsigned long int)limited_core_mask); +#else + CSTD_UNUSED(kbdev); +#endif + + return result; +} diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c index 7104658..5fdf9b6 100644 --- a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c +++ b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2014-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2021 ARM Limited. All rights reserved. 
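[Editor's note] The mali_kbase_jm_hw.c hunk above introduces BASE_JD_REQ_LIMITED_CORE_MASK handling: the computed job affinity is ANDed with the context's limited core mask, both on the normal path and on the fallback path that uses shaders_avail. The masking itself is a plain bitwise AND; the sketch below is a minimal, self-contained illustration of that behaviour under stated assumptions (the function and variable names are illustrative only, not the driver's API).

#include <stdint.h>
#include <stdio.h>

/* Keep only the shader cores that appear in both the computed affinity and
 * the per-context limited core mask (cores with backed TLS memory). */
static uint64_t apply_limited_core_mask(uint64_t affinity, uint64_t limited_core_mask)
{
	return affinity & limited_core_mask;
}

int main(void)
{
	uint64_t affinity = 0xffull;   /* e.g. shaders_avail & debug_core_mask */
	uint64_t limited  = 0x0full;   /* e.g. the context's limited core mask */
	uint64_t result   = apply_limited_core_mask(affinity, limited);

	/* As in the fallback path above, a zero result would indicate a
	 * misconfiguration and trips WARN_ON() in a CONFIG_MALI_DEBUG build. */
	printf("affinity 0x%llx -> 0x%llx\n",
	       (unsigned long long)affinity, (unsigned long long)result);
	return result ? 0 : 1;
}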
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -1024,7 +1024,7 @@ void kbase_backend_run_atom(struct kbase_device *kbdev, struct kbase_jd_atom *katom) { lockdep_assert_held(&kbdev->hwaccess_lock); - dev_dbg(kbdev->dev, "Backend running atom %p\n", (void *)katom); + dev_dbg(kbdev->dev, "Backend running atom %pK\n", (void *)katom); kbase_gpu_enqueue_atom(kbdev, katom); kbase_backend_slot_update(kbdev); @@ -1085,7 +1085,7 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, struct kbase_context *kctx = katom->kctx; dev_dbg(kbdev->dev, - "Atom %p completed on hw with code 0x%x and job_tail 0x%llx (s:%d)\n", + "Atom %pK completed on hw with code 0x%x and job_tail 0x%llx (s:%d)\n", (void *)katom, completion_code, job_tail, js); lockdep_assert_held(&kbdev->hwaccess_lock); @@ -1205,7 +1205,7 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, if (job_tail != 0 && job_tail != katom->jc) { /* Some of the job has been executed */ dev_dbg(kbdev->dev, - "Update job chain address of atom %p to resume from 0x%llx\n", + "Update job chain address of atom %pK to resume from 0x%llx\n", (void *)katom, job_tail); katom->jc = job_tail; @@ -1266,7 +1266,7 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, if (katom) { dev_dbg(kbdev->dev, - "Cross-slot dependency %p has become runnable.\n", + "Cross-slot dependency %pK has become runnable.\n", (void *)katom); /* Check if there are lower priority jobs to soft stop */ @@ -1666,7 +1666,7 @@ void kbase_gpu_dump_slots(struct kbase_device *kbdev) if (katom) dev_info(kbdev->dev, - " js%d idx%d : katom=%p gpu_rb_state=%d\n", + " js%d idx%d : katom=%pK gpu_rb_state=%d\n", js, idx, katom, katom->gpu_rb_state); else dev_info(kbdev->dev, " js%d idx%d : empty\n", diff --git a/mali_kbase/backend/gpu/mali_kbase_js_backend.c b/mali_kbase/backend/gpu/mali_kbase_js_backend.c index d28e7b0..cab222d 100644 --- a/mali_kbase/backend/gpu/mali_kbase_js_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_js_backend.c @@ -257,7 +257,7 @@ static enum hrtimer_restart timer_callback(struct hrtimer *timer) if (reset_needed) { dev_err(kbdev->dev, "JS: Job has been on the GPU for too long (JS_RESET_TICKS_SS/DUMPING timeout hit). Issuing GPU soft-reset to resolve."); - if (kbase_prepare_to_reset_gpu_locked(kbdev)) + if (kbase_prepare_to_reset_gpu_locked(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu_locked(kbdev); } /* the timer is re-issued if there is contexts in the run-pool */ diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c index 921849b..0cfa93c 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2010-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -498,7 +498,15 @@ static void kbase_pm_hwcnt_disable_worker(struct work_struct *data) /* PM state was updated while we were doing the disable, * so we need to undo the disable we just performed. 
*/ +#if MALI_USE_CSF + unsigned long lock_flags; + + kbase_csf_scheduler_spin_lock(kbdev, &lock_flags); +#endif kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); +#if MALI_USE_CSF + kbase_csf_scheduler_spin_unlock(kbdev, lock_flags); +#endif } spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); @@ -664,10 +672,15 @@ void kbase_hwaccess_pm_term(struct kbase_device *kbdev) if (kbdev->pm.backend.hwcnt_disabled) { unsigned long flags; - +#if MALI_USE_CSF + kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + kbase_csf_scheduler_spin_unlock(kbdev, flags); +#else spin_lock_irqsave(&kbdev->hwaccess_lock, flags); kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +#endif } /* Free any resources the policy allocated */ diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_ca.c b/mali_kbase/backend/gpu/mali_kbase_pm_ca.c index c546766..3cf7608 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_ca.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_ca.c @@ -102,10 +102,18 @@ u64 kbase_pm_ca_get_core_mask(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->hwaccess_lock); #ifdef CONFIG_MALI_DEVFREQ - return kbdev->pm.backend.ca_cores_enabled & debug_core_mask; + /* + * Although in the init we let the pm_backend->ca_cores_enabled to be + * the max config (it uses the base_gpu_props), at this function we need + * to limit it to be a subgroup of the curr config, otherwise the + * shaders state machine on the PM does not evolve. + */ + return kbdev->gpu_props.curr_config.shader_present & + kbdev->pm.backend.ca_cores_enabled & + debug_core_mask; #else - return kbdev->gpu_props.props.raw_props.shader_present & - debug_core_mask; + return kbdev->gpu_props.curr_config.shader_present & + debug_core_mask; #endif } diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_defs.h b/mali_kbase/backend/gpu/mali_kbase_pm_defs.h index 1b4e141..0687a43 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_defs.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_defs.h @@ -61,24 +61,9 @@ enum kbase_pm_core_type { KBASE_PM_CORE_STACK = STACK_PRESENT_LO }; -/** +/* * enum kbase_l2_core_state - The states used for the L2 cache & tiler power * state machine. - * - * @KBASE_L2_OFF: The L2 cache and tiler are off - * @KBASE_L2_PEND_ON: The L2 cache and tiler are powering on - * @KBASE_L2_RESTORE_CLOCKS: The GPU clock is restored. Conditionally used. - * @KBASE_L2_ON_HWCNT_ENABLE: The L2 cache and tiler are on, and hwcnt is being - * enabled - * @KBASE_L2_ON: The L2 cache and tiler are on, and hwcnt is enabled - * @KBASE_L2_ON_HWCNT_DISABLE: The L2 cache and tiler are on, and hwcnt is being - * disabled - * @KBASE_L2_SLOW_DOWN_CLOCKS: The GPU clock is set to appropriate or lowest - * clock. Conditionally used. - * @KBASE_L2_POWER_DOWN: The L2 cache and tiler are about to be powered off - * @KBASE_L2_PEND_OFF: The L2 cache and tiler are powering off - * @KBASE_L2_RESET_WAIT: The GPU is resetting, L2 cache and tiler power state - * are unknown */ enum kbase_l2_core_state { #define KBASEP_L2_STATE(n) KBASE_L2_ ## n, @@ -87,26 +72,8 @@ enum kbase_l2_core_state { }; #if MALI_USE_CSF -/** +/* * enum kbase_mcu_state - The states used for the MCU state machine. - * - * @KBASE_MCU_OFF: The MCU is powered off. - * @KBASE_MCU_PEND_ON_RELOAD: The warm boot of MCU or cold boot of MCU (with - * firmware reloading) is in progress. - * @KBASE_MCU_ON_GLB_REINIT_PEND: The MCU is enabled and Global configuration - * requests have been sent to the firmware. 
- * @KBASE_MCU_ON_HWCNT_ENABLE: The Global requests have completed and MCU is - * now ready for use and hwcnt is being enabled. - * @KBASE_MCU_ON: The MCU is active and hwcnt has been enabled. - * @KBASE_MCU_ON_CORE_MASK_UPDATE_PEND: The MCU is active and mask of enabled - * shader cores is being updated. - * @KBASE_MCU_ON_HWCNT_DISABLE: The MCU is on and hwcnt is being disabled. - * @KBASE_MCU_ON_HALT: The MCU is on and hwcnt has been disabled, - * MCU halt would be triggered. - * @KBASE_MCU_ON_PEND_HALT: MCU halt in progress, confirmation pending. - * @KBASE_MCU_POWER_DOWN: MCU halted operations, pending being disabled. - * @KBASE_MCU_PEND_OFF: MCU is being disabled, pending on powering off. - * @KBASE_MCU_RESET_WAIT: The GPU is resetting, MCU state is unknown. */ enum kbase_mcu_state { #define KBASEP_MCU_STATE(n) KBASE_MCU_ ## n, @@ -115,45 +82,8 @@ enum kbase_mcu_state { }; #endif -/** +/* * enum kbase_shader_core_state - The states used for the shaders' state machine. - * - * @KBASE_SHADERS_OFF_CORESTACK_OFF: The shaders and core stacks are off - * @KBASE_SHADERS_OFF_CORESTACK_PEND_ON: The shaders are off, core stacks have - * been requested to power on and hwcnt - * is being disabled - * @KBASE_SHADERS_PEND_ON_CORESTACK_ON: Core stacks are on, shaders have been - * requested to power on. Or after doing - * partial shader on/off, checking whether - * it's the desired state. - * @KBASE_SHADERS_ON_CORESTACK_ON: The shaders and core stacks are on, and hwcnt - * already enabled. - * @KBASE_SHADERS_ON_CORESTACK_ON_RECHECK: The shaders and core stacks - * are on, hwcnt disabled, and checks - * to powering down or re-enabling - * hwcnt. - * @KBASE_SHADERS_WAIT_OFF_CORESTACK_ON: The shaders have been requested to - * power off, but they remain on for the - * duration of the hysteresis timer - * @KBASE_SHADERS_WAIT_GPU_IDLE: The shaders partial poweroff needs to reach - * a state where jobs on the GPU are finished - * including jobs currently running and in the - * GPU queue because of GPU2017-861 - * @KBASE_SHADERS_WAIT_FINISHED_CORESTACK_ON: The hysteresis timer has expired - * @KBASE_SHADERS_L2_FLUSHING_CORESTACK_ON: The core stacks are on and the - * level 2 cache is being flushed. - * @KBASE_SHADERS_READY_OFF_CORESTACK_ON: The core stacks are on and the shaders - * are ready to be powered off. - * @KBASE_SHADERS_PEND_OFF_CORESTACK_ON: The core stacks are on, and the shaders - * have been requested to power off - * @KBASE_SHADERS_OFF_CORESTACK_PEND_OFF: The shaders are off, and the core stacks - * have been requested to power off - * @KBASE_SHADERS_OFF_CORESTACK_OFF_TIMER_PEND_OFF: Shaders and corestacks are - * off, but the tick timer - * cancellation is still - * pending. 
- * @KBASE_SHADERS_RESET_WAIT: The GPU is resetting, shader and core stack power - * states are unknown */ enum kbase_shader_core_state { #define KBASEP_SHADER_STATE(n) KBASE_SHADERS_ ## n, diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c index da32510..a2f96b5 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c @@ -407,9 +407,9 @@ u64 kbase_pm_get_present_cores(struct kbase_device *kbdev, switch (type) { case KBASE_PM_CORE_L2: - return kbdev->gpu_props.props.raw_props.l2_present; + return kbdev->gpu_props.curr_config.l2_present; case KBASE_PM_CORE_SHADER: - return kbdev->gpu_props.props.raw_props.shader_present; + return kbdev->gpu_props.curr_config.shader_present; case KBASE_PM_CORE_TILER: return kbdev->gpu_props.props.raw_props.tiler_present; case KBASE_PM_CORE_STACK: @@ -695,8 +695,12 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev) case KBASE_MCU_ON_HWCNT_ENABLE: backend->hwcnt_desired = true; if (backend->hwcnt_disabled) { + unsigned long flags; + + kbase_csf_scheduler_spin_lock(kbdev, &flags); kbase_hwcnt_context_enable( kbdev->hwcnt_gpu_ctx); + kbase_csf_scheduler_spin_unlock(kbdev, flags); backend->hwcnt_disabled = false; } backend->mcu_state = KBASE_MCU_ON; @@ -851,7 +855,7 @@ static const char *kbase_l2_core_state_to_string(enum kbase_l2_core_state state) static int kbase_pm_l2_update_state(struct kbase_device *kbdev) { struct kbase_pm_backend_data *backend = &kbdev->pm.backend; - u64 l2_present = kbdev->gpu_props.props.raw_props.l2_present; + u64 l2_present = kbdev->gpu_props.curr_config.l2_present; #if !MALI_USE_CSF u64 tiler_present = kbdev->gpu_props.props.raw_props.tiler_present; #endif @@ -1255,7 +1259,6 @@ static int kbase_pm_shaders_update_state(struct kbase_device *kbdev) &kbdev->pm.backend.shader_tick_timer; enum kbase_shader_core_state prev_state; u64 stacks_avail = 0; - int err = 0; lockdep_assert_held(&kbdev->hwaccess_lock); @@ -1350,8 +1353,18 @@ static int kbase_pm_shaders_update_state(struct kbase_device *kbdev) backend->pm_shaders_core_mask = shaders_ready; backend->hwcnt_desired = true; if (backend->hwcnt_disabled) { +#if MALI_USE_CSF + unsigned long flags; + + kbase_csf_scheduler_spin_lock(kbdev, + &flags); +#endif kbase_hwcnt_context_enable( kbdev->hwcnt_gpu_ctx); +#if MALI_USE_CSF + kbase_csf_scheduler_spin_unlock(kbdev, + flags); +#endif backend->hwcnt_disabled = false; } @@ -1531,8 +1544,18 @@ static int kbase_pm_shaders_update_state(struct kbase_device *kbdev) backend->pm_shaders_core_mask = 0; backend->hwcnt_desired = true; if (backend->hwcnt_disabled) { +#if MALI_USE_CSF + unsigned long flags; + + kbase_csf_scheduler_spin_lock(kbdev, + &flags); +#endif kbase_hwcnt_context_enable( kbdev->hwcnt_gpu_ctx); +#if MALI_USE_CSF + kbase_csf_scheduler_spin_unlock(kbdev, + flags); +#endif backend->hwcnt_disabled = false; } backend->shaders_state = KBASE_SHADERS_OFF_CORESTACK_OFF_TIMER_PEND_OFF; @@ -1559,7 +1582,7 @@ static int kbase_pm_shaders_update_state(struct kbase_device *kbdev) } while (backend->shaders_state != prev_state); - return err; + return 0; } #endif @@ -1883,17 +1906,9 @@ static void kbase_pm_timed_out(struct kbase_device *kbdev) kbase_reg_read(kbdev, GPU_CONTROL_REG( L2_PWRTRANS_LO))); -#if MALI_USE_CSF - /* PM timeout probably means hardware counters will stop working. 
- * Put the backend into the unrecoverable error state to cause - * current and subsequent counter operations to immediately - * fail, avoiding the risk of a hang. - */ - kbase_hwcnt_backend_csf_on_unrecoverable_error(&kbdev->hwcnt_gpu_iface); -#endif - dev_err(kbdev->dev, "Sending reset to GPU - all running jobs will be lost\n"); - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, + RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); } @@ -2105,6 +2120,13 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) PM_NO_RESET); } } + /* + * This point means that the GPU trasitioned to ON. So there is a chance + * that a repartitioning occurred. In this case the current config + * should be read again. + */ + kbase_gpuprops_get_curr_config_props(kbdev, + &kbdev->gpu_props.curr_config); #endif /* CONFIG_MALI_ARBITER_SUPPORT */ mutex_lock(&kbdev->mmu_hw_mutex); @@ -2253,7 +2275,7 @@ static enum hrtimer_restart kbasep_reset_timeout(struct hrtimer *timer) struct kbasep_reset_timeout_data *rtdata = container_of(timer, struct kbasep_reset_timeout_data, timer); - rtdata->timed_out = 1; + rtdata->timed_out = true; /* Set the wait queue to wake up kbase_pm_init_hw even though the reset * hasn't completed @@ -2263,14 +2285,13 @@ static enum hrtimer_restart kbasep_reset_timeout(struct hrtimer *timer) return HRTIMER_NORESTART; } -static int kbase_set_jm_quirks(struct kbase_device *kbdev, const u32 prod_id) +static int kbase_set_gpu_quirks(struct kbase_device *kbdev, const u32 prod_id) { #if MALI_USE_CSF - kbdev->hw_quirks_jm = kbase_reg_read(kbdev, - GPU_CONTROL_REG(CSF_CONFIG)); + kbdev->hw_quirks_gpu = + kbase_reg_read(kbdev, GPU_CONTROL_REG(CSF_CONFIG)); #else - u32 hw_quirks_jm = kbase_reg_read(kbdev, - GPU_CONTROL_REG(JM_CONFIG)); + u32 hw_quirks_gpu = kbase_reg_read(kbdev, GPU_CONTROL_REG(JM_CONFIG)); if (GPU_ID2_MODEL_MATCH_VALUE(prod_id) == GPU_ID2_PRODUCT_TMIX) { /* Only for tMIx */ @@ -2284,39 +2305,38 @@ static int kbase_set_jm_quirks(struct kbase_device *kbdev, const u32 prod_id) */ if (coherency_features == COHERENCY_FEATURE_BIT(COHERENCY_ACE)) { - hw_quirks_jm |= (COHERENCY_ACE_LITE | - COHERENCY_ACE) << - JM_FORCE_COHERENCY_FEATURES_SHIFT; + hw_quirks_gpu |= (COHERENCY_ACE_LITE | COHERENCY_ACE) + << JM_FORCE_COHERENCY_FEATURES_SHIFT; } } if (kbase_is_gpu_removed(kbdev)) return -EIO; - kbdev->hw_quirks_jm = hw_quirks_jm; + kbdev->hw_quirks_gpu = hw_quirks_gpu; #endif /* !MALI_USE_CSF */ if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_IDVS_GROUP_SIZE)) { int default_idvs_group_size = 0xF; - u32 tmp; + u32 group_size = 0; - if (of_property_read_u32(kbdev->dev->of_node, - "idvs-group-size", &tmp)) - tmp = default_idvs_group_size; + if (of_property_read_u32(kbdev->dev->of_node, "idvs-group-size", + &group_size)) + group_size = default_idvs_group_size; - if (tmp > IDVS_GROUP_MAX_SIZE) { + if (group_size > IDVS_GROUP_MAX_SIZE) { dev_err(kbdev->dev, "idvs-group-size of %d is too large. 
Maximum value is %d", - tmp, IDVS_GROUP_MAX_SIZE); - tmp = default_idvs_group_size; + group_size, IDVS_GROUP_MAX_SIZE); + group_size = default_idvs_group_size; } - kbdev->hw_quirks_jm |= tmp << IDVS_GROUP_SIZE_SHIFT; + kbdev->hw_quirks_gpu |= group_size << IDVS_GROUP_SIZE_SHIFT; } #define MANUAL_POWER_CONTROL ((u32)(1 << 8)) if (corestack_driver_control) - kbdev->hw_quirks_jm |= MANUAL_POWER_CONTROL; + kbdev->hw_quirks_gpu |= MANUAL_POWER_CONTROL; return 0; } @@ -2370,18 +2390,17 @@ static int kbase_pm_hw_issues_detect(struct kbase_device *kbdev) GPU_ID_VERSION_PRODUCT_ID_SHIFT; int error = 0; - kbdev->hw_quirks_jm = 0; + kbdev->hw_quirks_gpu = 0; kbdev->hw_quirks_sc = 0; kbdev->hw_quirks_tiler = 0; kbdev->hw_quirks_mmu = 0; - if (!of_property_read_u32(np, "quirks_jm", - &kbdev->hw_quirks_jm)) { + if (!of_property_read_u32(np, "quirks_gpu", &kbdev->hw_quirks_gpu)) { dev_info(kbdev->dev, - "Found quirks_jm = [0x%x] in Devicetree\n", - kbdev->hw_quirks_jm); + "Found quirks_gpu = [0x%x] in Devicetree\n", + kbdev->hw_quirks_gpu); } else { - error = kbase_set_jm_quirks(kbdev, prod_id); + error = kbase_set_gpu_quirks(kbdev, prod_id); if (error) return error; } @@ -2432,10 +2451,10 @@ static void kbase_pm_hw_issues_apply(struct kbase_device *kbdev) kbdev->hw_quirks_mmu); #if MALI_USE_CSF kbase_reg_write(kbdev, GPU_CONTROL_REG(CSF_CONFIG), - kbdev->hw_quirks_jm); + kbdev->hw_quirks_gpu); #else kbase_reg_write(kbdev, GPU_CONTROL_REG(JM_CONFIG), - kbdev->hw_quirks_jm); + kbdev->hw_quirks_gpu); #endif } @@ -2466,6 +2485,7 @@ void kbase_pm_cache_snoop_disable(struct kbase_device *kbdev) } } +#if !MALI_USE_CSF static void reenable_protected_mode_hwcnt(struct kbase_device *kbdev) { unsigned long irq_flags; @@ -2478,6 +2498,7 @@ static void reenable_protected_mode_hwcnt(struct kbase_device *kbdev) } spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags); } +#endif static int kbase_pm_do_reset(struct kbase_device *kbdev) { @@ -2504,7 +2525,7 @@ static int kbase_pm_do_reset(struct kbase_device *kbdev) /* Initialize a structure for tracking the status of the reset */ rtdata.kbdev = kbdev; - rtdata.timed_out = 0; + rtdata.timed_out = false; /* Create a timer to use as a timeout on the reset */ hrtimer_init_on_stack(&rtdata.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -2516,7 +2537,7 @@ static int kbase_pm_do_reset(struct kbase_device *kbdev) /* Wait for the RESET_COMPLETED interrupt to be raised */ kbase_pm_wait_for_reset(kbdev); - if (rtdata.timed_out == 0) { + if (!rtdata.timed_out) { /* GPU has been reset */ hrtimer_cancel(&rtdata.timer); destroy_hrtimer_on_stack(&rtdata.timer); @@ -2556,7 +2577,7 @@ static int kbase_pm_do_reset(struct kbase_device *kbdev) GPU_COMMAND_HARD_RESET); /* Restart the timer to wait for the hard reset to complete */ - rtdata.timed_out = 0; + rtdata.timed_out = false; hrtimer_start(&rtdata.timer, HR_TIMER_DELAY_MSEC(RESET_TIMEOUT), HRTIMER_MODE_REL); @@ -2564,7 +2585,7 @@ static int kbase_pm_do_reset(struct kbase_device *kbdev) /* Wait for the RESET_COMPLETED interrupt to be raised */ kbase_pm_wait_for_reset(kbdev); - if (rtdata.timed_out == 0) { + if (!rtdata.timed_out) { /* GPU has been reset */ hrtimer_cancel(&rtdata.timer); destroy_hrtimer_on_stack(&rtdata.timer); @@ -2637,8 +2658,13 @@ int kbase_pm_init_hw(struct kbase_device *kbdev, unsigned int flags) spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags); #if MALI_USE_CSF if (kbdev->protected_mode) { + unsigned long flags; + kbase_ipa_control_protm_exited(kbdev); + + kbase_csf_scheduler_spin_lock(kbdev, &flags); 
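[Editor's note] Several hunks in this commit, including the one at this point, now bracket kbase_hwcnt_context_enable() with the CSF scheduler spinlock when MALI_USE_CSF is set, while JM builds keep the previous locking. Below is a hedged sketch of a helper that could factor out the repeated pattern; the helper name is illustrative, the calls are only those already visible in the hunks above, and callers would still hold whatever outer lock their site requires (e.g. hwaccess_lock).

static void kbasep_hwcnt_context_enable_locked(struct kbase_device *kbdev)
{
#if MALI_USE_CSF
	unsigned long flags;

	/* The CSF scheduler spinlock serialises the enable with the firmware
	 * interrupt handling that also runs under this lock (see the
	 * process_prfcnt_interrupts() changes later in this commit). */
	kbase_csf_scheduler_spin_lock(kbdev, &flags);
	kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx);
	kbase_csf_scheduler_spin_unlock(kbdev, flags);
#else
	kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx);
#endif
}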
kbase_hwcnt_backend_csf_protm_exited(&kbdev->hwcnt_gpu_iface); + kbase_csf_scheduler_spin_unlock(kbdev, flags); } #endif kbdev->protected_mode = false; @@ -2685,12 +2711,14 @@ int kbase_pm_init_hw(struct kbase_device *kbdev, unsigned int flags) kbase_pm_enable_interrupts(kbdev); exit: +#if !MALI_USE_CSF if (!kbdev->pm.backend.protected_entry_transition_override) { /* Re-enable GPU hardware counters if we're resetting from * protected mode. */ reenable_protected_mode_hwcnt(kbdev); } +#endif return err; } @@ -2726,8 +2754,9 @@ kbase_pm_request_gpu_cycle_counter_do_request(struct kbase_device *kbdev) /* This might happen after GPU reset. * Then counter needs to be kicked. */ - if (!(kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_STATUS)) & - GPU_STATUS_CYCLE_COUNT_ACTIVE)) { + if (!IS_ENABLED(CONFIG_MALI_NO_MALI) && + (!(kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_STATUS)) & + GPU_STATUS_CYCLE_COUNT_ACTIVE))) { kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), GPU_COMMAND_CYCLE_COUNT_START); } diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_internal.h b/mali_kbase/backend/gpu/mali_kbase_pm_internal.h index f6b8485..500578f 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_internal.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_internal.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2010-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -224,6 +224,7 @@ void kbase_pm_reset_done(struct kbase_device *kbdev); * * Return: 0 on success, error code on error */ +int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev); #else /** * kbase_pm_wait_for_desired_state - Wait for the desired power state to be @@ -247,8 +248,8 @@ void kbase_pm_reset_done(struct kbase_device *kbdev); * * Return: 0 on success, error code on error */ -#endif int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev); +#endif /** * kbase_pm_wait_for_l2_powered - Wait for the L2 cache to be powered on @@ -534,8 +535,22 @@ void kbase_pm_get_dvfs_metrics(struct kbase_device *kbdev, #ifdef CONFIG_MALI_MIDGARD_DVFS +#if MALI_USE_CSF +/** + * kbase_platform_dvfs_event - Report utilisation to DVFS code for CSF GPU + * + * Function provided by platform specific code when DVFS is enabled to allow + * the power management metrics system to report utilisation. + * + * @kbdev: The kbase device structure for the device (must be a + * valid pointer) + * @utilisation: The current calculated utilisation by the metrics system. + * Return: Returns 0 on failure and non zero on success. + */ +int kbase_platform_dvfs_event(struct kbase_device *kbdev, u32 utilisation); +#else /** - * kbase_platform_dvfs_event - Report utilisation to DVFS code + * kbase_platform_dvfs_event - Report utilisation to DVFS code for JM GPU * * Function provided by platform specific code when DVFS is enabled to allow * the power management metrics system to report utilisation. @@ -548,10 +563,6 @@ void kbase_pm_get_dvfs_metrics(struct kbase_device *kbdev, * group. * Return: Returns 0 on failure and non zero on success. 
*/ - -#if MALI_USE_CSF -int kbase_platform_dvfs_event(struct kbase_device *kbdev, u32 utilisation); -#else int kbase_platform_dvfs_event(struct kbase_device *kbdev, u32 utilisation, u32 util_gl_share, u32 util_cl_share[2]); #endif diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_l2_states.h b/mali_kbase/backend/gpu/mali_kbase_pm_l2_states.h index b9bd364..d66b928 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_l2_states.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_l2_states.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2018-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -24,6 +24,19 @@ * The function-like macro KBASEP_L2_STATE() must be defined before including * this header file. This header file can be included multiple times in the * same compilation unit with different definitions of KBASEP_L2_STATE(). + * + * @OFF: The L2 cache and tiler are off + * @PEND_ON: The L2 cache and tiler are powering on + * @RESTORE_CLOCKS: The GPU clock is restored. Conditionally used. + * @ON_HWCNT_ENABLE: The L2 cache and tiler are on, and hwcnt is being enabled + * @ON: The L2 cache and tiler are on, and hwcnt is enabled + * @ON_HWCNT_DISABLE: The L2 cache and tiler are on, and hwcnt is being disabled + * @SLOW_DOWN_CLOCKS: The GPU clock is set to appropriate or lowest clock. + * Conditionally used. + * @POWER_DOWN: The L2 cache and tiler are about to be powered off + * @PEND_OFF: The L2 cache and tiler are powering off + * @RESET_WAIT: The GPU is resetting, L2 cache and tiler power state are + * unknown */ KBASEP_L2_STATE(OFF) KBASEP_L2_STATE(PEND_ON) diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h b/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h index c03adf3..eab30eb 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -24,6 +24,24 @@ * The function-like macro KBASEP_MCU_STATE() must be defined before including * this header file. This header file can be included multiple times in the * same compilation unit with different definitions of KBASEP_MCU_STATE(). + * + * @OFF: The MCU is powered off. + * @PEND_ON_RELOAD: The warm boot of MCU or cold boot of MCU (with + * firmware reloading) is in progress. + * @ON_GLB_REINIT_PEND: The MCU is enabled and Global configuration + * requests have been sent to the firmware. + * @ON_HWCNT_ENABLE: The Global requests have completed and MCU is now + * ready for use and hwcnt is being enabled. + * @ON: The MCU is active and hwcnt has been enabled. + * @ON_CORE_ATTR_UPDATE_PEND: The MCU is active and mask of enabled shader cores + * is being updated. + * @ON_HWCNT_DISABLE: The MCU is on and hwcnt is being disabled. + * @ON_HALT: The MCU is on and hwcnt has been disabled, MCU + * halt would be triggered. + * @ON_PEND_HALT: MCU halt in progress, confirmation pending. + * @POWER_DOWN: MCU halted operations, pending being disabled. + * @PEND_OFF: MCU is being disabled, pending on powering off. 
+ * @RESET_WAIT: The GPU is resetting, MCU state is unknown. */ KBASEP_MCU_STATE(OFF) KBASEP_MCU_STATE(PEND_ON_RELOAD) diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_metrics.c b/mali_kbase/backend/gpu/mali_kbase_pm_metrics.c index e5c7c71..769888f 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_metrics.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_metrics.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2011-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2011-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -360,9 +360,9 @@ KBASE_EXPORT_TEST_API(kbase_pm_get_dvfs_metrics); void kbase_pm_get_dvfs_action(struct kbase_device *kbdev) { int utilisation; - int busy; struct kbasep_pm_metrics *diff; #if !MALI_USE_CSF + int busy; int util_gl_share; int util_cl_share[2]; #endif @@ -377,9 +377,9 @@ void kbase_pm_get_dvfs_action(struct kbase_device *kbdev) utilisation = (100 * diff->time_busy) / max(diff->time_busy + diff->time_idle, 1u); +#if !MALI_USE_CSF busy = max(diff->busy_gl + diff->busy_cl[0] + diff->busy_cl[1], 1u); -#if !MALI_USE_CSF util_gl_share = (100 * diff->busy_gl) / busy; util_cl_share[0] = (100 * diff->busy_cl[0]) / busy; util_cl_share[1] = (100 * diff->busy_cl[1]) / busy; diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_policy.c b/mali_kbase/backend/gpu/mali_kbase_pm_policy.c index 97bcb44..5c2aa0c 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_policy.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_policy.c @@ -405,7 +405,7 @@ void kbase_pm_set_policy(struct kbase_device *kbdev, /* Reverse the suspension done */ if (reset_gpu) { dev_warn(kbdev->dev, "Resorting to GPU reset for policy change\n"); - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); kbase_reset_gpu_wait(kbdev); } else if (sched_suspend) diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_shader_states.h b/mali_kbase/backend/gpu/mali_kbase_pm_shader_states.h index 766bf1d..2276713 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_shader_states.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_shader_states.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2018-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -25,6 +25,41 @@ * including this header file. This header file can be included multiple * times in the same compilation unit with different definitions of * KBASEP_SHADER_STATE(). + * + * @OFF_CORESTACK_OFF: The shaders and core stacks are off + * @OFF_CORESTACK_PEND_ON: The shaders are off, core stacks have been + * requested to power on and hwcnt is being + * disabled + * @PEND_ON_CORESTACK_ON: Core stacks are on, shaders have been + * requested to power on. Or after doing + * partial shader on/off, checking whether + * it's the desired state. + * @ON_CORESTACK_ON: The shaders and core stacks are on, and + * hwcnt already enabled. + * @ON_CORESTACK_ON_RECHECK: The shaders and core stacks are on, hwcnt + * disabled, and checks to powering down or + * re-enabling hwcnt. 
+ * @WAIT_OFF_CORESTACK_ON: The shaders have been requested to power + * off, but they remain on for the duration + * of the hysteresis timer + * @WAIT_GPU_IDLE: The shaders partial poweroff needs to + * reach a state where jobs on the GPU are + * finished including jobs currently running + * and in the GPU queue because of + * GPU2017-861 + * @WAIT_FINISHED_CORESTACK_ON: The hysteresis timer has expired + * @L2_FLUSHING_CORESTACK_ON: The core stacks are on and the level 2 + * cache is being flushed. + * @READY_OFF_CORESTACK_ON: The core stacks are on and the shaders are + * ready to be powered off. + * @PEND_OFF_CORESTACK_ON: The core stacks are on, and the shaders + * have been requested to power off + * @OFF_CORESTACK_PEND_OFF: The shaders are off, and the core stacks + * have been requested to power off + * @OFF_CORESTACK_OFF_TIMER_PEND_OFF: Shaders and corestacks are off, but the + * tick timer cancellation is still pending. + * @RESET_WAIT: The GPU is resetting, shader and core + * stack power states are unknown */ KBASEP_SHADER_STATE(OFF_CORESTACK_OFF) KBASEP_SHADER_STATE(OFF_CORESTACK_PEND_ON) diff --git a/mali_kbase/backend/gpu/mali_kbase_time.c b/mali_kbase/backend/gpu/mali_kbase_time.c index f964af0..ea7b21a 100644 --- a/mali_kbase/backend/gpu/mali_kbase_time.c +++ b/mali_kbase/backend/gpu/mali_kbase_time.c @@ -76,6 +76,9 @@ void kbase_backend_get_gpu_time_norequest(struct kbase_device *kbdev, */ static bool timedwait_cycle_count_active(struct kbase_device *kbdev) { +#ifdef CONFIG_MALI_NO_MALI + return true; +#else bool success = false; const unsigned int timeout = 100; const unsigned long remaining = jiffies + msecs_to_jiffies(timeout); @@ -87,8 +90,8 @@ static bool timedwait_cycle_count_active(struct kbase_device *kbdev) break; } } - return success; +#endif } #endif diff --git a/mali_kbase/csf/mali_kbase_csf.c b/mali_kbase/csf/mali_kbase_csf.c index e35c570..e3e046c 100644 --- a/mali_kbase/csf/mali_kbase_csf.c +++ b/mali_kbase/csf/mali_kbase_csf.c @@ -27,7 +27,7 @@ #include <linux/export.h> #include <linux/priority_control_manager.h> #include <linux/shmem_fs.h> -#include "mali_gpu_csf_registers.h" +#include <uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h> #include "mali_kbase_csf_tiler_heap.h" #include <mmu/mali_kbase_mmu.h> #include "mali_kbase_csf_timeout.h" @@ -588,7 +588,7 @@ void kbase_csf_queue_terminate(struct kbase_context *kctx, spin_lock_irqsave(&kctx->csf.event_lock, flags); dev_dbg(kctx->kbdev->dev, - "Remove any pending command queue fatal from context %p\n", + "Remove any pending command queue fatal from context %pK\n", (void *)kctx); list_del_init(&queue->error.link); spin_unlock_irqrestore(&kctx->csf.event_lock, flags); @@ -1132,6 +1132,26 @@ static int create_suspend_buffers(struct kbase_context *const kctx, } /** + * generate_group_uid() - Makes an ID unique to all kernel base devices + * and contexts, for a queue group and CSG. 
+ * + * Return: A unique ID in the form of an unsigned 32-bit integer + */ +static u32 generate_group_uid(void) +{ + /* use first KBase device to store max UID */ + struct kbase_device *kbdev = kbase_find_device(-1); + u32 uid = 1; + + if (kbdev) + uid = (u32) atomic_inc_return(&kbdev->group_max_uid_in_devices); + else + WARN(1, "NULL kbase device pointer in group UID generation"); + + return uid; +} + +/** * create_queue_group() - Create a queue group * * @kctx: Address of the kbase context within which the queue group @@ -1142,7 +1162,7 @@ static int create_suspend_buffers(struct kbase_context *const kctx, * Return: a queue group handle on success, or a negative error code on failure. */ static int create_queue_group(struct kbase_context *const kctx, - const union kbase_ioctl_cs_queue_group_create *const create) + union kbase_ioctl_cs_queue_group_create *const create) { int group_handle = find_free_group_handle(kctx); @@ -1178,6 +1198,9 @@ static int create_queue_group(struct kbase_context *const kctx, group->doorbell_nr = KBASEP_USER_DB_NR_INVALID; group->faulted = false; + group->group_uid = generate_group_uid(); + create->out.group_uid = group->group_uid; + INIT_LIST_HEAD(&group->link); INIT_LIST_HEAD(&group->link_to_schedule); INIT_LIST_HEAD(&group->error_fatal.link); @@ -1409,7 +1432,7 @@ void kbase_csf_queue_group_terminate(struct kbase_context *kctx, spin_lock_irqsave(&kctx->csf.event_lock, flags); dev_dbg(kbdev->dev, - "Remove any pending group fatal error from context %p\n", + "Remove any pending group fatal error from context %pK\n", (void *)group->kctx); list_del_init(&group->error_tiler_oom.link); @@ -1503,7 +1526,7 @@ static void add_error(struct kbase_context *const kctx, error->data = *data; list_add_tail(&error->link, &kctx->csf.error_list); dev_dbg(kctx->kbdev->dev, - "Added error %p of type %d in context %p\n", + "Added error %pK of type %d in context %pK\n", (void *)error, data->type, (void *)kctx); } @@ -1796,7 +1819,7 @@ int kbase_csf_event_wait_add(struct kbase_context *kctx, spin_lock_irqsave(&kctx->csf.event_lock, flags); list_add_tail(&event->link, &kctx->csf.event_callback_list); dev_dbg(kctx->kbdev->dev, - "Added event handler %p with param %p\n", event, + "Added event handler %pK with param %pK\n", event, event->param); spin_unlock_irqrestore(&kctx->csf.event_lock, flags); @@ -1818,7 +1841,7 @@ void kbase_csf_event_wait_remove(struct kbase_context *kctx, if ((event->callback == callback) && (event->param == param)) { list_del(&event->link); dev_dbg(kctx->kbdev->dev, - "Removed event handler %p with param %p\n", + "Removed event handler %pK with param %pK\n", event, event->param); kfree(event); break; @@ -1841,7 +1864,7 @@ bool kbase_csf_read_error(struct kbase_context *kctx, struct kbase_csf_notification, link); list_del_init(&error_data->link); *event_data = error_data->data; - dev_dbg(kctx->kbdev->dev, "Dequeued error %p in context %p\n", + dev_dbg(kctx->kbdev->dev, "Dequeued error %pK in context %pK\n", (void *)error_data, (void *)kctx); } else { got_event = false; @@ -1859,7 +1882,7 @@ bool kbase_csf_error_pending(struct kbase_context *kctx) spin_lock_irqsave(&kctx->csf.event_lock, flags); event_pended = !list_empty(&kctx->csf.error_list); - dev_dbg(kctx->kbdev->dev, "%s error is pending in context %p\n", + dev_dbg(kctx->kbdev->dev, "%s error is pending in context %pK\n", event_pended ? 
"An" : "No", (void *)kctx); spin_unlock_irqrestore(&kctx->csf.event_lock, flags); @@ -1872,7 +1895,7 @@ void kbase_csf_event_signal(struct kbase_context *kctx, bool notify_gpu) unsigned long flags; dev_dbg(kctx->kbdev->dev, - "Signal event (%s GPU notify) for context %p\n", + "Signal event (%s GPU notify) for context %pK\n", notify_gpu ? "with" : "without", (void *)kctx); /* First increment the signal count and wake up event thread. @@ -1903,7 +1926,7 @@ void kbase_csf_event_signal(struct kbase_context *kctx, bool notify_gpu) enum kbase_csf_event_callback_action action; dev_dbg(kctx->kbdev->dev, - "Calling event handler %p with param %p\n", + "Calling event handler %pK with param %pK\n", (void *)event, event->param); action = event->callback(event->param); if (action == KBASE_CSF_EVENT_CALLBACK_REMOVE) { @@ -1926,7 +1949,7 @@ void kbase_csf_event_wait_remove_all(struct kbase_context *kctx) event, next_event, &kctx->csf.event_callback_list, link) { list_del(&event->link); dev_dbg(kctx->kbdev->dev, - "Removed event handler %p with param %p\n", + "Removed event handler %pK with param %pK\n", (void *)event, event->param); kfree(event); } @@ -2231,6 +2254,31 @@ static void protm_event_worker(struct work_struct *data) kbase_csf_scheduler_group_protm_enter(group); } +static void report_queue_fatal_error(struct kbase_queue *const queue, + u32 cs_fatal, u64 cs_fatal_info, + u8 group_handle) +{ + struct base_csf_notification error = + { .type = BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + .payload = { + .csg_error = { + .handle = group_handle, + .error = { + .error_type = + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + .payload = { + .fatal_queue = { + .sideband = + cs_fatal_info, + .status = cs_fatal, + .csi_index = + queue->csi_index, + } } } } } }; + + add_error(queue->kctx, &queue->error, &error); + kbase_event_wakeup(queue->kctx); +} + /** * handle_fault_event - Handler for CS fault. * @@ -2268,51 +2316,10 @@ handle_fault_event(struct kbase_queue *const queue, kbase_gpu_exception_name(cs_fault_exception_type), cs_fault_exception_data, cs_fault_info_exception_data); - /* TODO GPUCORE-26291: We've'identified an issue with faulted CSIs not - * making progress in some cases. Until the issue is resolved, - * RESOURCE_EVICTION_TIMEOUT error shall be treated as a fatal error - * to give userspace a chance to terminate the group. This is intended - * to be a temporary workaround. 
- */ if (cs_fault_exception_type == CS_FAULT_EXCEPTION_TYPE_RESOURCE_EVICTION_TIMEOUT) - kbase_csf_add_queue_fatal_error( - queue, GPU_EXCEPTION_TYPE_SW_FAULT_2, 0); -} - -static void report_queue_fatal_error(struct kbase_queue *const queue, - u32 cs_fatal, u64 cs_fatal_info, - u8 group_handle) -{ - struct base_csf_notification error = { - .type = BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, - .payload = { - .csg_error = { - .handle = group_handle, - .error = { - .error_type = - BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, - .payload = { - .fatal_queue = { - .sideband = cs_fatal_info, - .status = cs_fatal, - .csi_index = queue->csi_index, - } - } - } - } - } - }; - - add_error(queue->kctx, &queue->error, &error); - kbase_event_wakeup(queue->kctx); -} - -void kbase_csf_add_queue_fatal_error(struct kbase_queue *const queue, - u32 cs_fatal, u64 cs_fatal_info) -{ - report_queue_fatal_error(queue, cs_fatal, cs_fatal_info, - queue->group->handle); + report_queue_fatal_error(queue, GPU_EXCEPTION_TYPE_SW_FAULT_2, + 0, queue->group->handle); } /** @@ -2643,8 +2650,20 @@ static void process_csg_interrupts(struct kbase_device *const kbdev, process_cs_interrupts(group, ginfo, irqreq, irqack); } +/** + * process_prfcnt_interrupts - Process performance counter interrupts. + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * @glb_req: Global request register value. + * @glb_ack: Global acknowledge register value. + * + * Handles interrupts issued by the firmware that relate to the performance + * counters. For example, on completion of a performance counter sample. It is + * expected that the scheduler spinlock is already held on calling this + * function. + */ static void process_prfcnt_interrupts(struct kbase_device *kbdev, u32 glb_req, - u32 glb_ack, unsigned long *flags) + u32 glb_ack) { const struct kbase_csf_global_iface *const global_iface = &kbdev->csf.global_iface; @@ -2656,14 +2675,11 @@ static void process_prfcnt_interrupts(struct kbase_device *kbdev, u32 glb_req, ((glb_req & GLB_REQ_PRFCNT_SAMPLE_MASK) == (glb_ack & GLB_REQ_PRFCNT_SAMPLE_MASK))) { kbdev->csf.hwcnt.request_pending = false; - kbase_csf_scheduler_spin_unlock(kbdev, *flags); dev_dbg(kbdev->dev, "PRFCNT_SAMPLE done interrupt received."); kbase_hwcnt_backend_csf_on_prfcnt_sample( &kbdev->hwcnt_gpu_iface); - - kbase_csf_scheduler_spin_lock(kbdev, flags); } /* Process PRFCNT_ENABLE interrupt. */ @@ -2671,32 +2687,25 @@ static void process_prfcnt_interrupts(struct kbase_device *kbdev, u32 glb_req, ((glb_req & GLB_REQ_PRFCNT_ENABLE_MASK) == (glb_ack & GLB_REQ_PRFCNT_ENABLE_MASK))) { kbdev->csf.hwcnt.enable_pending = false; - kbase_csf_scheduler_spin_unlock(kbdev, *flags); dev_dbg(kbdev->dev, "PRFCNT_ENABLE status changed interrupt received."); - if (glb_ack & GLB_REQ_PRFCNT_ENABLE_MASK) { + if (glb_ack & GLB_REQ_PRFCNT_ENABLE_MASK) kbase_hwcnt_backend_csf_on_prfcnt_enable( &kbdev->hwcnt_gpu_iface); - } else { + else kbase_hwcnt_backend_csf_on_prfcnt_disable( &kbdev->hwcnt_gpu_iface); - } - - kbase_csf_scheduler_spin_lock(kbdev, flags); } /* Process PRFCNT_THRESHOLD interrupt. 
*/ if ((glb_req ^ glb_ack) & GLB_REQ_PRFCNT_THRESHOLD_MASK) { - kbase_csf_scheduler_spin_unlock(kbdev, *flags); dev_dbg(kbdev->dev, "PRFCNT_THRESHOLD interrupt received."); kbase_hwcnt_backend_csf_on_prfcnt_threshold( &kbdev->hwcnt_gpu_iface); - kbase_csf_scheduler_spin_lock(kbdev, flags); - /* Set the GLB_REQ.PRFCNT_THRESHOLD flag back to * the same value as GLB_ACK.PRFCNT_THRESHOLD * flag in order to enable reporting of another @@ -2709,13 +2718,11 @@ static void process_prfcnt_interrupts(struct kbase_device *kbdev, u32 glb_req, /* Process PRFCNT_OVERFLOW interrupt. */ if ((glb_req ^ glb_ack) & GLB_REQ_PRFCNT_OVERFLOW_MASK) { - kbase_csf_scheduler_spin_unlock(kbdev, *flags); dev_dbg(kbdev->dev, "PRFCNT_OVERFLOW interrupt received."); kbase_hwcnt_backend_csf_on_prfcnt_overflow( &kbdev->hwcnt_gpu_iface); - kbase_csf_scheduler_spin_lock(kbdev, flags); /* Set the GLB_REQ.PRFCNT_OVERFLOW flag back to * the same value as GLB_ACK.PRFCNT_OVERFLOW * flag in order to enable reporting of another @@ -2790,8 +2797,7 @@ void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val) } } - process_prfcnt_interrupts(kbdev, glb_req, glb_ack, - &flags); + process_prfcnt_interrupts(kbdev, glb_req, glb_ack); kbase_csf_scheduler_spin_unlock(kbdev, flags); diff --git a/mali_kbase/csf/mali_kbase_csf.h b/mali_kbase/csf/mali_kbase_csf.h index 6252515..effd468 100644 --- a/mali_kbase/csf/mali_kbase_csf.h +++ b/mali_kbase/csf/mali_kbase_csf.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2018-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -366,19 +366,6 @@ void kbase_csf_add_group_fatal_error( struct base_gpu_queue_group_error const *const err_payload); /** - * kbase_csf_add_queue_fatal_error - Report a fatal queue error to userspace - * - * @queue: Pointer to queue for which fatal event was received. - * @cs_fatal: Fault information - * @cs_fatal_info: Additional fault information - * - * If a queue has already been in fatal error status, - * subsequent fatal error on the queue should never take place. - */ -void kbase_csf_add_queue_fatal_error(struct kbase_queue *const queue, - u32 cs_fatal, u64 cs_fatal_info); - -/** * kbase_csf_interrupt - Handle interrupts issued by CSF firmware. * * @kbdev: The kbase device to handle an IRQ for diff --git a/mali_kbase/csf/mali_kbase_csf_cpu_queue_debugfs.c b/mali_kbase/csf/mali_kbase_csf_cpu_queue_debugfs.c index fb3a718..b54b2fc 100644 --- a/mali_kbase/csf/mali_kbase_csf_cpu_queue_debugfs.c +++ b/mali_kbase/csf/mali_kbase_csf_cpu_queue_debugfs.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. 
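[Editor's note] The process_prfcnt_interrupts() changes above rely on the GLB_REQ/GLB_ACK toggle convention: an event of a given type is pending while its request and acknowledge bits differ, and writing the acknowledge value back into the request bit re-arms reporting of the next event, as the retained PRFCNT_THRESHOLD and PRFCNT_OVERFLOW comments describe. A minimal standalone sketch of that convention follows; the mask value is illustrative, not the real register layout.

#include <stdint.h>
#include <stdbool.h>

/* Illustrative mask only; the real GLB_REQ_* masks live in the CSF register
 * definitions. */
#define EXAMPLE_PRFCNT_THRESHOLD_MASK (1u << 5)

/* An event is pending while its request and acknowledge bits differ. */
static bool glb_event_pending(uint32_t glb_req, uint32_t glb_ack, uint32_t mask)
{
	return ((glb_req ^ glb_ack) & mask) != 0;
}

/* Copy the acknowledge bit back into the request word to re-arm the event. */
static uint32_t glb_event_rearm(uint32_t glb_req, uint32_t glb_ack, uint32_t mask)
{
	return (glb_req & ~mask) | (glb_ack & mask);
}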
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -154,8 +154,7 @@ int kbase_csf_cpu_queue_dump(struct kbase_context *kctx, mutex_lock(&kctx->csf.lock); - if (kctx->csf.cpu_queue.buffer) - kfree(kctx->csf.cpu_queue.buffer); + kfree(kctx->csf.cpu_queue.buffer); if (atomic_read(&kctx->csf.cpu_queue.dump_req_status) == BASE_CSF_CPU_QUEUE_DUMP_PENDING) { diff --git a/mali_kbase/csf/mali_kbase_csf_defs.h b/mali_kbase/csf/mali_kbase_csf_defs.h index a6f1958..0517399 100644 --- a/mali_kbase/csf/mali_kbase_csf_defs.h +++ b/mali_kbase/csf/mali_kbase_csf_defs.h @@ -401,6 +401,8 @@ struct kbase_protected_suspend_buffer { * @tiler_mask: Mask of tiler endpoints the group is allowed to use. * @fragment_mask: Mask of fragment endpoints the group is allowed to use. * @compute_mask: Mask of compute endpoints the group is allowed to use. + * @group_uid: 32-bit wide unsigned identifier for the group, unique + * across all kbase devices and contexts. * @link: Link to this queue group in the 'runnable_groups' list of * the corresponding kctx. * @link_to_schedule: Link to this queue group in the list of prepared groups @@ -449,6 +451,8 @@ struct kbase_queue_group { u64 fragment_mask; u64 compute_mask; + u32 group_uid; + struct list_head link; struct list_head link_to_schedule; enum kbase_csf_group_state run_state; @@ -801,9 +805,6 @@ struct kbase_csf_csg_slot { * other phases. * @non_idle_scanout_grps: Count on the non-idle groups in the scan-out * list at the scheduling prepare stage. - * @apply_async_protm: Signalling the internal scheduling apply stage to - * act with some special handling for entering the - * protected mode asynchronously. * @pm_active_count: Count indicating if the scheduler is owning a power * management reference count. Reference is taken when * the count becomes 1 and is dropped when the count @@ -853,7 +854,6 @@ struct kbase_csf_scheduler { struct work_struct gpu_idle_work; atomic_t non_idle_offslot_grps; u32 non_idle_scanout_grps; - bool apply_async_protm; u32 pm_active_count; unsigned int csg_scheduling_period_ms; bool tick_timer_active; @@ -1055,7 +1055,7 @@ struct kbase_csf_firmware_interface { struct protected_memory_allocation **pma; }; -/** +/* * struct kbase_csf_hwcnt - Object containing members for handling the dump of * HW counters. * diff --git a/mali_kbase/csf/mali_kbase_csf_firmware.c b/mali_kbase/csf/mali_kbase_csf_firmware.c index ae039aa..73b8e03 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware.c +++ b/mali_kbase/csf/mali_kbase_csf_firmware.c @@ -48,10 +48,17 @@ #define MALI_MAX_FIRMWARE_NAME_LEN ((size_t)20) + static char fw_name[MALI_MAX_FIRMWARE_NAME_LEN] = "mali_csffw.bin"; module_param_string(fw_name, fw_name, sizeof(fw_name), 0644); MODULE_PARM_DESC(fw_name, "firmware image"); +/* The waiting time for firmware to boot */ +static unsigned int csf_firmware_boot_timeout_ms = 500; +module_param(csf_firmware_boot_timeout_ms, uint, 0444); +MODULE_PARM_DESC(csf_firmware_boot_timeout_ms, + "Maximum time to wait for firmware to boot."); + #ifdef CONFIG_MALI_DEBUG /* Makes Driver wait indefinitely for an acknowledgment for the different * requests it sends to firmware. 
Otherwise the timeouts interfere with the @@ -93,7 +100,6 @@ MODULE_PARM_DESC(fw_debug, #define TL_METADATA_ENTRY_NAME_OFFSET (0x8) -#define CSF_FIRMWARE_BOOT_TIMEOUT_MS (500) #define CSF_MAX_FW_STOP_LOOPS (100000) #define CSF_GLB_REQ_CFG_MASK \ @@ -232,7 +238,7 @@ static void stop_csf_firmware(struct kbase_device *kbdev) static void wait_for_firmware_boot(struct kbase_device *kbdev) { const long wait_timeout = - kbase_csf_timeout_in_jiffies(CSF_FIRMWARE_BOOT_TIMEOUT_MS); + kbase_csf_timeout_in_jiffies(csf_firmware_boot_timeout_ms); long remaining; /* Firmware will generate a global interface interrupt once booting @@ -987,6 +993,7 @@ static int parse_capabilities(struct kbase_device *kbdev) iface->group_stride = shared_info[GLB_GROUP_STRIDE/4]; iface->prfcnt_size = shared_info[GLB_PRFCNT_SIZE/4]; + iface->instr_features = shared_info[GLB_INSTR_FEATURES / 4]; if ((GROUP_CONTROL_0 + (unsigned long)iface->group_num * iface->group_stride) > @@ -1239,14 +1246,8 @@ static void handle_internal_firmware_fatal(struct kbase_device *const kbdev) kbase_ctx_sched_release_ctx_lock(kctx); } - /* Internal FW error could mean hardware counters will stop working. - * Put the backend into the unrecoverable error state to cause - * current and subsequent counter operations to immediately - * fail, avoiding the risk of a hang. - */ - kbase_hwcnt_backend_csf_on_unrecoverable_error(&kbdev->hwcnt_gpu_iface); - - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, + RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); } @@ -1669,6 +1670,7 @@ u32 kbase_csf_firmware_set_mcu_core_pwroff_time(struct kbase_device *kbdev, u32 return pwroff; } + int kbase_csf_firmware_init(struct kbase_device *kbdev) { const struct firmware *firmware; @@ -1836,6 +1838,7 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) if (ret != 0) goto error; + /* Firmware loaded successfully */ release_firmware(firmware); KBASE_KTRACE_ADD(kbdev, FIRMWARE_BOOT, NULL, @@ -1987,7 +1990,7 @@ void kbase_csf_firmware_disable_gpu_idle_timer(struct kbase_device *kbdev) kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); } -int kbase_csf_firmware_ping(struct kbase_device *const kbdev) +void kbase_csf_firmware_ping(struct kbase_device *const kbdev) { const struct kbase_csf_global_iface *const global_iface = &kbdev->csf.global_iface; @@ -1997,7 +2000,11 @@ int kbase_csf_firmware_ping(struct kbase_device *const kbdev) set_global_request(global_iface, GLB_REQ_PING_MASK); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); kbase_csf_scheduler_spin_unlock(kbdev, flags); +} +int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev) +{ + kbase_csf_firmware_ping(kbdev); return wait_for_global_request(kbdev, GLB_REQ_PING_MASK); } @@ -2040,11 +2047,17 @@ void kbase_csf_enter_protected_mode(struct kbase_device *kbdev) err = wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); if (!err) { + unsigned long irq_flags; + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); kbdev->protected_mode = true; kbase_ipa_protection_mode_switch_event(kbdev); kbase_ipa_control_protm_entered(kbdev); + + kbase_csf_scheduler_spin_lock(kbdev, &irq_flags); kbase_hwcnt_backend_csf_protm_entered(&kbdev->hwcnt_gpu_iface); + kbase_csf_scheduler_spin_unlock(kbdev, irq_flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } } @@ -2139,26 +2152,28 @@ static u32 copy_grp_and_stm( return total_stream_num; } -u32 kbase_csf_firmware_get_glb_iface(struct kbase_device *kbdev, +u32 kbase_csf_firmware_get_glb_iface( + struct 
kbase_device *kbdev, struct basep_cs_group_control *const group_data, u32 const max_group_num, struct basep_cs_stream_control *const stream_data, u32 const max_total_stream_num, u32 *const glb_version, - u32 *const features, u32 *const group_num, u32 *const prfcnt_size) + u32 *const features, u32 *const group_num, u32 *const prfcnt_size, + u32 *instr_features) { const struct kbase_csf_global_iface * const iface = &kbdev->csf.global_iface; - if (WARN_ON(!glb_version) || - WARN_ON(!features) || - WARN_ON(!group_num) || - WARN_ON(!prfcnt_size)) + if (WARN_ON(!glb_version) || WARN_ON(!features) || + WARN_ON(!group_num) || WARN_ON(!prfcnt_size) || + WARN_ON(!instr_features)) return 0; *glb_version = iface->version; *features = iface->features; *group_num = iface->group_num; *prfcnt_size = iface->prfcnt_size; + *instr_features = iface->instr_features; return copy_grp_and_stm(iface, group_data, max_group_num, stream_data, max_total_stream_num); @@ -2237,9 +2252,9 @@ int kbase_csf_firmware_mcu_shared_mapping_init( mutex_lock(&kbdev->csf.reg_lock); ret = kbase_add_va_region_rbtree(kbdev, va_reg, 0, num_pages, 1); va_reg->flags &= ~KBASE_REG_FREE; - mutex_unlock(&kbdev->csf.reg_lock); if (ret) goto va_region_add_error; + mutex_unlock(&kbdev->csf.reg_lock); gpu_map_properties &= (KBASE_REG_GPU_RD | KBASE_REG_GPU_WR); gpu_map_properties |= gpu_map_prot; @@ -2261,9 +2276,9 @@ int kbase_csf_firmware_mcu_shared_mapping_init( mmu_insert_pages_error: mutex_lock(&kbdev->csf.reg_lock); kbase_remove_va_region(va_reg); - mutex_unlock(&kbdev->csf.reg_lock); va_region_add_error: kbase_free_alloced_region(va_reg); + mutex_unlock(&kbdev->csf.reg_lock); va_region_alloc_error: vunmap(cpu_addr); vmap_error: @@ -2293,8 +2308,8 @@ void kbase_csf_firmware_mcu_shared_mapping_term( if (csf_mapping->va_reg) { mutex_lock(&kbdev->csf.reg_lock); kbase_remove_va_region(csf_mapping->va_reg); - mutex_unlock(&kbdev->csf.reg_lock); kbase_free_alloced_region(csf_mapping->va_reg); + mutex_unlock(&kbdev->csf.reg_lock); } if (csf_mapping->phys) { diff --git a/mali_kbase/csf/mali_kbase_csf_firmware.h b/mali_kbase/csf/mali_kbase_csf_firmware.h index a2dc4fd..13ff701 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware.h +++ b/mali_kbase/csf/mali_kbase_csf_firmware.h @@ -23,7 +23,7 @@ #define _KBASE_CSF_FIRMWARE_H_ #include "device/mali_kbase_device.h" -#include "mali_gpu_csf_registers.h" +#include <uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h> /* * PAGE_KERNEL_RO was only defined on 32bit ARM in 4.19 in: @@ -266,6 +266,7 @@ u32 kbase_csf_firmware_csg_output( * @group_stride: Stride in bytes in JASID0 virtual address between * CSG capability structures. * @prfcnt_size: Performance counters size. + * @instr_features: Instrumentation features. * @groups: Address of an array of CSG capability structures. */ struct kbase_csf_global_iface { @@ -277,6 +278,7 @@ struct kbase_csf_global_iface { u32 group_num; u32 group_stride; u32 prfcnt_size; + u32 instr_features; struct kbase_csf_cmd_stream_group_info *groups; }; @@ -397,13 +399,23 @@ void kbase_csf_firmware_term(struct kbase_device *kbdev); /** * kbase_csf_firmware_ping - Send the ping request to firmware. * - * The function sends the ping request to firmware to confirm it is alive. + * The function sends the ping request to firmware. + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + */ +void kbase_csf_firmware_ping(struct kbase_device *kbdev); + +/** + * kbase_csf_firmware_ping_wait - Send the ping request to firmware and waits. 
+ * + * The function sends the ping request to firmware and waits to confirm it is + * alive. * * @kbdev: Instance of a GPU platform device that implements a CSF interface. * * Return: 0 on success, or negative on failure. */ -int kbase_csf_firmware_ping(struct kbase_device *kbdev); +int kbase_csf_firmware_ping_wait(struct kbase_device *kbdev); /** * kbase_csf_firmware_set_timeout - Set a hardware endpoint progress timeout. @@ -570,12 +582,14 @@ bool kbase_csf_firmware_core_attr_updated(struct kbase_device *kbdev); * in bytes. Bits 31:16 hold the size of firmware * performance counter data and 15:0 hold the size of * hardware performance counter data. - */ -u32 kbase_csf_firmware_get_glb_iface(struct kbase_device *kbdev, - struct basep_cs_group_control *group_data, u32 max_group_num, - struct basep_cs_stream_control *stream_data, u32 max_total_stream_num, - u32 *glb_version, u32 *features, u32 *group_num, u32 *prfcnt_size); - + * @instr_features: Instrumentation features. Bits 7:4 hold the max size + * of events. Bits 3:0 hold the offset update rate. + */ +u32 kbase_csf_firmware_get_glb_iface( + struct kbase_device *kbdev, struct basep_cs_group_control *group_data, + u32 max_group_num, struct basep_cs_stream_control *stream_data, + u32 max_total_stream_num, u32 *glb_version, u32 *features, + u32 *group_num, u32 *prfcnt_size, u32 *instr_features); /** * Get CSF firmware header timeline metadata content diff --git a/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c b/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c index 6349917..a3901cd 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c +++ b/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c @@ -237,6 +237,9 @@ static int invent_capabilities(struct kbase_device *kbdev) iface->kbdev = kbdev; iface->features = 0; iface->prfcnt_size = 64; + iface->instr_features = + 0x81; /* update rate=1, max event size = 1<<8 = 256 */ + iface->group_num = ARRAY_SIZE(interface->csg); iface->group_stride = 0; @@ -463,14 +466,8 @@ static void handle_internal_firmware_fatal(struct kbase_device *const kbdev) kbase_ctx_sched_release_ctx_lock(kctx); } - /* Internal FW error could mean hardware counters will stop working. - * Put the backend into the unrecoverable error state to cause - * current and subsequent counter operations to immediately - * fail, avoiding the risk of a hang. 
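The ping interface is now split: kbase_csf_firmware_ping() only rings the doorbell with GLB_REQ_PING_MASK set, while kbase_csf_firmware_ping_wait() additionally blocks on the acknowledgment. A minimal sketch of how a caller might use the blocking variant, mirroring the firmware_aliveness_monitor() change later in this patch (the wrapper function name is illustrative and not part of the patch):

/* Illustrative caller of the split ping API. */
static void example_check_firmware_alive(struct kbase_device *kbdev)
{
	int err = kbase_csf_firmware_ping_wait(kbdev);

	if (err) {
		/* No acknowledgment in time: mark HW counters unrecoverable
		 * via the new reset flag and request a GPU reset.
		 */
		if (kbase_prepare_to_reset_gpu(kbdev,
				RESET_FLAGS_HWC_UNRECOVERABLE_ERROR))
			kbase_reset_gpu(kbdev);
	}
}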
- */ - kbase_hwcnt_backend_csf_on_unrecoverable_error(&kbdev->hwcnt_gpu_iface); - - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, + RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); } @@ -1032,7 +1029,7 @@ void kbase_csf_firmware_disable_gpu_idle_timer(struct kbase_device *kbdev) kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); } -int kbase_csf_firmware_ping(struct kbase_device *const kbdev) +void kbase_csf_firmware_ping(struct kbase_device *const kbdev) { const struct kbase_csf_global_iface *const global_iface = &kbdev->csf.global_iface; @@ -1042,7 +1039,11 @@ int kbase_csf_firmware_ping(struct kbase_device *const kbdev) set_global_request(global_iface, GLB_REQ_PING_MASK); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); kbase_csf_scheduler_spin_unlock(kbdev, flags); +} +int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev) +{ + kbase_csf_firmware_ping(kbdev); return wait_for_global_request(kbdev, GLB_REQ_PING_MASK); } @@ -1170,26 +1171,28 @@ static u32 copy_grp_and_stm( return total_stream_num; } -u32 kbase_csf_firmware_get_glb_iface(struct kbase_device *kbdev, +u32 kbase_csf_firmware_get_glb_iface( + struct kbase_device *kbdev, struct basep_cs_group_control *const group_data, u32 const max_group_num, struct basep_cs_stream_control *const stream_data, u32 const max_total_stream_num, u32 *const glb_version, - u32 *const features, u32 *const group_num, u32 *const prfcnt_size) + u32 *const features, u32 *const group_num, u32 *const prfcnt_size, + u32 *const instr_features) { const struct kbase_csf_global_iface * const iface = &kbdev->csf.global_iface; - if (WARN_ON(!glb_version) || - WARN_ON(!features) || - WARN_ON(!group_num) || - WARN_ON(!prfcnt_size)) + if (WARN_ON(!glb_version) || WARN_ON(!features) || + WARN_ON(!group_num) || WARN_ON(!prfcnt_size) || + WARN_ON(!instr_features)) return 0; *glb_version = iface->version; *features = iface->features; *group_num = iface->group_num; *prfcnt_size = iface->prfcnt_size; + *instr_features = iface->instr_features; return copy_grp_and_stm(iface, group_data, max_group_num, stream_data, max_total_stream_num); @@ -1269,9 +1272,9 @@ int kbase_csf_firmware_mcu_shared_mapping_init( mutex_lock(&kbdev->csf.reg_lock); ret = kbase_add_va_region_rbtree(kbdev, va_reg, 0, num_pages, 1); va_reg->flags &= ~KBASE_REG_FREE; - mutex_unlock(&kbdev->csf.reg_lock); if (ret) goto va_region_add_error; + mutex_unlock(&kbdev->csf.reg_lock); gpu_map_properties &= (KBASE_REG_GPU_RD | KBASE_REG_GPU_WR); gpu_map_properties |= gpu_map_prot; @@ -1293,9 +1296,9 @@ int kbase_csf_firmware_mcu_shared_mapping_init( mmu_insert_pages_error: mutex_lock(&kbdev->csf.reg_lock); kbase_remove_va_region(va_reg); - mutex_unlock(&kbdev->csf.reg_lock); va_region_add_error: kbase_free_alloced_region(va_reg); + mutex_unlock(&kbdev->csf.reg_lock); va_region_alloc_error: vunmap(cpu_addr); vmap_error: @@ -1325,8 +1328,8 @@ void kbase_csf_firmware_mcu_shared_mapping_term( if (csf_mapping->va_reg) { mutex_lock(&kbdev->csf.reg_lock); kbase_remove_va_region(csf_mapping->va_reg); - mutex_unlock(&kbdev->csf.reg_lock); kbase_free_alloced_region(csf_mapping->va_reg); + mutex_unlock(&kbdev->csf.reg_lock); } if (csf_mapping->phys) { diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.c b/mali_kbase/csf/mali_kbase_csf_kcpu.c index e5aee61..1203d2c 100644 --- a/mali_kbase/csf/mali_kbase_csf_kcpu.c +++ b/mali_kbase/csf/mali_kbase_csf_kcpu.c @@ -257,7 +257,7 @@ static int kbase_kcpu_jit_allocate_process( * No prior JIT_FREE command is active. 
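On the new @instr_features field: the kbase_csf_firmware_get_glb_iface() documentation above states that bits 7:4 hold the maximum event size and bits 3:0 hold the offset update rate, and the no-mali backend's dummy value 0x81 is annotated as "update rate=1, max event size = 1<<8 = 256". A small sketch of that reading of the encoding (helper and variable names are illustrative; the log2 interpretation of bits 7:4 is inferred from the 0x81 comment):

/* Illustrative decode of GLB_INSTR_FEATURES as documented above. */
static void example_decode_instr_features(u32 instr_features)
{
	u32 offset_update_rate = instr_features & 0xF;            /* bits 3:0 */
	u32 max_event_size = 1u << ((instr_features >> 4) & 0xF); /* bits 7:4 */

	/* For the dummy value 0x81: rate = 1, size = 1 << 8 = 256. */
	(void)offset_update_rate;
	(void)max_event_size;
}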
Roll * back previous allocations and fail. */ - dev_warn_ratelimited(kctx->kbdev->dev, "JIT alloc command failed: %p\n", cmd); + dev_warn_ratelimited(kctx->kbdev->dev, "JIT alloc command failed: %pK\n", cmd); ret = -ENOMEM; goto fail; } @@ -858,10 +858,7 @@ static void kbase_kcpu_cqs_set_process(struct kbase_device *kbdev, "Sync memory %llx already freed", cqs_set->objs[i].addr); queue->has_error = true; } else { - if (cqs_set->propagate_flags & (1 << i)) - evt[BASEP_EVENT_ERR_INDEX] = queue->has_error; - else - evt[BASEP_EVENT_ERR_INDEX] = false; + evt[BASEP_EVENT_ERR_INDEX] = queue->has_error; /* Set to signaled */ evt[BASEP_EVENT_VAL_INDEX]++; kbase_phy_alloc_mapping_put(queue->kctx, mapping); @@ -908,8 +905,267 @@ static int kbase_kcpu_cqs_set_prepare( current_command->type = BASE_KCPU_COMMAND_TYPE_CQS_SET; current_command->info.cqs_set.nr_objs = nr_objs; current_command->info.cqs_set.objs = objs; - current_command->info.cqs_set.propagate_flags = - cqs_set_info->propagate_flags; + + return 0; +} + +static void cleanup_cqs_wait_operation(struct kbase_kcpu_command_queue *queue, + struct kbase_kcpu_command_cqs_wait_operation_info *cqs_wait_operation) +{ + WARN_ON(!cqs_wait_operation->nr_objs); + WARN_ON(!cqs_wait_operation->objs); + WARN_ON(!cqs_wait_operation->signaled); + WARN_ON(!queue->cqs_wait_count); + + if (--queue->cqs_wait_count == 0) { + kbase_csf_event_wait_remove(queue->kctx, + event_cqs_callback, queue); + } + + kfree(cqs_wait_operation->signaled); + kfree(cqs_wait_operation->objs); + cqs_wait_operation->signaled = NULL; + cqs_wait_operation->objs = NULL; +} + +static int kbase_kcpu_cqs_wait_operation_process(struct kbase_device *kbdev, + struct kbase_kcpu_command_queue *queue, + struct kbase_kcpu_command_cqs_wait_operation_info *cqs_wait_operation) +{ + u32 i; + + lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + + if (WARN_ON(!cqs_wait_operation->objs)) + return -EINVAL; + + /* Skip the CQS waits that have already been signaled when processing */ + for (i = find_first_zero_bit(cqs_wait_operation->signaled, cqs_wait_operation->nr_objs); i < cqs_wait_operation->nr_objs; i++) { + if (!test_bit(i, cqs_wait_operation->signaled)) { + struct kbase_vmap_struct *mapping; + bool sig_set; + u64 *evt = (u64 *)kbase_phy_alloc_mapping_get(queue->kctx, + cqs_wait_operation->objs[i].addr, &mapping); + + /* GPUCORE-28172 RDT to review */ + if (!queue->command_started) + queue->command_started = true; + + if (!evt) { + dev_warn(kbdev->dev, + "Sync memory %llx already freed", cqs_wait_operation->objs[i].addr); + queue->has_error = true; + return -EINVAL; + } + + switch (cqs_wait_operation->objs[i].operation) { + case BASEP_CQS_WAIT_OPERATION_LE: + sig_set = *evt <= cqs_wait_operation->objs[i].val; + break; + case BASEP_CQS_WAIT_OPERATION_GT: + sig_set = *evt > cqs_wait_operation->objs[i].val; + break; + default: + dev_warn(kbdev->dev, + "Unsupported CQS wait operation %d", cqs_wait_operation->objs[i].operation); + + kbase_phy_alloc_mapping_put(queue->kctx, mapping); + queue->has_error = true; + + return -EINVAL; + } + + /* Increment evt up to the error_state value depending on the CQS data type */ + switch (cqs_wait_operation->objs[i].data_type) { + default: + dev_warn(kbdev->dev, "Unreachable data_type=%d", cqs_wait_operation->objs[i].data_type); + /* Fallthrough - hint to compiler that there's really only 2 options at present */ + case BASEP_CQS_DATA_TYPE_U32: + evt = (u64 *)((u8 *)evt + sizeof(u32)); + break; + case BASEP_CQS_DATA_TYPE_U64: + evt = (u64 *)((u8 *)evt + 
sizeof(u64)); + break; + } + + if (sig_set) { + bitmap_set(cqs_wait_operation->signaled, i, 1); + if ((cqs_wait_operation->inherit_err_flags & (1U << i)) && + *evt > 0) { + queue->has_error = true; + } + + /* GPUCORE-28172 RDT to review */ + + queue->command_started = false; + } + + kbase_phy_alloc_mapping_put(queue->kctx, mapping); + + if (!sig_set) + break; + } + } + + /* For the queue to progress further, all cqs objects should get + * signaled. + */ + return bitmap_full(cqs_wait_operation->signaled, cqs_wait_operation->nr_objs); +} + +static int kbase_kcpu_cqs_wait_operation_prepare(struct kbase_kcpu_command_queue *queue, + struct base_kcpu_command_cqs_wait_operation_info *cqs_wait_operation_info, + struct kbase_kcpu_command *current_command) +{ + struct base_cqs_wait_operation_info *objs; + unsigned int nr_objs = cqs_wait_operation_info->nr_objs; + + lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + + if (nr_objs > BASEP_KCPU_CQS_MAX_NUM_OBJS) + return -EINVAL; + + if (!nr_objs) + return -EINVAL; + + objs = kcalloc(nr_objs, sizeof(*objs), GFP_KERNEL); + if (!objs) + return -ENOMEM; + + if (copy_from_user(objs, u64_to_user_ptr(cqs_wait_operation_info->objs), + nr_objs * sizeof(*objs))) { + kfree(objs); + return -ENOMEM; + } + + if (++queue->cqs_wait_count == 1) { + if (kbase_csf_event_wait_add(queue->kctx, + event_cqs_callback, queue)) { + kfree(objs); + queue->cqs_wait_count--; + return -ENOMEM; + } + } + + current_command->type = BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION; + current_command->info.cqs_wait_operation.nr_objs = nr_objs; + current_command->info.cqs_wait_operation.objs = objs; + current_command->info.cqs_wait_operation.inherit_err_flags = + cqs_wait_operation_info->inherit_err_flags; + + current_command->info.cqs_wait_operation.signaled = kcalloc(BITS_TO_LONGS(nr_objs), + sizeof(*current_command->info.cqs_wait_operation.signaled), GFP_KERNEL); + if (!current_command->info.cqs_wait_operation.signaled) { + if (--queue->cqs_wait_count == 0) { + kbase_csf_event_wait_remove(queue->kctx, + event_cqs_callback, queue); + } + + kfree(objs); + return -ENOMEM; + } + + return 0; +} + +static void kbase_kcpu_cqs_set_operation_process( + struct kbase_device *kbdev, + struct kbase_kcpu_command_queue *queue, + struct kbase_kcpu_command_cqs_set_operation_info *cqs_set_operation) +{ + unsigned int i; + + lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + + if (WARN_ON(!cqs_set_operation->objs)) + return; + + for (i = 0; i < cqs_set_operation->nr_objs; i++) { + struct kbase_vmap_struct *mapping; + u64 *evt; + + evt = (u64 *)kbase_phy_alloc_mapping_get( + queue->kctx, cqs_set_operation->objs[i].addr, &mapping); + + /* GPUCORE-28172 RDT to review */ + + if (!evt) { + dev_warn(kbdev->dev, + "Sync memory %llx already freed", cqs_set_operation->objs[i].addr); + queue->has_error = true; + } else { + switch (cqs_set_operation->objs[i].operation) { + case BASEP_CQS_SET_OPERATION_ADD: + *evt += cqs_set_operation->objs[i].val; + break; + case BASEP_CQS_SET_OPERATION_SET: + *evt = cqs_set_operation->objs[i].val; + break; + default: + dev_warn(kbdev->dev, + "Unsupported CQS set operation %d", cqs_set_operation->objs[i].operation); + queue->has_error = true; + break; + } + + /* Increment evt up to the error_state value depending on the CQS data type */ + switch (cqs_set_operation->objs[i].data_type) { + default: + dev_warn(kbdev->dev, "Unreachable data_type=%d", cqs_set_operation->objs[i].data_type); + /* Fallthrough - hint to compiler that there's really only 2 options at present */ + 
case BASEP_CQS_DATA_TYPE_U32: + evt = (u64 *)((u8 *)evt + sizeof(u32)); + break; + case BASEP_CQS_DATA_TYPE_U64: + evt = (u64 *)((u8 *)evt + sizeof(u64)); + break; + } + + /* GPUCORE-28172 RDT to review */ + + /* Always propagate errors */ + *evt = queue->has_error; + + kbase_phy_alloc_mapping_put(queue->kctx, mapping); + } + } + + kbase_csf_event_signal_notify_gpu(queue->kctx); + + kfree(cqs_set_operation->objs); + cqs_set_operation->objs = NULL; +} + +static int kbase_kcpu_cqs_set_operation_prepare( + struct kbase_kcpu_command_queue *kcpu_queue, + struct base_kcpu_command_cqs_set_operation_info *cqs_set_operation_info, + struct kbase_kcpu_command *current_command) +{ + struct kbase_context *const kctx = kcpu_queue->kctx; + struct base_cqs_set_operation_info *objs; + unsigned int nr_objs = cqs_set_operation_info->nr_objs; + + lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + + if (nr_objs > BASEP_KCPU_CQS_MAX_NUM_OBJS) + return -EINVAL; + + if (!nr_objs) + return -EINVAL; + + objs = kcalloc(nr_objs, sizeof(*objs), GFP_KERNEL); + if (!objs) + return -ENOMEM; + + if (copy_from_user(objs, u64_to_user_ptr(cqs_set_operation_info->objs), + nr_objs * sizeof(*objs))) { + kfree(objs); + return -ENOMEM; + } + + current_command->type = BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION; + current_command->info.cqs_set_operation.nr_objs = nr_objs; + current_command->info.cqs_set_operation.objs = objs; return 0; } @@ -1365,6 +1621,28 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, &cmd->info.cqs_set); break; + case BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: + status = kbase_kcpu_cqs_wait_operation_process(kbdev, queue, + &cmd->info.cqs_wait_operation); + + if (!status && !ignore_waits) { + process_next = false; + } else { + /* Either all CQS objects were signaled or + * there was an error or the queue itself is + * being deleted. + * In all cases can move to the next command. + * TBD: handle the error + */ + cleanup_cqs_wait_operation(queue, &cmd->info.cqs_wait_operation); + } + + break; + case BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: + kbase_kcpu_cqs_set_operation_process(kbdev, queue, + &cmd->info.cqs_set_operation); + + break; case BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: /* Clear the queue's error state */ queue->has_error = false; @@ -1404,7 +1682,7 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, queue->kctx, NULL, cmd->info.import.gpu_va); kbase_gpu_vm_unlock(queue->kctx); - if (ret == false) { + if (!ret) { queue->has_error = true; dev_warn(kbdev->dev, "failed to release the reference. resource not found\n"); @@ -1425,7 +1703,7 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, queue->kctx, NULL, cmd->info.import.gpu_va); kbase_gpu_vm_unlock(queue->kctx); - if (ret == false) { + if (!ret) { queue->has_error = true; dev_warn(kbdev->dev, "failed to release the reference. 
resource not found\n"); @@ -1591,6 +1869,16 @@ static void KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_COMMAND( } break; } + case BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: + { + /* GPUCORE-28172 RDT to review */ + break; + } + case BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: + { + /* GPUCORE-28172 RDT to review */ + break; + } case BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER(kbdev, queue); @@ -1758,6 +2046,14 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, ret = kbase_kcpu_cqs_set_prepare(queue, &command.info.cqs_set, kcpu_cmd); break; + case BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: + ret = kbase_kcpu_cqs_wait_operation_prepare(queue, + &command.info.cqs_wait_operation, kcpu_cmd); + break; + case BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: + ret = kbase_kcpu_cqs_set_operation_prepare(queue, + &command.info.cqs_set_operation, kcpu_cmd); + break; case BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: kcpu_cmd->type = BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER; ret = 0; diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.h b/mali_kbase/csf/mali_kbase_csf_kcpu.h index a528572..86aa7dc 100644 --- a/mali_kbase/csf/mali_kbase_csf_kcpu.h +++ b/mali_kbase/csf/mali_kbase_csf_kcpu.h @@ -69,13 +69,10 @@ struct kbase_kcpu_command_fence_info { * @objs: Array of structures which define CQS objects to be used by * the kcpu command. * @nr_objs: Number of CQS objects in the array. - * @propagate_flags: Bit-pattern for the CQSs in the array that are set - * to propagate queue error-state to the flagged CQSs. */ struct kbase_kcpu_command_cqs_set_info { struct base_cqs_set *objs; unsigned int nr_objs; - u32 propagate_flags; }; /** @@ -99,6 +96,36 @@ struct kbase_kcpu_command_cqs_wait_info { }; /** + * struct kbase_kcpu_command_cqs_set_operation_info - Structure which holds information + * about CQS objects for the kcpu CQS timeline set command + * + * @objs: Array of structures which define CQS timeline objects to be used by + * the kcpu command. + * @nr_objs: Number of CQS objects in the array. + */ +struct kbase_kcpu_command_cqs_set_operation_info { + struct base_cqs_set_operation_info *objs; + unsigned int nr_objs; +}; + +/** + * struct kbase_kcpu_command_cqs_wait_operation_info - Structure which holds information + * about CQS objects for the kcpu CQS timeline wait command + * + * @objs: Array of structures which define CQS timeline objects to be used by + * the kcpu command. + * @signaled: Bit array used to report the status of the CQS wait objects. + * 1 is signaled, 0 otherwise. + * @nr_objs: Number of CQS objects in the array. 
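The new BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION command added above waits on a CQS value using a comparison rather than a plain signal. A compact sketch of the wait condition implemented by kbase_kcpu_cqs_wait_operation_process() (standalone helper for illustration only; parameter types are simplified):

/* Illustrative restatement of the CQS wait-operation condition. */
static bool example_cqs_wait_operation_signalled(u64 current_value,
						 u64 wait_value,
						 u32 operation)
{
	switch (operation) {
	case BASEP_CQS_WAIT_OPERATION_LE:
		return current_value <= wait_value; /* wait until value <= target */
	case BASEP_CQS_WAIT_OPERATION_GT:
		return current_value > wait_value;  /* wait until value > target */
	default:
		return false; /* unsupported operations flag a queue error */
	}
}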
+ */ +struct kbase_kcpu_command_cqs_wait_operation_info { + struct base_cqs_wait_operation_info *objs; + unsigned long *signaled; + unsigned int nr_objs; + u32 inherit_err_flags; +}; + +/** * struct kbase_kcpu_command_jit_alloc_info - Structure which holds information * needed for the kcpu command for jit allocations * @@ -200,6 +227,8 @@ struct kbase_kcpu_command { struct kbase_kcpu_command_fence_info fence; struct kbase_kcpu_command_cqs_wait_info cqs_wait; struct kbase_kcpu_command_cqs_set_info cqs_set; + struct kbase_kcpu_command_cqs_wait_operation_info cqs_wait_operation; + struct kbase_kcpu_command_cqs_set_operation_info cqs_set_operation; struct kbase_kcpu_command_import_info import; struct kbase_kcpu_command_jit_alloc_info jit_alloc; struct kbase_kcpu_command_jit_free_info jit_free; diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu_debugfs.c b/mali_kbase/csf/mali_kbase_csf_kcpu_debugfs.c index 5c2e8e3..d59e77c 100644 --- a/mali_kbase/csf/mali_kbase_csf_kcpu_debugfs.c +++ b/mali_kbase/csf/mali_kbase_csf_kcpu_debugfs.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -95,7 +95,7 @@ static void kbasep_csf_kcpu_debugfs_print_queue(struct seq_file *file, struct kbase_sync_fence_info info; kbase_sync_fence_info_get(cmd->info.fence.fence, &info); - seq_printf(file, ", Fence %p %s %s", + seq_printf(file, ", Fence %pK %s %s", info.fence, info.name, kbase_sync_status_string(info.status)); break; diff --git a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c index b59ffd4..e8da0f3 100644 --- a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c +++ b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c @@ -307,6 +307,31 @@ static void kbase_csf_dump_firmware_trace_buffer(struct kbase_device *kbdev) kfree(buf); } +/** + * kbase_csf_hwcnt_on_reset_error() - Sets HWCNT to appropriate state in the + * event of an error during GPU reset. + * @kbdev: Pointer to KBase device + */ +static void kbase_csf_hwcnt_on_reset_error(struct kbase_device *kbdev) +{ + unsigned long flags; + + /* Treat this as an unrecoverable error for HWCNT */ + kbase_hwcnt_backend_csf_on_unrecoverable_error(&kbdev->hwcnt_gpu_iface); + + /* Re-enable counters to ensure matching enable/disable pair. + * This might reduce the hwcnt disable count to 0, and therefore + * trigger actual re-enabling of hwcnt. + * However, as the backend is now in the unrecoverable error state, + * re-enabling will immediately fail and put the context into the error + * state, preventing the hardware from being touched (which could have + * risked a hang). 
+ */ + kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + kbase_csf_scheduler_spin_unlock(kbdev, flags); +} + static int kbase_csf_reset_gpu_now(struct kbase_device *kbdev, bool firmware_inited, bool silent) { @@ -396,8 +421,10 @@ static int kbase_csf_reset_gpu_now(struct kbase_device *kbdev, mutex_unlock(&kbdev->pm.lock); - if (WARN_ON(err)) - goto error; + if (WARN_ON(err)) { + kbase_csf_hwcnt_on_reset_error(kbdev); + return err; + } mutex_lock(&kbdev->mmu_hw_mutex); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); @@ -414,40 +441,20 @@ static int kbase_csf_reset_gpu_now(struct kbase_device *kbdev, err = kbase_pm_wait_for_desired_state(kbdev); mutex_unlock(&kbdev->pm.lock); - if (err) - goto error; + if (WARN_ON(err)) { + kbase_csf_hwcnt_on_reset_error(kbdev); + return err; + } /* Re-enable GPU hardware counters */ - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_csf_scheduler_spin_lock(kbdev, &flags); kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + kbase_csf_scheduler_spin_unlock(kbdev, flags); if (!silent) dev_err(kbdev->dev, "Reset complete"); return 0; -error: - WARN_ON(!err); - - /* If hardware init failed, we assume hardware counters will - * not work and put the backend into the unrecoverable error - * state. - */ - kbase_hwcnt_backend_csf_on_unrecoverable_error(&kbdev->hwcnt_gpu_iface); - - /* Re-enable counters to ensure matching enable/disable pair. - * This might reduce the hwcnt disable count to 0, and therefore - * trigger actual re-enabling of hwcnt. - * However, as the backend is now in the unrecoverable error state, - * re-enabling will immediately fail and put the context into the error - * state, preventing the hardware from being touched (which could have - * risked a hang). 
- */ - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - return err; } static void kbase_csf_reset_gpu_worker(struct work_struct *data) @@ -484,25 +491,29 @@ static void kbase_csf_reset_gpu_worker(struct work_struct *data) kbase_csf_reset_end_hw_access(kbdev, err, firmware_inited); } -bool kbase_prepare_to_reset_gpu(struct kbase_device *kbdev) +bool kbase_prepare_to_reset_gpu(struct kbase_device *kbdev, unsigned int flags) { + if (flags & RESET_FLAGS_HWC_UNRECOVERABLE_ERROR) + kbase_hwcnt_backend_csf_on_unrecoverable_error( + &kbdev->hwcnt_gpu_iface); + if (atomic_cmpxchg(&kbdev->csf.reset.state, KBASE_CSF_RESET_GPU_NOT_PENDING, KBASE_CSF_RESET_GPU_PREPARED) != - KBASE_CSF_RESET_GPU_NOT_PENDING) { + KBASE_CSF_RESET_GPU_NOT_PENDING) /* Some other thread is already resetting the GPU */ return false; - } return true; } KBASE_EXPORT_TEST_API(kbase_prepare_to_reset_gpu); -bool kbase_prepare_to_reset_gpu_locked(struct kbase_device *kbdev) +bool kbase_prepare_to_reset_gpu_locked(struct kbase_device *kbdev, + unsigned int flags) { lockdep_assert_held(&kbdev->hwaccess_lock); - return kbase_prepare_to_reset_gpu(kbdev); + return kbase_prepare_to_reset_gpu(kbdev, flags); } void kbase_reset_gpu(struct kbase_device *kbdev) diff --git a/mali_kbase/csf/mali_kbase_csf_scheduler.c b/mali_kbase/csf/mali_kbase_csf_scheduler.c index b9dc59c..84d6f81 100644 --- a/mali_kbase/csf/mali_kbase_csf_scheduler.c +++ b/mali_kbase/csf/mali_kbase_csf_scheduler.c @@ -28,8 +28,8 @@ #include "../tl/mali_kbase_tracepoints.h" #include "backend/gpu/mali_kbase_pm_internal.h" #include <linux/export.h> -#include "mali_gpu_csf_registers.h" -#include <mali_base_kernel.h> +#include <uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h> +#include <uapi/gpu/arm/midgard/mali_base_kernel.h> /* Value to indicate that a queue group is not groups_to_schedule list */ #define KBASEP_GROUP_PREPARED_SEQ_NUM_INVALID (U32_MAX) @@ -373,6 +373,45 @@ static bool queue_group_scheduled_locked(struct kbase_queue_group *group) } /** + * scheduler_wait_protm_quit() - Wait for GPU to exit protected mode. + * + * @kbdev: Pointer to the GPU device + * + * This function waits for the GPU to exit protected mode which is confirmed + * when active_protm_grp is set to NULL. + */ +static void scheduler_wait_protm_quit(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + long wt = kbase_csf_timeout_in_jiffies(kbdev->csf.fw_timeout_ms); + long remaining; + + lockdep_assert_held(&scheduler->lock); + + remaining = wait_event_timeout(kbdev->csf.event_wait, + !kbase_csf_scheduler_protected_mode_in_use(kbdev), wt); + + if (!remaining) + dev_warn(kbdev->dev, "Timeout, protm_quit wait skipped"); +} + +/** + * scheduler_force_protm_exit() - Force GPU to exit protected mode. + * + * @kbdev: Pointer to the GPU device + * + * This function sends a ping request to the firmware and waits for the GPU + * to exit protected mode. + */ +static void scheduler_force_protm_exit(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->csf.scheduler.lock); + + kbase_csf_firmware_ping(kbdev); + scheduler_wait_protm_quit(kbdev); +} + +/** * scheduler_timer_is_enabled_nolock() - Check if the scheduler wakes up * automatically for periodic tasks. 
* @@ -607,7 +646,7 @@ static int halt_stream_sync(struct kbase_queue *queue) if (!remaining) { dev_warn(kbdev->dev, "Timed out waiting for queue to start on csi %d bound to group %d on slot %d", csi_index, group->handle, group->csg_nr); - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); return -ETIMEDOUT; @@ -629,26 +668,14 @@ static int halt_stream_sync(struct kbase_queue *queue) (CS_ACK_STATE_GET(kbase_csf_firmware_cs_output(stream, CS_ACK)) == CS_ACK_STATE_STOP), remaining); - /* Queues that have failed to stop in time shall raise a fatal error - * as their group would fail to suspend which could no longer be safely - * resumed. - */ if (!remaining) { - unsigned long flags; - dev_warn(kbdev->dev, "Timed out waiting for queue to stop on csi %d bound to group %d on slot %d", queue->csi_index, group->handle, group->csg_nr); - spin_lock_irqsave(&kbdev->csf.scheduler.interrupt_lock, flags); - kbase_csf_add_queue_fatal_error( - queue, GPU_EXCEPTION_TYPE_SW_FAULT_2, 0); - spin_unlock_irqrestore(&kbdev->csf.scheduler.interrupt_lock, - flags); - /* TODO GPUCORE-25328: The CSG can't be terminated, the GPU * will be reset as a work-around. */ - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); } return (remaining) ? 0 : -ETIMEDOUT; @@ -722,27 +749,6 @@ static int sched_halt_stream(struct kbase_queue *queue) } } retry: - /* First wait for the group to reach a stable state. IDLE state is - * an intermediate state that is only set by Scheduler at the start - * of a tick (prior to scanout) for groups that received idle - * notification, then later the idle group is moved to one of the - * suspended states or the runnable state. - */ - while (group->run_state == KBASE_CSF_GROUP_IDLE) { - mutex_unlock(&scheduler->lock); - remaining = wait_event_timeout(kbdev->csf.event_wait, - group->run_state != - KBASE_CSF_GROUP_IDLE, - kbdev->csf.fw_timeout_ms); - mutex_lock(&scheduler->lock); - if (!remaining) { - dev_warn(kbdev->dev, - "Timed out waiting for state change of Group-%d when stopping a queue on csi %d", - group->handle, queue->csi_index); - } - } - - WARN_ON(group->run_state == KBASE_CSF_GROUP_IDLE); /* Update the group state so that it can get scheduled soon */ update_idle_suspended_group_state(group); @@ -1559,7 +1565,7 @@ static void update_offslot_non_idle_cnt_on_grp_suspend( lockdep_assert_held(&scheduler->lock); - if (scheduler->state == SCHED_BUSY || scheduler->apply_async_protm) { + if (scheduler->state == SCHED_BUSY) { /* active phase or, async entering the protected mode */ if (group->prepared_seq_num >= scheduler->non_idle_scanout_grps) { @@ -1731,7 +1737,6 @@ static bool cleanup_csg_slot(struct kbase_queue_group *group) /* The csg does not need cleanup other than drop its AS */ spin_lock_irqsave(&kctx->kbdev->hwaccess_lock, flags); as_fault = kbase_ctx_flag(kctx, KCTX_AS_DISABLED_ON_FAULT); - WARN_ON(kctx->mmu_flush_pend_state != KCTX_MMU_FLUSH_NOT_PEND); kbase_ctx_sched_release_ctx(kctx); if (unlikely(group->faulted)) as_fault = true; @@ -1779,11 +1784,12 @@ static void update_csg_slot_priority(struct kbase_queue_group *group, u8 prio) csg_slot = &kbdev->csf.scheduler.csg_slots[slot]; ginfo = &kbdev->csf.global_iface.groups[slot]; + /* CSGs remaining on-slot can be either idle or runnable. + * This also applies in protected mode. 
+ */ WARN_ON(!((group->run_state == KBASE_CSF_GROUP_RUNNABLE) || (group->run_state == KBASE_CSF_GROUP_IDLE))); - group->run_state = KBASE_CSF_GROUP_RUNNABLE; - /* Update consumes a group from scanout */ update_offslot_non_idle_cnt_for_onslot_grp(group); @@ -1858,12 +1864,11 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot, spin_lock_irqsave(&kbdev->hwaccess_lock, flags); kbase_ctx_sched_retain_ctx(kctx); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - kbase_mmu_deferred_flush_invalidate(kctx); mutex_unlock(&kbdev->mmu_hw_mutex); if (kctx->as_nr == KBASEP_AS_NR_INVALID) { - dev_dbg(kbdev->dev, "Could not get a valid AS for group %d of context %d_%d on slot %d\n", - group->handle, kctx->tgid, kctx->id, slot); + dev_warn(kbdev->dev, "Could not get a valid AS for group %d of context %d_%d on slot %d\n", + group->handle, kctx->tgid, kctx->id, slot); return; } @@ -1896,6 +1901,7 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot, kbase_csf_firmware_csg_input(ginfo, CSG_ALLOW_OTHER, tiler_mask & U32_MAX); + ep_cfg = CSG_EP_REQ_COMPUTE_EP_SET(ep_cfg, compute_max); ep_cfg = CSG_EP_REQ_FRAGMENT_EP_SET(ep_cfg, fragment_max); ep_cfg = CSG_EP_REQ_TILER_EP_SET(ep_cfg, tiler_max); @@ -2043,7 +2049,7 @@ static int term_group_sync(struct kbase_queue_group *group) if (!remaining) { dev_warn(kbdev->dev, "term request timed out for group %d on slot %d", group->handle, group->csg_nr); - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); err = -ETIMEDOUT; } @@ -2112,9 +2118,10 @@ static int scheduler_group_schedule(struct kbase_queue_group *group) { struct kbase_context *kctx = group->kctx; struct kbase_device *kbdev = kctx->kbdev; + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; lockdep_assert_held(&kctx->csf.lock); - lockdep_assert_held(&kbdev->csf.scheduler.lock); + lockdep_assert_held(&scheduler->lock); KBASE_KTRACE_ADD_CSF_GRP(kbdev, GROUP_SCHEDULE, group, group->run_state); if (group->run_state == KBASE_CSF_GROUP_SUSPENDED_ON_WAIT_SYNC) @@ -2125,8 +2132,39 @@ static int scheduler_group_schedule(struct kbase_queue_group *group) if (group->run_state == KBASE_CSF_GROUP_SUSPENDED_ON_IDLE) update_idle_suspended_group_state(group); - else + else { + struct kbase_queue_group *protm_grp; + unsigned long flags; + + WARN_ON(!kbasep_csf_scheduler_group_is_on_slot_locked( + group)); + group->run_state = KBASE_CSF_GROUP_RUNNABLE; + + /* A normal mode CSG could be idle onslot during + * protected mode. In this case clear the + * appropriate bit in csg_slots_idle_mask. + */ + spin_lock_irqsave(&scheduler->interrupt_lock, flags); + protm_grp = scheduler->active_protm_grp; + if (protm_grp && protm_grp != group) + clear_bit((unsigned int)group->csg_nr, + scheduler->csg_slots_idle_mask); + spin_unlock_irqrestore(&scheduler->interrupt_lock, + flags); + + /* If GPU is in protected mode then any doorbells rang + * would have no effect. Check if GPU is in protected + * mode and if this group has higher priority than the + * active protected mode group. If so prompt the FW + * to exit protected mode. 
+ */ + if (protm_grp && + group->scan_seq_num < protm_grp->scan_seq_num) { + /* Prompt the FW to exit protected mode */ + scheduler_force_protm_exit(kbdev); + } + } } else if (!queue_group_scheduled_locked(group)) { insert_group_to_runnable(&kbdev->csf.scheduler, group, KBASE_CSF_GROUP_RUNNABLE); @@ -2511,7 +2549,7 @@ static void program_suspending_csg_slots(struct kbase_device *kbdev) */ dev_warn( kbdev->dev, - "Group %p on slot %u failed to suspend\n", + "Group %pK on slot %u failed to suspend\n", (void *)group, i); /* The group has failed suspension, stop @@ -2541,11 +2579,13 @@ static void program_suspending_csg_slots(struct kbase_device *kbdev) if (WARN_ON(i == num_groups)) break; program_vacant_csg_slot(kbdev, (s8)i); - if (WARN_ON(!csg_slot_in_use(kbdev, (int)i))) + if (!csg_slot_in_use(kbdev, (int)i)) { + dev_warn(kbdev->dev, "Couldn't use CSG slot %d despite being vacant", i); break; + } } } else { - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); } } @@ -2611,7 +2651,7 @@ static void wait_csg_slots_start(struct kbase_device *kbdev) dev_warn(kbdev->dev, "Timed out waiting for CSG slots to start, slots: 0x%*pb\n", num_groups, slot_mask); - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); break; } @@ -3287,7 +3327,8 @@ static void scheduler_handle_idle_slots(struct kbase_device *kbdev) continue; if (WARN_ON(!group)) continue; - if (WARN_ON(group->run_state != KBASE_CSF_GROUP_RUNNABLE)) + if (WARN_ON(group->run_state != KBASE_CSF_GROUP_RUNNABLE && + group->run_state != KBASE_CSF_GROUP_IDLE)) continue; if (WARN_ON(group->priority >= KBASE_QUEUE_GROUP_PRIORITY_COUNT)) continue; @@ -3295,7 +3336,8 @@ static void scheduler_handle_idle_slots(struct kbase_device *kbdev) if (group_on_slot_is_idle(kbdev, i)) { group->run_state = KBASE_CSF_GROUP_IDLE; set_bit(i, scheduler->csg_slots_idle_mask); - } + } else + group->run_state = KBASE_CSF_GROUP_RUNNABLE; } bitmap_or(scheduler->csg_slots_idle_mask, @@ -3381,7 +3423,7 @@ static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev, dev_warn(kbdev->dev, "Timed out waiting for CSG slots to suspend on power down, slot_mask: 0x%*pb\n", kbdev->csf.global_iface.group_num, slot_mask); - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); if (is_suspend) { @@ -3526,21 +3568,6 @@ static int scheduler_prepare(struct kbase_device *kbdev) return 0; } -static void scheduler_wait_protm_quit(struct kbase_device *kbdev) -{ - struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; - long wt = kbase_csf_timeout_in_jiffies(kbdev->csf.fw_timeout_ms); - long remaining; - - lockdep_assert_held(&scheduler->lock); - - remaining = wait_event_timeout(kbdev->csf.event_wait, - !kbase_csf_scheduler_protected_mode_in_use(kbdev), wt); - - if (!remaining) - dev_warn(kbdev->dev, "Timeout, protm_quit wait skipped"); -} - static void scheduler_handle_idle_timer_onoff(struct kbase_device *kbdev) { struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; @@ -3572,6 +3599,8 @@ static void schedule_actions(struct kbase_device *kbdev) unsigned long flags; struct kbase_queue_group *protm_grp; int ret; + bool skip_idle_slots_update; + bool new_protm_top_grp = false; kbase_reset_gpu_assert_prevented(kbdev); lockdep_assert_held(&scheduler->lock); @@ -3582,7 +3611,14 @@ static void schedule_actions(struct kbase_device *kbdev) return; } - 
scheduler_handle_idle_slots(kbdev); + spin_lock_irqsave(&scheduler->interrupt_lock, flags); + skip_idle_slots_update = kbase_csf_scheduler_protected_mode_in_use(kbdev); + spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); + + /* Skip updating on-slot idle CSGs if GPU is in protected mode. */ + if (!skip_idle_slots_update) + scheduler_handle_idle_slots(kbdev); + scheduler_prepare(kbdev); spin_lock_irqsave(&scheduler->interrupt_lock, flags); protm_grp = scheduler->active_protm_grp; @@ -3613,12 +3649,12 @@ static void schedule_actions(struct kbase_device *kbdev) scheduler->top_grp->kctx->tgid, scheduler->top_grp->kctx->id); - /* Due to GPUCORE-24491 only the top-group is allowed - * to be on slot and all other on slot groups have to - * be suspended before entering protected mode. - * This would change in GPUCORE-24492. + /* When entering protected mode all CSG slots can be occupied + * but only the protected mode CSG will be running. Any event + * that would trigger the execution of an on-slot idle CSG will + * need to be handled by the host during protected mode. */ - scheduler->num_csg_slots_for_tick = 1; + new_protm_top_grp = true; } spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); @@ -3635,12 +3671,12 @@ static void schedule_actions(struct kbase_device *kbdev) * locked in the secure mode. */ if (protm_grp) - scheduler_wait_protm_quit(kbdev); + scheduler_force_protm_exit(kbdev); wait_csg_slots_start(kbdev); wait_csg_slots_finish_prio_update(kbdev); - if (scheduler->num_csg_slots_for_tick == 1) { + if (new_protm_top_grp) { scheduler_group_check_protm_enter(kbdev, scheduler->top_grp); } @@ -3913,8 +3949,7 @@ void kbase_csf_scheduler_reset(struct kbase_device *kbdev) WARN_ON(!kbase_reset_gpu_is_active(kbdev)); KBASE_KTRACE_ADD(kbdev, SCHEDULER_RESET, NULL, 0u); - if (!kbase_csf_scheduler_protected_mode_in_use(kbdev) && - !suspend_active_queue_groups_on_reset(kbdev)) { + if (!suspend_active_queue_groups_on_reset(kbdev)) { /* As all groups have been successfully evicted from the CSG * slots, clear out thee scheduler data fields and return */ @@ -4002,21 +4037,14 @@ static void firmware_aliveness_monitor(struct work_struct *work) kbase_pm_wait_for_desired_state(kbdev); - err = kbase_csf_firmware_ping(kbdev); + err = kbase_csf_firmware_ping_wait(kbdev); if (err) { - /* FW not responding means hardware counters will stop working. - * Put the backend into the unrecoverable error state to cause - * current and subsequent counter operations to immediately - * fail, avoiding the risk of a hang. - */ - kbase_hwcnt_backend_csf_on_unrecoverable_error( - &kbdev->hwcnt_gpu_iface); - /* It is acceptable to enqueue a reset whilst we've prevented * them, it will happen after we've allowed them again */ - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu( + kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); } else if (get_nr_active_csgs(kbdev) == 1) { queue_delayed_work(system_long_wq, @@ -4132,7 +4160,9 @@ static bool group_sync_updated(struct kbase_queue_group *group) bool updated = false; int stream; - WARN_ON(group->run_state != KBASE_CSF_GROUP_SUSPENDED_ON_WAIT_SYNC); + /* Groups can also be blocked on-slot during protected mode. 
*/ + WARN_ON(group->run_state != KBASE_CSF_GROUP_SUSPENDED_ON_WAIT_SYNC && + group->run_state != KBASE_CSF_GROUP_IDLE); for (stream = 0; stream < MAX_SUPPORTED_STREAMS_PER_GROUP; ++stream) { struct kbase_queue *const queue = group->bound_queues[stream]; @@ -4233,40 +4263,159 @@ void kbase_csf_scheduler_group_protm_enter(struct kbase_queue_group *group) mutex_lock(&scheduler->lock); - /* Check if the group is now eligible for execution in protected mode - * and accordingly undertake full scheduling actions as due to - * GPUCORE-24491 the on slot groups other than the top group have to - * be suspended first before entering protected mode. - */ - if (scheduler_get_protm_enter_async_group(kbdev, group)) { - scheduler->apply_async_protm = true; - schedule_actions(kbdev); - scheduler->apply_async_protm = false; - } + /* Check if the group is now eligible for execution in protected mode. */ + if (scheduler_get_protm_enter_async_group(kbdev, group)) + scheduler_group_check_protm_enter(kbdev, group); mutex_unlock(&scheduler->lock); kbase_reset_gpu_allow(kbdev); } /** + * check_sync_update_for_idle_group_protm() - Check the sync wait condition + * for all the queues bound to + * the given group. + * + * @group: Pointer to the group that requires evaluation. + * + * This function is called if the GPU is in protected mode and there are on + * slot idle groups with higher priority than the active protected mode group. + * This function will evaluate the sync condition, if any, of all the queues + * bound to the given group. + * + * Return true if the sync condition of at least one queue has been satisfied. + */ +static bool check_sync_update_for_idle_group_protm( + struct kbase_queue_group *group) +{ + struct kbase_device *const kbdev = group->kctx->kbdev; + struct kbase_csf_scheduler *const scheduler = + &kbdev->csf.scheduler; + bool sync_update_done = false; + int i; + + lockdep_assert_held(&scheduler->lock); + + for (i = 0; i < MAX_SUPPORTED_STREAMS_PER_GROUP; i++) { + struct kbase_queue *queue = group->bound_queues[i]; + + if (queue && queue->enabled && !sync_update_done) { + struct kbase_csf_cmd_stream_group_info *const ginfo = + &kbdev->csf.global_iface.groups[group->csg_nr]; + struct kbase_csf_cmd_stream_info *const stream = + &ginfo->streams[queue->csi_index]; + u32 status = kbase_csf_firmware_cs_output( + stream, CS_STATUS_WAIT); + unsigned long flags; + + if (!CS_STATUS_WAIT_SYNC_WAIT_GET(status)) + continue; + + /* Save the information of sync object of the command + * queue so the callback function, 'group_sync_updated' + * can evaluate the sync object when it gets updated + * later. 
+ */ + queue->status_wait = status; + queue->sync_ptr = kbase_csf_firmware_cs_output( + stream, CS_STATUS_WAIT_SYNC_POINTER_LO); + queue->sync_ptr |= (u64)kbase_csf_firmware_cs_output( + stream, CS_STATUS_WAIT_SYNC_POINTER_HI) << 32; + queue->sync_value = kbase_csf_firmware_cs_output( + stream, CS_STATUS_WAIT_SYNC_VALUE); + + if (!evaluate_sync_update(queue)) + continue; + + /* Update csg_slots_idle_mask and group's run_state */ + spin_lock_irqsave(&scheduler->interrupt_lock, flags); + clear_bit((unsigned int)group->csg_nr, + scheduler->csg_slots_idle_mask); + spin_unlock_irqrestore(&scheduler->interrupt_lock, + flags); + group->run_state = KBASE_CSF_GROUP_RUNNABLE; + + KBASE_KTRACE_ADD_CSF_GRP(kbdev, GROUP_SYNC_UPDATE_DONE, group, 0u); + sync_update_done = true; + } + } + + return sync_update_done; +} + +/** + * check_sync_update_for_idle_groups_protm() - Check the sync wait condition + * for the idle groups on slot + * during protected mode. + * + * @kbdev: Pointer to the GPU device + * + * This function checks the gpu queues of all the idle groups on slot during + * protected mode that has a higher priority than the active protected mode + * group. + * + * Return true if the sync condition of at least one queue in a group has been + * satisfied. + */ +static bool check_sync_update_for_idle_groups_protm(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + struct kbase_queue_group *protm_grp; + bool exit_protm = false; + unsigned long flags; + u32 num_groups; + u32 i; + + lockdep_assert_held(&scheduler->lock); + + spin_lock_irqsave(&scheduler->interrupt_lock, flags); + protm_grp = scheduler->active_protm_grp; + spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); + + if (!protm_grp) + return exit_protm; + + num_groups = kbdev->csf.global_iface.group_num; + + for_each_set_bit(i, scheduler->csg_slots_idle_mask, num_groups) { + struct kbase_csf_csg_slot *csg_slot = + &scheduler->csg_slots[i]; + struct kbase_queue_group *group = csg_slot->resident_group; + + if (group->scan_seq_num < protm_grp->scan_seq_num) { + /* If sync update has been performed for the group that + * has a higher priority than the protm group, then we + * need to exit protected mode. + */ + if (check_sync_update_for_idle_group_protm(group)) + exit_protm = true; + } + } + + return exit_protm; +} + +/** * check_group_sync_update_worker() - Check the sync wait condition for all the * blocked queue groups * * @work: Pointer to the context-specific work item for evaluating the wait * condition for all the queue groups in idle_wait_groups list. * - * This function checks the gpu queues of all the groups present in - * idle_wait_groups list of a context. If the sync wait condition - * for at least one queue bound to the group has been satisfied then - * the group is moved to the per context list of runnable groups so - * that Scheduler can consider scheduling the group in next tick. + * This function checks the gpu queues of all the groups present in both + * idle_wait_groups list of a context and all on slot idle groups (if GPU + * is in protected mode). + * If the sync wait condition for at least one queue bound to the group has + * been satisfied then the group is moved to the per context list of + * runnable groups so that Scheduler can consider scheduling the group + * in next tick or exit protected mode. 
*/ static void check_group_sync_update_worker(struct work_struct *work) { struct kbase_context *const kctx = container_of(work, struct kbase_context, csf.sched.sync_update_work); - struct kbase_csf_scheduler *const scheduler = - &kctx->kbdev->csf.scheduler; + struct kbase_device *const kbdev = kctx->kbdev; + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; mutex_lock(&scheduler->lock); @@ -4280,13 +4429,16 @@ static void check_group_sync_update_worker(struct work_struct *work) * groups list of the context. */ update_idle_suspended_group_state(group); - KBASE_KTRACE_ADD_CSF_GRP(kctx->kbdev, GROUP_SYNC_UPDATE_DONE, group, 0u); + KBASE_KTRACE_ADD_CSF_GRP(kbdev, GROUP_SYNC_UPDATE_DONE, group, 0u); } } } else { WARN_ON(!list_empty(&kctx->csf.sched.idle_wait_groups)); } + if (check_sync_update_for_idle_groups_protm(kbdev)) + scheduler_force_protm_exit(kbdev); + mutex_unlock(&scheduler->lock); } @@ -4402,7 +4554,6 @@ int kbase_csf_scheduler_init(struct kbase_device *kbdev) scheduler->tock_pending_request = false; scheduler->active_protm_grp = NULL; scheduler->gpu_idle_fw_timer_enabled = false; - scheduler->apply_async_protm = false; scheduler->csg_scheduling_period_ms = CSF_SCHEDULER_TIME_TICK_MS; scheduler_doorbell_init(kbdev); diff --git a/mali_kbase/csf/mali_kbase_csf_scheduler.h b/mali_kbase/csf/mali_kbase_csf_scheduler.h index 20d1bc9..1607ff6 100644 --- a/mali_kbase/csf/mali_kbase_csf_scheduler.h +++ b/mali_kbase/csf/mali_kbase_csf_scheduler.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -125,7 +125,7 @@ struct kbase_queue_group *kbase_csf_scheduler_get_group_on_slot( * kbase_csf_scheduler_group_deschedule() - Deschedule a GPU command queue * group from the firmware. * - * @group: Pointer to the queue group to be scheduled. + * @group: Pointer to the queue group to be descheduled. * * This function would disable the scheduling of GPU command queue group on * firmware. @@ -174,7 +174,7 @@ int kbase_csf_scheduler_context_init(struct kbase_context *kctx); int kbase_csf_scheduler_init(struct kbase_device *kbdev); /** - * kbase_csf_scheduler_context_init() - Terminate the context-specific part + * kbase_csf_scheduler_context_term() - Terminate the context-specific part * for CSF scheduler. * * @kctx: Pointer to kbase context that is being terminated. diff --git a/mali_kbase/csf/mali_kbase_csf_tiler_heap.c b/mali_kbase/csf/mali_kbase_csf_tiler_heap.c index 0b4fb5a..9e4ed17 100644 --- a/mali_kbase/csf/mali_kbase_csf_tiler_heap.c +++ b/mali_kbase/csf/mali_kbase_csf_tiler_heap.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -596,14 +596,14 @@ int kbase_csf_tiler_heap_alloc_new_chunk(struct kbase_context *kctx, if (likely(heap)) { err = alloc_new_chunk(heap, nr_in_flight, pending_frag_count, new_chunk_ptr); - } - KBASE_TLSTREAM_AUX_TILER_HEAP_STATS( - kctx->kbdev, kctx->id, heap->heap_id, - PFN_UP(heap->chunk_size * heap->max_chunks), - PFN_UP(heap->chunk_size * heap->chunk_count), heap->max_chunks, - heap->chunk_size, heap->chunk_count, heap->target_in_flight, - nr_in_flight); + KBASE_TLSTREAM_AUX_TILER_HEAP_STATS( + kctx->kbdev, kctx->id, heap->heap_id, + PFN_UP(heap->chunk_size * heap->max_chunks), + PFN_UP(heap->chunk_size * heap->chunk_count), + heap->max_chunks, heap->chunk_size, heap->chunk_count, + heap->target_in_flight, nr_in_flight); + } mutex_unlock(&kctx->csf.tiler_heaps.lock); diff --git a/mali_kbase/csf/mali_kbase_csf_trace_buffer.c b/mali_kbase/csf/mali_kbase_csf_trace_buffer.c index 7e9eb75..afcc90b 100644 --- a/mali_kbase/csf/mali_kbase_csf_trace_buffer.c +++ b/mali_kbase/csf/mali_kbase_csf_trace_buffer.c @@ -289,10 +289,6 @@ int kbase_csf_firmware_parse_trace_buffer_entry(struct kbase_device *kbdev, trace_buffer->trace_enable_entry_count = entry[6]; trace_buffer->num_pages = trace_buffer_data[i].size; - /* Temporary workaround until handled by GPUCORE-27330 */ - if (!strcmp(trace_buffer_data[i].name, "timeline")) - trace_buffer->updatable = 0; - for (j = 0; j < CSF_FIRMWARE_TRACE_ENABLE_INIT_MASK_MAX; j++) { trace_buffer->trace_enable_init_mask[j] = trace_buffer_data[i].trace_enable_init_mask[j]; @@ -456,6 +452,7 @@ int kbase_csf_firmware_trace_buffer_update_trace_enable_bit( dev_warn( kbdev->dev, "GPU reset already in progress when enabling firmware timeline."); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); return -EAGAIN; } } diff --git a/mali_kbase/device/backend/mali_kbase_device_csf.c b/mali_kbase/device/backend/mali_kbase_device_csf.c index f657bcb..cb2c2e2 100644 --- a/mali_kbase/device/backend/mali_kbase_device_csf.c +++ b/mali_kbase/device/backend/mali_kbase_device_csf.c @@ -23,6 +23,7 @@ #include "../mali_kbase_device.h" #include <mali_kbase_hwaccess_backend.h> +#include <mali_kbase_hwcnt_backend_csf_if_fw.h> #include <mali_kbase_ctx_sched.h> #include <mali_kbase_reset_gpu.h> #include <csf/mali_kbase_csf.h> @@ -170,6 +171,77 @@ static void kbase_backend_late_term(struct kbase_device *kbdev) kbase_hwaccess_pm_term(kbdev); } +/** + * kbase_device_hwcnt_backend_csf_if_init - Create hardware counter backend + * firmware interface. + * @kbdev: Device pointer + */ +static int kbase_device_hwcnt_backend_csf_if_init(struct kbase_device *kbdev) +{ + return kbase_hwcnt_backend_csf_if_fw_create( + kbdev, &kbdev->hwcnt_backend_csf_if_fw); +} + +/** + * kbase_device_hwcnt_backend_csf_if_term - Terminate hardware counter backend + * firmware interface. + * @kbdev: Device pointer + */ +static void kbase_device_hwcnt_backend_csf_if_term(struct kbase_device *kbdev) +{ + kbase_hwcnt_backend_csf_if_fw_destroy(&kbdev->hwcnt_backend_csf_if_fw); +} + +/** + * kbase_device_hwcnt_backend_csf_init - Create hardware counter backend. 
+ * @kbdev: Device pointer + */ + +static int kbase_device_hwcnt_backend_csf_init(struct kbase_device *kbdev) +{ + return kbase_hwcnt_backend_csf_create( + &kbdev->hwcnt_backend_csf_if_fw, + KBASE_HWCNT_BACKEND_CSF_RING_BUFFER_COUNT, + &kbdev->hwcnt_gpu_iface); +} + +/** + * kbase_device_hwcnt_backend_csf_term - Terminate hardware counter backend. + * @kbdev: Device pointer + */ +static void kbase_device_hwcnt_backend_csf_term(struct kbase_device *kbdev) +{ + kbase_hwcnt_backend_csf_destroy(&kbdev->hwcnt_gpu_iface); +} + +/** + * kbase_device_hwcnt_backend_csf_metadata_init - Initialize hardware counter + * metadata. + * @kbdev: Device pointer + */ +static int +kbase_device_hwcnt_backend_csf_metadata_init(struct kbase_device *kbdev) +{ + /* For CSF GPUs, HWC metadata needs to query information from CSF + * firmware, so the initialization of HWC metadata only can be called + * after firmware initialized, but firmware initialization depends on + * HWC backend initialization, so we need to separate HWC backend + * metadata initialization from HWC backend initialization. + */ + return kbase_hwcnt_backend_csf_metadata_init(&kbdev->hwcnt_gpu_iface); +} + +/** + * kbase_device_hwcnt_backend_csf_metadata_term - Terminate hardware counter + * metadata. + * @kbdev: Device pointer + */ +static void +kbase_device_hwcnt_backend_csf_metadata_term(struct kbase_device *kbdev) +{ + kbase_hwcnt_backend_csf_metadata_term(&kbdev->hwcnt_gpu_iface); +} + static const struct kbase_device_init dev_init[] = { #ifdef CONFIG_MALI_NO_MALI {kbase_gpu_device_create, kbase_gpu_device_destroy, @@ -244,12 +316,10 @@ static const struct kbase_device_init dev_init[] = { * paragraph that starts with "Word of warning", currently the * second-last paragraph. */ - {kbase_sysfs_init, kbase_sysfs_term, "SysFS group creation failed"}, + {kbase_sysfs_init, kbase_sysfs_term, + "SysFS group creation failed"}, {kbase_device_misc_register, kbase_device_misc_deregister, "Misc device registration failed"}, -#ifdef CONFIG_MALI_BUSLOG - {buslog_init, buslog_term, "Bus log client registration failed"}, -#endif {kbase_gpuprops_populate_user_buffer, kbase_gpuprops_free_user_buffer, "GPU property population failed"}, #endif diff --git a/mali_kbase/device/backend/mali_kbase_device_hw_csf.c b/mali_kbase/device/backend/mali_kbase_device_hw_csf.c index 4d11a82..259e42a 100644 --- a/mali_kbase/device/backend/mali_kbase_device_hw_csf.c +++ b/mali_kbase/device/backend/mali_kbase_device_hw_csf.c @@ -69,17 +69,9 @@ static bool kbase_gpu_fault_interrupt(struct kbase_device *kbdev) if (!as_valid || (as_nr == MCU_AS_NR)) { kbase_report_gpu_fault(kbdev, status, as_nr, as_valid); - /* MCU bus fault could mean hardware counters will stop - * working. - * Put the backend into the unrecoverable error state to - * cause current and subsequent counter operations to - * immediately fail, avoiding the risk of a hang. - */ - kbase_hwcnt_backend_csf_on_unrecoverable_error( - &kbdev->hwcnt_gpu_iface); - dev_err(kbdev->dev, "GPU bus fault triggering gpu-reset ...\n"); - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu( + kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); } else { /* Handle Bus fault */ @@ -133,16 +125,8 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) } kbase_csf_scheduler_spin_unlock(kbdev, flags); - /* Protected fault means we're unlikely to have the counter - * operations we might do during reset acknowledged. 
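The static hardware-counter wrappers added to mali_kbase_device_csf.c above are thin on purpose: each one gives a setup or teardown step a uniform signature so it can slot into the table-driven dev_init[] sequence, and the metadata stage is ordered after firmware load because it queries the firmware. A minimal standalone sketch of that table pattern, with invented stage names rather than the driver's real callbacks, could look like this:

#include <stdio.h>

/* Illustrative only: stand-ins for the real kbase init/term callbacks. */
struct device_stage {
        int (*init)(void);
        void (*term)(void);
        const char *err_mes;
};

static int if_init(void)       { puts("hwcnt csf_if: create");  return 0; }
static void if_term(void)      { puts("hwcnt csf_if: destroy"); }
static int backend_init(void)  { puts("hwcnt backend: create"); return 0; }
static void backend_term(void) { puts("hwcnt backend: destroy"); }
static int fw_init(void)       { puts("firmware: load");        return 0; }
static void fw_term(void)      { puts("firmware: unload"); }
static int meta_init(void)     { puts("hwcnt metadata: init");  return 0; }
static void meta_term(void)    { puts("hwcnt metadata: term"); }

/* Metadata comes after firmware, mirroring the ordering in dev_init[]. */
static const struct device_stage stages[] = {
        { if_init,      if_term,      "csf_if creation failed" },
        { backend_init, backend_term, "backend creation failed" },
        { fw_init,      fw_term,      "firmware load failed" },
        { meta_init,    meta_term,    "metadata init failed" },
};

int main(void)
{
        size_t i, n = sizeof(stages) / sizeof(stages[0]);

        for (i = 0; i < n; i++) {
                if (stages[i].init()) {
                        fprintf(stderr, "%s\n", stages[i].err_mes);
                        while (i--)   /* unwind completed stages in reverse */
                                stages[i].term();
                        return 1;
                }
        }
        for (i = n; i--;)             /* full teardown, also in reverse */
                stages[i].term();
        return 0;
}

On failure the loop unwinds only the stages that completed, in reverse order, which is the role kbase_device_term_partial() plays for the real table.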
- * Put the backend into the unrecoverable error state to cause - * current and subsequent counter operations to immediately - * fail, avoiding the risk of a hang. - */ - kbase_hwcnt_backend_csf_on_unrecoverable_error( - &kbdev->hwcnt_gpu_iface); - - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu( + kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); } diff --git a/mali_kbase/device/backend/mali_kbase_device_jm.c b/mali_kbase/device/backend/mali_kbase_device_jm.c index 8052fba..9301310 100644 --- a/mali_kbase/device/backend/mali_kbase_device_jm.c +++ b/mali_kbase/device/backend/mali_kbase_device_jm.c @@ -21,6 +21,7 @@ #include "../mali_kbase_device_internal.h" #include "../mali_kbase_device.h" +#include "../mali_kbase_hwaccess_instr.h" #include <mali_kbase_config_defaults.h> #include <mali_kbase_hwaccess_backend.h> @@ -107,6 +108,7 @@ static int kbase_backend_late_init(struct kbase_device *kbdev) return 0; fail_update_l2_features: + kbase_backend_devfreq_term(kbdev); fail_devfreq_init: kbase_job_slot_term(kbdev); fail_job_slot: @@ -144,6 +146,16 @@ static void kbase_backend_late_term(struct kbase_device *kbdev) kbase_hwaccess_pm_term(kbdev); } +static int kbase_device_hwcnt_backend_jm_init(struct kbase_device *kbdev) +{ + return kbase_hwcnt_backend_jm_create(kbdev, &kbdev->hwcnt_gpu_iface); +} + +static void kbase_device_hwcnt_backend_jm_term(struct kbase_device *kbdev) +{ + kbase_hwcnt_backend_jm_destroy(&kbdev->hwcnt_gpu_iface); +} + static const struct kbase_device_init dev_init[] = { #ifdef CONFIG_MALI_NO_MALI {kbase_gpu_device_create, kbase_gpu_device_destroy, @@ -183,6 +195,8 @@ static const struct kbase_device_init dev_init[] = { {kbase_clk_rate_trace_manager_init, kbase_clk_rate_trace_manager_term, "Clock rate trace manager initialization failed"}, + {kbase_instr_backend_init, kbase_instr_backend_term, + "Instrumentation backend initialization failed"}, {kbase_device_hwcnt_backend_jm_init, kbase_device_hwcnt_backend_jm_term, "GPU hwcnt backend creation failed"}, @@ -215,9 +229,6 @@ static const struct kbase_device_init dev_init[] = { {kbase_sysfs_init, kbase_sysfs_term, "SysFS group creation failed"}, {kbase_device_misc_register, kbase_device_misc_deregister, "Misc device registration failed"}, -#ifdef CONFIG_MALI_BUSLOG - {buslog_init, buslog_term, "Bus log client registration failed"}, -#endif {kbase_gpuprops_populate_user_buffer, kbase_gpuprops_free_user_buffer, "GPU property population failed"}, #endif @@ -254,7 +265,8 @@ int kbase_device_init(struct kbase_device *kbdev) for (i = 0; i < ARRAY_SIZE(dev_init); i++) { err = dev_init[i].init(kbdev); if (err) { - dev_err(kbdev->dev, "%s error = %d\n", + if (err != -EPROBE_DEFER) + dev_err(kbdev->dev, "%s error = %d\n", dev_init[i].err_mes, err); kbase_device_term_partial(kbdev, i); break; diff --git a/mali_kbase/device/mali_kbase_device.c b/mali_kbase/device/mali_kbase_device.c index a90c8cd..5e900d0 100644 --- a/mali_kbase/device/mali_kbase_device.c +++ b/mali_kbase/device/mali_kbase_device.c @@ -40,9 +40,6 @@ #include <tl/mali_kbase_timeline.h> #include "mali_kbase_vinstr.h" -#if MALI_USE_CSF -#include <mali_kbase_hwcnt_backend_csf_if_fw.h> -#endif #include "mali_kbase_hwcnt_context.h" #include "mali_kbase_hwcnt_virtualizer.h" @@ -227,10 +224,6 @@ int kbase_device_misc_init(struct kbase_device * const kbdev) if (err) goto dma_set_mask_failed; -#if !MALI_USE_CSF - spin_lock_init(&kbdev->hwcnt.lock); -#endif - err = kbase_ktrace_init(kbdev); if (err) goto term_as; @@ -241,20 +234,11 @@ int 
kbase_device_misc_init(struct kbase_device * const kbdev) atomic_set(&kbdev->ctx_num, 0); -#if !MALI_USE_CSF - err = kbase_instr_backend_init(kbdev); - if (err) - goto term_trace; -#endif - kbdev->pm.dvfs_period = DEFAULT_PM_DVFS_PERIOD; kbdev->reset_timeout_ms = DEFAULT_RESET_TIMEOUT_MS; - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_AARCH64_MMU)) - kbdev->mmu_mode = kbase_mmu_mode_get_aarch64(); - else - kbdev->mmu_mode = kbase_mmu_mode_get_lpae(); + kbdev->mmu_mode = kbase_mmu_mode_get_aarch64(); mutex_init(&kbdev->kctx_list_lock); INIT_LIST_HEAD(&kbdev->kctx_list); @@ -263,11 +247,6 @@ int kbase_device_misc_init(struct kbase_device * const kbdev) return 0; -#if !MALI_USE_CSF -term_trace: - kbase_ktrace_term(kbdev); -#endif - term_as: kbase_device_all_as_term(kbdev); dma_set_mask_failed: @@ -285,10 +264,6 @@ void kbase_device_misc_term(struct kbase_device *kbdev) kbase_debug_assert_register_hook(NULL, NULL); #endif -#if !MALI_USE_CSF - kbase_instr_backend_term(kbdev); -#endif - kbase_ktrace_term(kbdev); kbase_device_all_as_term(kbdev); @@ -311,60 +286,6 @@ void kbase_increment_device_id(void) kbase_dev_nr++; } -#if MALI_USE_CSF - -int kbase_device_hwcnt_backend_csf_if_init(struct kbase_device *kbdev) -{ - return kbase_hwcnt_backend_csf_if_fw_create( - kbdev, &kbdev->hwcnt_backend_csf_if_fw); -} - -void kbase_device_hwcnt_backend_csf_if_term(struct kbase_device *kbdev) -{ - kbase_hwcnt_backend_csf_if_fw_destroy(&kbdev->hwcnt_backend_csf_if_fw); -} - -int kbase_device_hwcnt_backend_csf_init(struct kbase_device *kbdev) -{ - return kbase_hwcnt_backend_csf_create( - &kbdev->hwcnt_backend_csf_if_fw, - KBASE_HWCNT_BACKEND_CSF_RING_BUFFER_COUNT, - &kbdev->hwcnt_gpu_iface); -} - -void kbase_device_hwcnt_backend_csf_term(struct kbase_device *kbdev) -{ - kbase_hwcnt_backend_csf_destroy(&kbdev->hwcnt_gpu_iface); -} - -int kbase_device_hwcnt_backend_csf_metadata_init(struct kbase_device *kbdev) -{ - /* For CSF GPUs, HWC metadata needs to query informatoin from CSF - * firmware, so the initialization of HWC metadata only can be called - * after firmware initialised, but firmware initialization depends on - * HWC backend initialization, so we need to separate HWC backend - * metadata initialization from HWC backend initialization. - */ - return kbase_hwcnt_backend_csf_metadata_init(&kbdev->hwcnt_gpu_iface); -} - -void kbase_device_hwcnt_backend_csf_metadata_term(struct kbase_device *kbdev) -{ - kbase_hwcnt_backend_csf_metadata_term(&kbdev->hwcnt_gpu_iface); -} -#else - -int kbase_device_hwcnt_backend_jm_init(struct kbase_device *kbdev) -{ - return kbase_hwcnt_backend_jm_create(kbdev, &kbdev->hwcnt_gpu_iface); -} - -void kbase_device_hwcnt_backend_jm_term(struct kbase_device *kbdev) -{ - kbase_hwcnt_backend_jm_destroy(&kbdev->hwcnt_gpu_iface); -} -#endif /* MALI_USE_CSF */ - int kbase_device_hwcnt_context_init(struct kbase_device *kbdev) { return kbase_hwcnt_context_init(&kbdev->hwcnt_gpu_iface, @@ -484,7 +405,14 @@ int kbase_device_early_init(struct kbase_device *kbdev) /* We're done accessing the GPU registers for now. 
*/ kbase_pm_register_access_disable(kbdev); +#ifdef CONFIG_MALI_ARBITER_SUPPORT + if (kbdev->arb.arb_if) + err = kbase_arbiter_pm_install_interrupts(kbdev); + else + err = kbase_install_interrupts(kbdev); +#else err = kbase_install_interrupts(kbdev); +#endif if (err) goto fail_interrupts; diff --git a/mali_kbase/device/mali_kbase_device_internal.h b/mali_kbase/device/mali_kbase_device_internal.h index 2705e67..067f33c 100644 --- a/mali_kbase/device/mali_kbase_device_internal.h +++ b/mali_kbase/device/mali_kbase_device_internal.h @@ -42,18 +42,6 @@ void kbase_device_vinstr_term(struct kbase_device *kbdev); int kbase_device_timeline_init(struct kbase_device *kbdev); void kbase_device_timeline_term(struct kbase_device *kbdev); -#if MALI_USE_CSF -int kbase_device_hwcnt_backend_csf_init(struct kbase_device *kbdev); -void kbase_device_hwcnt_backend_csf_term(struct kbase_device *kbdev); -int kbase_device_hwcnt_backend_csf_if_init(struct kbase_device *kbdev); -void kbase_device_hwcnt_backend_csf_if_term(struct kbase_device *kbdev); -int kbase_device_hwcnt_backend_csf_metadata_init(struct kbase_device *kbdev); -void kbase_device_hwcnt_backend_csf_metadata_term(struct kbase_device *kbdev); -#else -int kbase_device_hwcnt_backend_jm_init(struct kbase_device *kbdev); -void kbase_device_hwcnt_backend_jm_term(struct kbase_device *kbdev); -#endif - int kbase_device_hwcnt_context_init(struct kbase_device *kbdev); void kbase_device_hwcnt_context_term(struct kbase_device *kbdev); diff --git a/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c b/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c index fa70afc..16eae0a 100644 --- a/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c +++ b/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -20,7 +20,7 @@ */ #include <mali_kbase.h> -#include "csf/mali_gpu_csf_registers.h" +#include <uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h> #include "../mali_kbase_gpu_fault.h" const char *kbase_gpu_exception_name(u32 const exception_code) diff --git a/mali_kbase/gpu/mali_kbase_gpu.h b/mali_kbase/gpu/mali_kbase_gpu.h deleted file mode 100644 index dba0e28..0000000 --- a/mali_kbase/gpu/mali_kbase_gpu.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU license. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. 
- * - */ - -#ifndef _KBASE_GPU_H_ -#define _KBASE_GPU_H_ - -#include "mali_kbase_gpu_regmap.h" -#include "mali_kbase_gpu_fault.h" -#include "mali_kbase_gpu_coherency.h" -#include "mali_kbase_gpu_id.h" - -#endif /* _KBASE_GPU_H_ */ diff --git a/mali_kbase/gpu/mali_kbase_gpu_regmap.h b/mali_kbase/gpu/mali_kbase_gpu_regmap.h index b7a566f..05a229d 100644 --- a/mali_kbase/gpu/mali_kbase_gpu_regmap.h +++ b/mali_kbase/gpu/mali_kbase_gpu_regmap.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2010-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -22,420 +22,12 @@ #ifndef _KBASE_GPU_REGMAP_H_ #define _KBASE_GPU_REGMAP_H_ -#include "mali_kbase_gpu_coherency.h" -#include "mali_kbase_gpu_id.h" -#if MALI_USE_CSF -#include "backend/mali_kbase_gpu_regmap_csf.h" -#else -#include "backend/mali_kbase_gpu_regmap_jm.h" -#endif - -/* Begin Register Offsets */ -/* GPU control registers */ - -#define GPU_CONTROL_BASE 0x0000 -#define GPU_CONTROL_REG(r) (GPU_CONTROL_BASE + (r)) -#define GPU_ID 0x000 /* (RO) GPU and revision identifier */ -#define L2_FEATURES 0x004 /* (RO) Level 2 cache features */ -#define TILER_FEATURES 0x00C /* (RO) Tiler Features */ -#define MEM_FEATURES 0x010 /* (RO) Memory system features */ -#define MMU_FEATURES 0x014 /* (RO) MMU features */ -#define AS_PRESENT 0x018 /* (RO) Address space slots present */ -#define GPU_IRQ_RAWSTAT 0x020 /* (RW) */ -#define GPU_IRQ_CLEAR 0x024 /* (WO) */ -#define GPU_IRQ_MASK 0x028 /* (RW) */ -#define GPU_IRQ_STATUS 0x02C /* (RO) */ - -#define GPU_COMMAND 0x030 /* (WO) */ -#define GPU_STATUS 0x034 /* (RO) */ - -#define GPU_DBGEN (1 << 8) /* DBGEN wire status */ - -#define GPU_FAULTSTATUS 0x03C /* (RO) GPU exception type and fault status */ -#define GPU_FAULTADDRESS_LO 0x040 /* (RO) GPU exception fault address, low word */ -#define GPU_FAULTADDRESS_HI 0x044 /* (RO) GPU exception fault address, high word */ - -#define L2_CONFIG 0x048 /* (RW) Level 2 cache configuration */ - -#define GROUPS_L2_COHERENT (1 << 0) /* Cores groups are l2 coherent */ -#define SUPER_L2_COHERENT (1 << 1) /* Shader cores within a core - * supergroup are l2 coherent - */ - -#define PWR_KEY 0x050 /* (WO) Power manager key register */ -#define PWR_OVERRIDE0 0x054 /* (RW) Power manager override settings */ -#define PWR_OVERRIDE1 0x058 /* (RW) Power manager override settings */ -#define GPU_FEATURES_LO 0x060 /* (RO) GPU features, low word */ -#define GPU_FEATURES_HI 0x064 /* (RO) GPU features, high word */ -#define CYCLE_COUNT_LO 0x090 /* (RO) Cycle counter, low word */ -#define CYCLE_COUNT_HI 0x094 /* (RO) Cycle counter, high word */ -#define TIMESTAMP_LO 0x098 /* (RO) Global time stamp counter, low word */ -#define TIMESTAMP_HI 0x09C /* (RO) Global time stamp counter, high word */ - -#define THREAD_MAX_THREADS 0x0A0 /* (RO) Maximum number of threads per core */ -#define THREAD_MAX_WORKGROUP_SIZE 0x0A4 /* (RO) Maximum workgroup size */ -#define THREAD_MAX_BARRIER_SIZE 0x0A8 /* (RO) Maximum threads waiting at a barrier */ -#define THREAD_FEATURES 0x0AC /* (RO) Thread features */ -#define THREAD_TLS_ALLOC 0x310 /* (RO) Number of threads per core that TLS must be allocated for */ - -#define TEXTURE_FEATURES_0 0x0B0 /* (RO) Support flags for indexed texture formats 0..31 */ -#define TEXTURE_FEATURES_1 0x0B4 /* (RO) Support flags for indexed texture formats 
32..63 */ -#define TEXTURE_FEATURES_2 0x0B8 /* (RO) Support flags for indexed texture formats 64..95 */ -#define TEXTURE_FEATURES_3 0x0BC /* (RO) Support flags for texture order */ - -#define TEXTURE_FEATURES_REG(n) GPU_CONTROL_REG(TEXTURE_FEATURES_0 + ((n) << 2)) - -#define SHADER_PRESENT_LO 0x100 /* (RO) Shader core present bitmap, low word */ -#define SHADER_PRESENT_HI 0x104 /* (RO) Shader core present bitmap, high word */ - -#define TILER_PRESENT_LO 0x110 /* (RO) Tiler core present bitmap, low word */ -#define TILER_PRESENT_HI 0x114 /* (RO) Tiler core present bitmap, high word */ - -#define L2_PRESENT_LO 0x120 /* (RO) Level 2 cache present bitmap, low word */ -#define L2_PRESENT_HI 0x124 /* (RO) Level 2 cache present bitmap, high word */ - -#define STACK_PRESENT_LO 0xE00 /* (RO) Core stack present bitmap, low word */ -#define STACK_PRESENT_HI 0xE04 /* (RO) Core stack present bitmap, high word */ - -#define SHADER_READY_LO 0x140 /* (RO) Shader core ready bitmap, low word */ -#define SHADER_READY_HI 0x144 /* (RO) Shader core ready bitmap, high word */ - -#define TILER_READY_LO 0x150 /* (RO) Tiler core ready bitmap, low word */ -#define TILER_READY_HI 0x154 /* (RO) Tiler core ready bitmap, high word */ - -#define L2_READY_LO 0x160 /* (RO) Level 2 cache ready bitmap, low word */ -#define L2_READY_HI 0x164 /* (RO) Level 2 cache ready bitmap, high word */ - -#define STACK_READY_LO 0xE10 /* (RO) Core stack ready bitmap, low word */ -#define STACK_READY_HI 0xE14 /* (RO) Core stack ready bitmap, high word */ - -#define SHADER_PWRON_LO 0x180 /* (WO) Shader core power on bitmap, low word */ -#define SHADER_PWRON_HI 0x184 /* (WO) Shader core power on bitmap, high word */ - -#define TILER_PWRON_LO 0x190 /* (WO) Tiler core power on bitmap, low word */ -#define TILER_PWRON_HI 0x194 /* (WO) Tiler core power on bitmap, high word */ - -#define L2_PWRON_LO 0x1A0 /* (WO) Level 2 cache power on bitmap, low word */ -#define L2_PWRON_HI 0x1A4 /* (WO) Level 2 cache power on bitmap, high word */ - -#define STACK_PWRON_LO 0xE20 /* (RO) Core stack power on bitmap, low word */ -#define STACK_PWRON_HI 0xE24 /* (RO) Core stack power on bitmap, high word */ - -#define SHADER_PWROFF_LO 0x1C0 /* (WO) Shader core power off bitmap, low word */ -#define SHADER_PWROFF_HI 0x1C4 /* (WO) Shader core power off bitmap, high word */ - -#define TILER_PWROFF_LO 0x1D0 /* (WO) Tiler core power off bitmap, low word */ -#define TILER_PWROFF_HI 0x1D4 /* (WO) Tiler core power off bitmap, high word */ - -#define L2_PWROFF_LO 0x1E0 /* (WO) Level 2 cache power off bitmap, low word */ -#define L2_PWROFF_HI 0x1E4 /* (WO) Level 2 cache power off bitmap, high word */ - -#define STACK_PWROFF_LO 0xE30 /* (RO) Core stack power off bitmap, low word */ -#define STACK_PWROFF_HI 0xE34 /* (RO) Core stack power off bitmap, high word */ - -#define SHADER_PWRTRANS_LO 0x200 /* (RO) Shader core power transition bitmap, low word */ -#define SHADER_PWRTRANS_HI 0x204 /* (RO) Shader core power transition bitmap, high word */ - -#define TILER_PWRTRANS_LO 0x210 /* (RO) Tiler core power transition bitmap, low word */ -#define TILER_PWRTRANS_HI 0x214 /* (RO) Tiler core power transition bitmap, high word */ - -#define L2_PWRTRANS_LO 0x220 /* (RO) Level 2 cache power transition bitmap, low word */ -#define L2_PWRTRANS_HI 0x224 /* (RO) Level 2 cache power transition bitmap, high word */ - -#define ASN_HASH_0 0x02C0 /* (RW) ASN hash function argument 0 */ -#define ASN_HASH(n) (ASN_HASH_0 + (n)*4) -#define ASN_HASH_COUNT 3 - -#define STACK_PWRTRANS_LO 0xE40 /* (RO) 
Core stack power transition bitmap, low word */ -#define STACK_PWRTRANS_HI 0xE44 /* (RO) Core stack power transition bitmap, high word */ - -#define SHADER_PWRACTIVE_LO 0x240 /* (RO) Shader core active bitmap, low word */ -#define SHADER_PWRACTIVE_HI 0x244 /* (RO) Shader core active bitmap, high word */ - -#define TILER_PWRACTIVE_LO 0x250 /* (RO) Tiler core active bitmap, low word */ -#define TILER_PWRACTIVE_HI 0x254 /* (RO) Tiler core active bitmap, high word */ - -#define L2_PWRACTIVE_LO 0x260 /* (RO) Level 2 cache active bitmap, low word */ -#define L2_PWRACTIVE_HI 0x264 /* (RO) Level 2 cache active bitmap, high word */ - -#define COHERENCY_FEATURES 0x300 /* (RO) Coherency features present */ -#define COHERENCY_ENABLE 0x304 /* (RW) Coherency enable */ - -#define SHADER_CONFIG 0xF04 /* (RW) Shader core configuration (implementation-specific) */ -#define TILER_CONFIG 0xF08 /* (RW) Tiler core configuration (implementation-specific) */ -#define L2_MMU_CONFIG 0xF0C /* (RW) L2 cache and MMU configuration (implementation-specific) */ - -/* Job control registers */ - -#define JOB_CONTROL_BASE 0x1000 - -#define JOB_CONTROL_REG(r) (JOB_CONTROL_BASE + (r)) - -#define JOB_IRQ_RAWSTAT 0x000 /* Raw interrupt status register */ -#define JOB_IRQ_CLEAR 0x004 /* Interrupt clear register */ -#define JOB_IRQ_MASK 0x008 /* Interrupt mask register */ -#define JOB_IRQ_STATUS 0x00C /* Interrupt status register */ - -/* MMU control registers */ - -#define MEMORY_MANAGEMENT_BASE 0x2000 -#define MMU_REG(r) (MEMORY_MANAGEMENT_BASE + (r)) - -#define MMU_IRQ_RAWSTAT 0x000 /* (RW) Raw interrupt status register */ -#define MMU_IRQ_CLEAR 0x004 /* (WO) Interrupt clear register */ -#define MMU_IRQ_MASK 0x008 /* (RW) Interrupt mask register */ -#define MMU_IRQ_STATUS 0x00C /* (RO) Interrupt status register */ - -#define MMU_AS0 0x400 /* Configuration registers for address space 0 */ -#define MMU_AS1 0x440 /* Configuration registers for address space 1 */ -#define MMU_AS2 0x480 /* Configuration registers for address space 2 */ -#define MMU_AS3 0x4C0 /* Configuration registers for address space 3 */ -#define MMU_AS4 0x500 /* Configuration registers for address space 4 */ -#define MMU_AS5 0x540 /* Configuration registers for address space 5 */ -#define MMU_AS6 0x580 /* Configuration registers for address space 6 */ -#define MMU_AS7 0x5C0 /* Configuration registers for address space 7 */ -#define MMU_AS8 0x600 /* Configuration registers for address space 8 */ -#define MMU_AS9 0x640 /* Configuration registers for address space 9 */ -#define MMU_AS10 0x680 /* Configuration registers for address space 10 */ -#define MMU_AS11 0x6C0 /* Configuration registers for address space 11 */ -#define MMU_AS12 0x700 /* Configuration registers for address space 12 */ -#define MMU_AS13 0x740 /* Configuration registers for address space 13 */ -#define MMU_AS14 0x780 /* Configuration registers for address space 14 */ -#define MMU_AS15 0x7C0 /* Configuration registers for address space 15 */ - -/* MMU address space control registers */ - -#define MMU_AS_REG(n, r) (MMU_REG(MMU_AS0 + ((n) << 6)) + (r)) - -#define AS_TRANSTAB_LO 0x00 /* (RW) Translation Table Base Address for address space n, low word */ -#define AS_TRANSTAB_HI 0x04 /* (RW) Translation Table Base Address for address space n, high word */ -#define AS_MEMATTR_LO 0x08 /* (RW) Memory attributes for address space n, low word. */ -#define AS_MEMATTR_HI 0x0C /* (RW) Memory attributes for address space n, high word. 
*/ -#define AS_LOCKADDR_LO 0x10 /* (RW) Lock region address for address space n, low word */ -#define AS_LOCKADDR_HI 0x14 /* (RW) Lock region address for address space n, high word */ -#define AS_COMMAND 0x18 /* (WO) MMU command register for address space n */ -#define AS_FAULTSTATUS 0x1C /* (RO) MMU fault status register for address space n */ -#define AS_FAULTADDRESS_LO 0x20 /* (RO) Fault Address for address space n, low word */ -#define AS_FAULTADDRESS_HI 0x24 /* (RO) Fault Address for address space n, high word */ -#define AS_STATUS 0x28 /* (RO) Status flags for address space n */ - -/* (RW) Translation table configuration for address space n, low word */ -#define AS_TRANSCFG_LO 0x30 -/* (RW) Translation table configuration for address space n, high word */ -#define AS_TRANSCFG_HI 0x34 -/* (RO) Secondary fault address for address space n, low word */ -#define AS_FAULTEXTRA_LO 0x38 -/* (RO) Secondary fault address for address space n, high word */ -#define AS_FAULTEXTRA_HI 0x3C - -/* End Register Offsets */ +#include <uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h> /* Include POWER_CHANGED_SINGLE in debug builds for use in irq latency test. */ #ifdef CONFIG_MALI_DEBUG +#undef GPU_IRQ_REG_ALL #define GPU_IRQ_REG_ALL (GPU_IRQ_REG_COMMON | POWER_CHANGED_SINGLE) -#else /* CONFIG_MALI_DEBUG */ -#define GPU_IRQ_REG_ALL (GPU_IRQ_REG_COMMON) #endif /* CONFIG_MALI_DEBUG */ -/* - * MMU_IRQ_RAWSTAT register values. Values are valid also for - * MMU_IRQ_CLEAR, MMU_IRQ_MASK, MMU_IRQ_STATUS registers. - */ - -#define MMU_PAGE_FAULT_FLAGS 16 - -/* Macros returning a bitmask to retrieve page fault or bus error flags from - * MMU registers - */ -#define MMU_PAGE_FAULT(n) (1UL << (n)) -#define MMU_BUS_ERROR(n) (1UL << ((n) + MMU_PAGE_FAULT_FLAGS)) - -/* - * Begin LPAE MMU TRANSTAB register values - */ -#define AS_TRANSTAB_LPAE_ADDR_SPACE_MASK 0xfffff000 -#define AS_TRANSTAB_LPAE_ADRMODE_UNMAPPED (0u << 0) -#define AS_TRANSTAB_LPAE_ADRMODE_IDENTITY (1u << 1) -#define AS_TRANSTAB_LPAE_ADRMODE_TABLE (3u << 0) -#define AS_TRANSTAB_LPAE_READ_INNER (1u << 2) -#define AS_TRANSTAB_LPAE_SHARE_OUTER (1u << 4) - -#define AS_TRANSTAB_LPAE_ADRMODE_MASK 0x00000003 - -/* - * Begin AARCH64 MMU TRANSTAB register values - */ -#define MMU_HW_OUTA_BITS 40 -#define AS_TRANSTAB_BASE_MASK ((1ULL << MMU_HW_OUTA_BITS) - (1ULL << 4)) - -/* - * Begin MMU STATUS register values - */ -#define AS_STATUS_AS_ACTIVE 0x01 - -#define AS_FAULTSTATUS_EXCEPTION_CODE_MASK (0x7<<3) -#define AS_FAULTSTATUS_EXCEPTION_CODE_TRANSLATION_FAULT (0x0<<3) -#define AS_FAULTSTATUS_EXCEPTION_CODE_PERMISSION_FAULT (0x1<<3) -#define AS_FAULTSTATUS_EXCEPTION_CODE_TRANSTAB_BUS_FAULT (0x2<<3) -#define AS_FAULTSTATUS_EXCEPTION_CODE_ACCESS_FLAG (0x3<<3) -#define AS_FAULTSTATUS_EXCEPTION_CODE_ADDRESS_SIZE_FAULT (0x4<<3) -#define AS_FAULTSTATUS_EXCEPTION_CODE_MEMORY_ATTRIBUTES_FAULT (0x5<<3) - -#define AS_FAULTSTATUS_EXCEPTION_TYPE_SHIFT 0 -#define AS_FAULTSTATUS_EXCEPTION_TYPE_MASK (0xFF << AS_FAULTSTATUS_EXCEPTION_TYPE_SHIFT) -#define AS_FAULTSTATUS_EXCEPTION_TYPE_GET(reg_val) \ - (((reg_val)&AS_FAULTSTATUS_EXCEPTION_TYPE_MASK) >> AS_FAULTSTATUS_EXCEPTION_TYPE_SHIFT) -#define AS_FAULTSTATUS_EXCEPTION_TYPE_TRANSLATION_FAULT_0 0xC0 - -#define AS_FAULTSTATUS_ACCESS_TYPE_SHIFT 8 -#define AS_FAULTSTATUS_ACCESS_TYPE_MASK (0x3 << AS_FAULTSTATUS_ACCESS_TYPE_SHIFT) -#define AS_FAULTSTATUS_ACCESS_TYPE_GET(reg_val) \ - (((reg_val)&AS_FAULTSTATUS_ACCESS_TYPE_MASK) >> AS_FAULTSTATUS_ACCESS_TYPE_SHIFT) - -#define AS_FAULTSTATUS_ACCESS_TYPE_ATOMIC (0x0) -#define 
AS_FAULTSTATUS_ACCESS_TYPE_EX (0x1) -#define AS_FAULTSTATUS_ACCESS_TYPE_READ (0x2) -#define AS_FAULTSTATUS_ACCESS_TYPE_WRITE (0x3) - -#define AS_FAULTSTATUS_SOURCE_ID_SHIFT 16 -#define AS_FAULTSTATUS_SOURCE_ID_MASK (0xFFFF << AS_FAULTSTATUS_SOURCE_ID_SHIFT) -#define AS_FAULTSTATUS_SOURCE_ID_GET(reg_val) \ - (((reg_val)&AS_FAULTSTATUS_SOURCE_ID_MASK) >> AS_FAULTSTATUS_SOURCE_ID_SHIFT) - -/* - * Begin MMU TRANSCFG register values - */ -#define AS_TRANSCFG_ADRMODE_LEGACY 0 -#define AS_TRANSCFG_ADRMODE_UNMAPPED 1 -#define AS_TRANSCFG_ADRMODE_IDENTITY 2 -#define AS_TRANSCFG_ADRMODE_AARCH64_4K 6 -#define AS_TRANSCFG_ADRMODE_AARCH64_64K 8 - -#define AS_TRANSCFG_ADRMODE_MASK 0xF - -/* - * Begin TRANSCFG register values - */ -#define AS_TRANSCFG_PTW_MEMATTR_MASK (3ull << 24) -#define AS_TRANSCFG_PTW_MEMATTR_NON_CACHEABLE (1ull << 24) -#define AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK (2ull << 24) - -#define AS_TRANSCFG_PTW_SH_MASK ((3ull << 28)) -#define AS_TRANSCFG_PTW_SH_OS (2ull << 28) -#define AS_TRANSCFG_PTW_SH_IS (3ull << 28) -#define AS_TRANSCFG_R_ALLOCATE (1ull << 30) - -/* - * Begin Command Values - */ - -/* AS_COMMAND register commands */ -#define AS_COMMAND_NOP 0x00 /* NOP Operation */ -#define AS_COMMAND_UPDATE 0x01 /* Broadcasts the values in AS_TRANSTAB and ASn_MEMATTR to all MMUs */ -#define AS_COMMAND_LOCK 0x02 /* Issue a lock region command to all MMUs */ -#define AS_COMMAND_UNLOCK 0x03 /* Issue a flush region command to all MMUs */ -/* Flush all L2 caches then issue a flush region command to all MMUs - * (deprecated - only for use with T60x) - */ -#define AS_COMMAND_FLUSH 0x04 -/* Flush all L2 caches then issue a flush region command to all MMUs */ -#define AS_COMMAND_FLUSH_PT 0x04 -/* Wait for memory accesses to complete, flush all the L1s cache then flush all - * L2 caches then issue a flush region command to all MMUs - */ -#define AS_COMMAND_FLUSH_MEM 0x05 - -/* GPU_STATUS values */ -#define GPU_STATUS_PRFCNT_ACTIVE (1 << 2) /* Set if the performance counters are active. */ -#define GPU_STATUS_CYCLE_COUNT_ACTIVE (1 << 6) /* Set if the cycle counter is active. */ -#define GPU_STATUS_PROTECTED_MODE_ACTIVE (1 << 7) /* Set if protected mode is active */ - -/* PRFCNT_CONFIG register values */ -#define PRFCNT_CONFIG_MODE_SHIFT 0 /* Counter mode position. */ -#define PRFCNT_CONFIG_AS_SHIFT 4 /* Address space bitmap position. */ -#define PRFCNT_CONFIG_SETSELECT_SHIFT 8 /* Set select position. */ - -/* The performance counters are disabled. */ -#define PRFCNT_CONFIG_MODE_OFF 0 -/* The performance counters are enabled, but are only written out when a - * PRFCNT_SAMPLE command is issued using the GPU_COMMAND register. - */ -#define PRFCNT_CONFIG_MODE_MANUAL 1 -/* The performance counters are enabled, and are written out each time a tile - * finishes rendering. - */ -#define PRFCNT_CONFIG_MODE_TILE 2 - -/* AS<n>_MEMATTR values from MMU_MEMATTR_STAGE1: */ -/* Use GPU implementation-defined caching policy. */ -#define AS_MEMATTR_IMPL_DEF_CACHE_POLICY 0x88ull -/* The attribute set to force all resources to be cached. */ -#define AS_MEMATTR_FORCE_TO_CACHE_ALL 0x8Full -/* Inner write-alloc cache setup, no outer caching */ -#define AS_MEMATTR_WRITE_ALLOC 0x8Dull - -/* Use GPU implementation-defined caching policy. */ -#define AS_MEMATTR_LPAE_IMPL_DEF_CACHE_POLICY 0x48ull -/* The attribute set to force all resources to be cached. 
*/ -#define AS_MEMATTR_LPAE_FORCE_TO_CACHE_ALL 0x4Full -/* Inner write-alloc cache setup, no outer caching */ -#define AS_MEMATTR_LPAE_WRITE_ALLOC 0x4Dull -/* Set to implementation defined, outer caching */ -#define AS_MEMATTR_LPAE_OUTER_IMPL_DEF 0x88ull -/* Set to write back memory, outer caching */ -#define AS_MEMATTR_LPAE_OUTER_WA 0x8Dull -/* There is no LPAE support for non-cacheable, since the memory type is always - * write-back. - * Marking this setting as reserved for LPAE - */ -#define AS_MEMATTR_LPAE_NON_CACHEABLE_RESERVED - -/* L2_MMU_CONFIG register */ -#define L2_MMU_CONFIG_ALLOW_SNOOP_DISPARITY_SHIFT (23) -#define L2_MMU_CONFIG_ALLOW_SNOOP_DISPARITY (0x1 << L2_MMU_CONFIG_ALLOW_SNOOP_DISPARITY_SHIFT) - -/* End L2_MMU_CONFIG register */ - -/* THREAD_* registers */ - -/* THREAD_FEATURES IMPLEMENTATION_TECHNOLOGY values */ -#define IMPLEMENTATION_UNSPECIFIED 0 -#define IMPLEMENTATION_SILICON 1 -#define IMPLEMENTATION_FPGA 2 -#define IMPLEMENTATION_MODEL 3 - -/* Default values when registers are not supported by the implemented hardware */ -#define THREAD_MT_DEFAULT 256 -#define THREAD_MWS_DEFAULT 256 -#define THREAD_MBS_DEFAULT 256 -#define THREAD_MR_DEFAULT 1024 -#define THREAD_MTQ_DEFAULT 4 -#define THREAD_MTGS_DEFAULT 10 - -/* End THREAD_* registers */ - -/* SHADER_CONFIG register */ -#define SC_LS_ALLOW_ATTR_TYPES (1ul << 16) -#define SC_TLS_HASH_ENABLE (1ul << 17) -#define SC_LS_ATTR_CHECK_DISABLE (1ul << 18) -#define SC_VAR_ALGORITHM (1ul << 29) -/* End SHADER_CONFIG register */ - -/* TILER_CONFIG register */ -#define TC_CLOCK_GATE_OVERRIDE (1ul << 0) -/* End TILER_CONFIG register */ - -/* L2_CONFIG register */ -#define L2_CONFIG_SIZE_SHIFT 16 -#define L2_CONFIG_SIZE_MASK (0xFFul << L2_CONFIG_SIZE_SHIFT) -#define L2_CONFIG_HASH_SHIFT 24 -#define L2_CONFIG_HASH_MASK (0xFFul << L2_CONFIG_HASH_SHIFT) -#define L2_CONFIG_ASN_HASH_ENABLE_SHIFT 24 -#define L2_CONFIG_ASN_HASH_ENABLE_MASK (1ul << L2_CONFIG_ASN_HASH_ENABLE_SHIFT) -/* End L2_CONFIG register */ - -/* IDVS_GROUP register */ -#define IDVS_GROUP_SIZE_SHIFT (16) -#define IDVS_GROUP_MAX_SIZE (0x3F) - #endif /* _KBASE_GPU_REGMAP_H_ */ diff --git a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_jm.c b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_jm.c index d7648cd..00c0f60 100644 --- a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_jm.c +++ b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_jm.c @@ -23,7 +23,9 @@ #include "mali_kbase_ipa_counter_common_jm.h" #include "mali_kbase.h" - +#ifdef CONFIG_MALI_NO_MALI +#include <backend/gpu/mali_kbase_model_dummy.h> +#endif /* Performance counter blocks base offsets */ #define JM_BASE (0 * KBASE_IPA_NR_BYTES_PER_BLOCK) @@ -94,10 +96,15 @@ static u32 kbase_g7x_power_model_get_memsys_counter(struct kbase_ipa_model_vinst static u32 kbase_g7x_power_model_get_sc_counter(struct kbase_ipa_model_vinstr_data *model_data, u32 counter_block_offset) { +#ifdef CONFIG_MALI_NO_MALI + const u32 sc_base = MEMSYS_BASE + + (KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS * + KBASE_IPA_NR_BYTES_PER_BLOCK); +#else const u32 sc_base = MEMSYS_BASE + (model_data->kbdev->gpu_props.props.l2_props.num_l2_slices * KBASE_IPA_NR_BYTES_PER_BLOCK); - +#endif return sc_base + counter_block_offset; } diff --git a/mali_kbase/jm/mali_kbase_jm_js.h b/mali_kbase/jm/mali_kbase_jm_js.h index 06adb36..e327536 100644 --- a/mali_kbase/jm/mali_kbase_jm_js.h +++ b/mali_kbase/jm/mali_kbase_jm_js.h @@ -657,7 +657,7 @@ static inline bool kbasep_js_is_submit_allowed( test_bit = (u16) (1u << kctx->as_nr); is_allowed = (bool) 
(js_devdata->runpool_irq.submit_allowed & test_bit); - dev_dbg(kctx->kbdev->dev, "JS: submit %s allowed on %p (as=%d)", + dev_dbg(kctx->kbdev->dev, "JS: submit %s allowed on %pK (as=%d)", is_allowed ? "is" : "isn't", (void *)kctx, kctx->as_nr); return is_allowed; } @@ -684,7 +684,7 @@ static inline void kbasep_js_set_submit_allowed( set_bit = (u16) (1u << kctx->as_nr); - dev_dbg(kctx->kbdev->dev, "JS: Setting Submit Allowed on %p (as=%d)", + dev_dbg(kctx->kbdev->dev, "JS: Setting Submit Allowed on %pK (as=%d)", kctx, kctx->as_nr); js_devdata->runpool_irq.submit_allowed |= set_bit; @@ -715,7 +715,7 @@ static inline void kbasep_js_clear_submit_allowed( clear_bit = (u16) (1u << kctx->as_nr); clear_mask = ~clear_bit; - dev_dbg(kctx->kbdev->dev, "JS: Clearing Submit Allowed on %p (as=%d)", + dev_dbg(kctx->kbdev->dev, "JS: Clearing Submit Allowed on %pK (as=%d)", kctx, kctx->as_nr); js_devdata->runpool_irq.submit_allowed &= clear_mask; diff --git a/mali_kbase/jm/mali_kbase_js_defs.h b/mali_kbase/jm/mali_kbase_js_defs.h index 997cd49..183f0b0 100644 --- a/mali_kbase/jm/mali_kbase_js_defs.h +++ b/mali_kbase/jm/mali_kbase_js_defs.h @@ -171,7 +171,8 @@ enum { * Internal atom priority defines for kbase_jd_atom::sched_prio */ enum { - KBASE_JS_ATOM_SCHED_PRIO_REALTIME = 0, + KBASE_JS_ATOM_SCHED_PRIO_FIRST = 0, + KBASE_JS_ATOM_SCHED_PRIO_REALTIME = KBASE_JS_ATOM_SCHED_PRIO_FIRST, KBASE_JS_ATOM_SCHED_PRIO_HIGH, KBASE_JS_ATOM_SCHED_PRIO_MED, KBASE_JS_ATOM_SCHED_PRIO_LOW, diff --git a/mali_kbase/mali_base_hwconfig_features.h b/mali_kbase/mali_base_hwconfig_features.h index d6f31cf..bdc769f 100644 --- a/mali_kbase/mali_base_hwconfig_features.h +++ b/mali_kbase/mali_base_hwconfig_features.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2014-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2021 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -49,7 +49,6 @@ enum base_hw_feature { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, @@ -85,7 +84,6 @@ static const enum base_hw_feature base_hw_features_tMIx[] = { BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_END }; @@ -112,7 +110,6 @@ static const enum base_hw_feature base_hw_features_tHEx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_END }; @@ -139,7 +136,6 @@ static const enum base_hw_feature base_hw_features_tSIx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_END }; @@ -166,7 +162,6 @@ static const enum base_hw_feature base_hw_features_tDVx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_END }; @@ -193,7 +188,6 @@ static const enum base_hw_feature base_hw_features_tNOx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END @@ -222,7 +216,6 @@ static const enum base_hw_feature base_hw_features_tGOx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END @@ -250,7 +243,6 @@ static const enum base_hw_feature base_hw_features_tTRx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END @@ -278,7 +270,6 @@ static const enum base_hw_feature base_hw_features_tNAx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END @@ -306,7 +297,6 @@ static const enum base_hw_feature base_hw_features_tBEx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, @@ -335,7 +325,6 @@ static const enum base_hw_feature base_hw_features_tBAx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, @@ -364,7 +353,6 @@ static const enum base_hw_feature base_hw_features_tDUx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, @@ -393,7 +381,6 @@ static const enum base_hw_feature 
base_hw_features_tODx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, - BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END diff --git a/mali_kbase/mali_base_hwconfig_issues.h b/mali_kbase/mali_base_hwconfig_issues.h index 0afabb1..a61eeb2 100644 --- a/mali_kbase/mali_base_hwconfig_issues.h +++ b/mali_kbase/mali_base_hwconfig_issues.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2014-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software diff --git a/mali_kbase/mali_kbase.h b/mali_kbase/mali_kbase.h index a78ff43..b6683b9 100644 --- a/mali_kbase/mali_kbase.h +++ b/mali_kbase/mali_kbase.h @@ -45,7 +45,7 @@ #include <linux/workqueue.h> #include <linux/interrupt.h> -#include "mali_base_kernel.h" +#include <uapi/gpu/arm/midgard/mali_base_kernel.h> #include <mali_kbase_linux.h> /* @@ -64,7 +64,7 @@ #include "mali_kbase_gpu_memory_debugfs.h" #include "mali_kbase_mem_profile_debugfs.h" #include "mali_kbase_gpuprops.h" -#include "mali_kbase_ioctl.h" +#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> #if !MALI_USE_CSF #include "mali_kbase_debug_job_fault.h" #include "mali_kbase_jd_debugfs.h" @@ -213,10 +213,6 @@ void registers_unmap(struct kbase_device *kbdev); int kbase_device_coherency_init(struct kbase_device *kbdev); -#ifdef CONFIG_MALI_BUSLOG -int buslog_init(struct kbase_device *kbdev); -void buslog_term(struct kbase_device *kbdev); -#endif #if !MALI_USE_CSF int kbase_jd_init(struct kbase_context *kctx); diff --git a/mali_kbase/mali_kbase_cache_policy.h b/mali_kbase/mali_kbase_cache_policy.h index 817710a..2cd3079 100644 --- a/mali_kbase/mali_kbase_cache_policy.h +++ b/mali_kbase/mali_kbase_cache_policy.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2012-2013, 2015, 2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2012-2013, 2015, 2020-2021 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -27,7 +27,7 @@ #define _KBASE_CACHE_POLICY_H_ #include "mali_kbase.h" -#include "mali_base_kernel.h" +#include <uapi/gpu/arm/midgard/mali_base_kernel.h> /** * kbase_cache_enabled - Choose the cache policy for a specific region diff --git a/mali_kbase/mali_kbase_core_linux.c b/mali_kbase/mali_kbase_core_linux.c index 4e5155a..96fcbcd 100644 --- a/mali_kbase/mali_kbase_core_linux.c +++ b/mali_kbase/mali_kbase_core_linux.c @@ -53,7 +53,7 @@ #include <mali_kbase_hwaccess_instr.h> #endif #include <mali_kbase_reset_gpu.h> -#include "mali_kbase_ioctl.h" +#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> #if !MALI_USE_CSF #include "mali_kbase_kinstr_jm.h" #endif @@ -1150,10 +1150,7 @@ static int kbase_api_mem_alias(struct kbase_context *kctx, u64 flags; int err; - if (alias->in.nents == 0 || alias->in.nents > 2048) - return -EINVAL; - - if (alias->in.stride > (U64_MAX / 2048)) + if (alias->in.nents == 0 || alias->in.nents > BASE_MEM_ALIAS_MAX_ENTS) return -EINVAL; ai = vmalloc(sizeof(*ai) * alias->in.nents); @@ -1357,18 +1354,6 @@ static int kbase_api_sticky_resource_unmap(struct kbase_context *kctx, } #if MALI_UNIT_TEST -static int kbase_api_tlstream_test(struct kbase_context *kctx, - struct kbase_ioctl_tlstream_test *test) -{ - kbase_timeline_test( - kctx->kbdev, - test->tpw_count, - test->msg_delay, - test->msg_count, - test->aux_msg); - - return 0; -} static int kbase_api_tlstream_stats(struct kbase_context *kctx, struct kbase_ioctl_tlstream_stats *stats) @@ -1508,14 +1493,11 @@ static int kbase_ioctl_cs_get_glb_iface(struct kbase_context *kctx, } if (!err) { - param->out.total_stream_num = - kbase_csf_firmware_get_glb_iface(kctx->kbdev, - group_data, max_group_num, - stream_data, max_total_stream_num, - ¶m->out.glb_version, ¶m->out.features, - ¶m->out.group_num, ¶m->out.prfcnt_size); - - param->out.padding = 0; + param->out.total_stream_num = kbase_csf_firmware_get_glb_iface( + kctx->kbdev, group_data, max_group_num, stream_data, + max_total_stream_num, ¶m->out.glb_version, + ¶m->out.features, ¶m->out.group_num, + ¶m->out.prfcnt_size, ¶m->out.instr_features); if (copy_to_user(user_groups, group_data, MIN(max_group_num, param->out.group_num) * @@ -1619,6 +1601,23 @@ static int kbasep_ioctl_context_priority_check(struct kbase_context *kctx, return ret; \ } while (0) +static int kbasep_ioctl_set_limited_core_count(struct kbase_context *kctx, + struct kbase_ioctl_set_limited_core_count *set_limited_core_count) +{ + const u64 shader_core_mask = + kbase_pm_get_present_cores(kctx->kbdev, KBASE_PM_CORE_SHADER); + const u64 limited_core_mask = + ((u64)1 << (set_limited_core_count->max_core_count)) - 1; + + if ((shader_core_mask & limited_core_mask) == 0) { + /* At least one shader core must be available after applying the mask */ + return -EINVAL; + } + + kctx->limited_core_mask = limited_core_mask; + return 0; +} + static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct kbase_file *const kfile = filp->private_data; @@ -1980,12 +1979,6 @@ static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) break; #endif /* MALI_USE_CSF */ #if MALI_UNIT_TEST - case KBASE_IOCTL_TLSTREAM_TEST: - KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_TLSTREAM_TEST, - kbase_api_tlstream_test, - struct kbase_ioctl_tlstream_test, - kctx); - break; case KBASE_IOCTL_TLSTREAM_STATS: KBASE_HANDLE_IOCTL_OUT(KBASE_IOCTL_TLSTREAM_STATS, 
kbase_api_tlstream_stats, @@ -1999,6 +1992,12 @@ static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct kbase_ioctl_context_priority_check, kctx); break; + case KBASE_IOCTL_SET_LIMITED_CORE_COUNT: + KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_SET_LIMITED_CORE_COUNT, + kbasep_ioctl_set_limited_core_count, + struct kbase_ioctl_set_limited_core_count, + kctx); + break; } dev_warn(kbdev->dev, "Unknown ioctl 0x%x nr:%d", cmd, _IOC_NR(cmd)); @@ -2115,7 +2114,7 @@ static unsigned int kbase_poll(struct file *filp, poll_table *wait) void kbase_event_wakeup(struct kbase_context *kctx) { KBASE_DEBUG_ASSERT(kctx); - dev_dbg(kctx->kbdev->dev, "Waking event queue for context %p\n", + dev_dbg(kctx->kbdev->dev, "Waking event queue for context %pK\n", (void *)kctx); wake_up_interruptible(&kctx->event_queue); } @@ -3086,7 +3085,7 @@ static ssize_t kbase_show_gpuinfo(struct device *dev, { .id = GPU_ID2_PRODUCT_TBEX >> GPU_ID_VERSION_PRODUCT_ID_SHIFT, .name = "Mali-G78" }, { .id = GPU_ID2_PRODUCT_TBAX >> GPU_ID_VERSION_PRODUCT_ID_SHIFT, - .name = "Mali-TBAX" }, + .name = "Mali-G78AE" }, { .id = GPU_ID2_PRODUCT_LBEX >> GPU_ID_VERSION_PRODUCT_ID_SHIFT, .name = "Mali-G68" }, { .id = GPU_ID2_PRODUCT_TNAX >> GPU_ID_VERSION_PRODUCT_ID_SHIFT, @@ -4094,21 +4093,28 @@ static void kbasep_protected_mode_hwcnt_disable_worker(struct work_struct *data) { struct kbase_device *kbdev = container_of(data, struct kbase_device, protected_mode_hwcnt_disable_work); + spinlock_t *backend_lock; unsigned long flags; bool do_disable; - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); +#if MALI_USE_CSF + backend_lock = &kbdev->csf.scheduler.interrupt_lock; +#else + backend_lock = &kbdev->hwaccess_lock; +#endif + + spin_lock_irqsave(backend_lock, flags); do_disable = !kbdev->protected_mode_hwcnt_desired && !kbdev->protected_mode_hwcnt_disabled; - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + spin_unlock_irqrestore(backend_lock, flags); if (!do_disable) return; kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx); - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + spin_lock_irqsave(backend_lock, flags); do_disable = !kbdev->protected_mode_hwcnt_desired && !kbdev->protected_mode_hwcnt_disabled; @@ -4128,9 +4134,10 @@ static void kbasep_protected_mode_hwcnt_disable_worker(struct work_struct *data) kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); } - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + spin_unlock_irqrestore(backend_lock, flags); } +#ifndef PLATFORM_PROTECTED_CALLBACKS static int kbasep_protected_mode_enable(struct protected_mode_device *pdev) { struct kbase_device *kbdev = pdev->data; @@ -4150,7 +4157,6 @@ static const struct protected_mode_ops kbasep_native_protected_ops = { .protected_mode_disable = kbasep_protected_mode_disable }; -#ifndef PLATFORM_PROTECTED_CALLBACKS #define PLATFORM_PROTECTED_CALLBACKS (&kbasep_native_protected_ops) #endif /* PLATFORM_PROTECTED_CALLBACKS */ @@ -4330,6 +4336,7 @@ int kbase_device_pm_init(struct kbase_device *kbdev) u32 gpu_model_id; if (kbase_is_pv_enabled(kbdev->dev->of_node)) { + dev_info(kbdev->dev, "Arbitration interface enabled\n"); if (kbase_is_pm_enabled(kbdev->dev->of_node)) { /* Arbitration AND power management invalid */ dev_err(kbdev->dev, "Invalid combination of arbitration AND power management\n"); @@ -4353,7 +4360,8 @@ int kbase_device_pm_init(struct kbase_device *kbdev) gpu_model_id = GPU_ID2_MODEL_MATCH_VALUE(product_id); if (gpu_model_id != GPU_ID2_PRODUCT_TGOX - && gpu_model_id != GPU_ID2_PRODUCT_TNOX) { + && gpu_model_id != 
GPU_ID2_PRODUCT_TNOX + && gpu_model_id != GPU_ID2_PRODUCT_TBAX) { kbase_arbiter_pm_early_term(kbdev); dev_err(kbdev->dev, "GPU platform not suitable for arbitration\n"); return -EPERM; @@ -4542,7 +4550,7 @@ void power_control_term(struct kbase_device *kbdev) static void trigger_reset(struct kbase_device *kbdev) { kbase_pm_context_active(kbdev); - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); kbase_pm_context_idle(kbdev); } @@ -4570,7 +4578,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_##type##_quirks, type##_quirks_get,\ MAKE_QUIRK_ACCESSORS(sc); MAKE_QUIRK_ACCESSORS(tiler); MAKE_QUIRK_ACCESSORS(mmu); -MAKE_QUIRK_ACCESSORS(jm); +MAKE_QUIRK_ACCESSORS(gpu); static ssize_t kbase_device_debugfs_reset_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) @@ -4691,7 +4699,9 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) kbdev->mali_debugfs_directory = debugfs_create_dir(kbdev->devname, NULL); if (!kbdev->mali_debugfs_directory) { - dev_err(kbdev->dev, "Couldn't create mali debugfs directory\n"); + dev_err(kbdev->dev, + "Couldn't create mali debugfs directory: %s\n", + kbdev->devname); err = -ENOMEM; goto out; } @@ -4746,9 +4756,8 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) debugfs_create_file("quirks_mmu", 0644, kbdev->mali_debugfs_directory, kbdev, &fops_mmu_quirks); - debugfs_create_file("quirks_jm", 0644, - kbdev->mali_debugfs_directory, kbdev, - &fops_jm_quirks); + debugfs_create_file("quirks_gpu", 0644, kbdev->mali_debugfs_directory, + kbdev, &fops_gpu_quirks); debugfs_create_bool("infinite_cache", mode, debugfs_ctx_defaults_directory, @@ -4878,40 +4887,6 @@ int kbase_device_coherency_init(struct kbase_device *kbdev) return 0; } -#ifdef CONFIG_MALI_BUSLOG - -/* Callback used by the kbase bus logger client, to initiate a GPU reset - * when the bus log is restarted. GPU reset is used as reference point - * in HW bus log analyses. 
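Returning to the KBASE_IOCTL_SET_LIMITED_CORE_COUNT handler added earlier in this diff: the mask arithmetic there is terse, so a self-contained sketch may help. The present-core mask below is invented, and it is assumed that max_core_count stays below 64 so the shift is well defined:

#include <stdint.h>
#include <stdio.h>

/* Sketch of the limited-core-count check; all values here are made up. */
static int set_limited_core_count(uint64_t shader_core_mask,
                                  unsigned int max_core_count,
                                  uint64_t *limited_core_mask)
{
        /* Assumes max_core_count < 64; e.g. 4 gives 0b1111 (cores 0..3). */
        uint64_t mask = ((uint64_t)1 << max_core_count) - 1;

        /* At least one physically present core must survive the mask. */
        if ((shader_core_mask & mask) == 0)
                return -1;

        *limited_core_mask = mask;
        return 0;
}

int main(void)
{
        uint64_t present = 0x3F;        /* pretend 6 shader cores are present */
        uint64_t limited;

        if (set_limited_core_count(present, 4, &limited) == 0)
                printf("limited_core_mask = 0x%llx\n",
                       (unsigned long long)limited);

        /* A count of 0 gives mask 0, which the check rejects. */
        if (set_limited_core_count(present, 0, &limited) != 0)
                puts("rejected: no cores would remain usable");
        return 0;
}

A requested count of N keeps cores 0..N-1; intersecting that mask with the present-core bitmap is what guarantees at least one usable shader core remains after the limit is applied.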
- */ -static void kbase_logging_started_cb(void *data) -{ - struct kbase_device *kbdev = (struct kbase_device *)data; - - if (kbase_prepare_to_reset_gpu(kbdev)) - kbase_reset_gpu(kbdev); - dev_info(kbdev->dev, "KBASE - Bus logger restarted\n"); -} - -int buslog_init(struct kbase_device *kbdev) -{ - int err = 0; - - err = bl_core_client_register(kbdev->devname, - kbase_logging_started_cb, - kbdev, &kbdev->buslogger, - THIS_MODULE, NULL); - if (err == 0) - bl_core_set_threshold(kbdev->buslogger, 1024*1024*1024); - - return err; -} - -void buslog_term(struct kbase_device *kbdev) -{ - bl_core_client_unregister(kbdev->buslogger); -} -#endif #if MALI_USE_CSF /** @@ -5222,7 +5197,8 @@ static int kbase_platform_device_probe(struct platform_device *pdev) if (err) { if (err == -EPROBE_DEFER) - dev_err(kbdev->dev, "Device initialization Deferred\n"); + dev_info(kbdev->dev, + "Device initialization Deferred\n"); else dev_err(kbdev->dev, "Device initialization failed\n"); @@ -5448,7 +5424,6 @@ static struct platform_driver kbase_platform_driver = { .remove = kbase_platform_device_remove, .driver = { .name = kbase_drv_name, - .owner = THIS_MODULE, .pm = &kbase_pm_ops, .of_match_table = of_match_ptr(kbase_dt_ids), }, diff --git a/mali_kbase/mali_kbase_ctx_sched.c b/mali_kbase/mali_kbase_ctx_sched.c index f59a2d7..c63bc8d 100644 --- a/mali_kbase/mali_kbase_ctx_sched.c +++ b/mali_kbase/mali_kbase_ctx_sched.c @@ -365,8 +365,7 @@ void kbase_ctx_sched_release_ctx_lock(struct kbase_context *kctx) } #if MALI_USE_CSF -bool kbase_ctx_sched_refcount_mmu_flush(struct kbase_context *kctx, - bool sync) +bool kbase_ctx_sched_inc_refcount_if_as_valid(struct kbase_context *kctx) { struct kbase_device *kbdev; bool added_ref = false; @@ -383,20 +382,16 @@ bool kbase_ctx_sched_refcount_mmu_flush(struct kbase_context *kctx, mutex_lock(&kbdev->mmu_hw_mutex); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - added_ref = kbase_ctx_sched_inc_refcount_nolock(kctx); - - WARN_ON(added_ref && - (kctx->mmu_flush_pend_state != KCTX_MMU_FLUSH_NOT_PEND)); - - if (!added_ref && (kctx->as_nr != KBASEP_AS_NR_INVALID)) { - enum kbase_ctx_mmu_flush_pending_state new_state = - sync ? KCTX_MMU_FLUSH_PEND_SYNC : - KCTX_MMU_FLUSH_PEND_NO_SYNC; + if ((kctx->as_nr != KBASEP_AS_NR_INVALID) && + (kctx == kbdev->as_to_kctx[kctx->as_nr])) { + atomic_inc(&kctx->refcount); - WARN_ON(kctx != kbdev->as_to_kctx[kctx->as_nr]); + if (kbdev->as_free & (1u << kctx->as_nr)) + kbdev->as_free &= ~(1u << kctx->as_nr); - if (kctx->mmu_flush_pend_state != KCTX_MMU_FLUSH_PEND_SYNC) - kctx->mmu_flush_pend_state = new_state; + KBASE_KTRACE_ADD(kbdev, SCHED_RETAIN_CTX_NOLOCK, kctx, + kbase_ktrace_get_ctx_refcnt(kctx)); + added_ref = true; } spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); diff --git a/mali_kbase/mali_kbase_ctx_sched.h b/mali_kbase/mali_kbase_ctx_sched.h index 1aa3762..cadb735 100644 --- a/mali_kbase/mali_kbase_ctx_sched.h +++ b/mali_kbase/mali_kbase_ctx_sched.h @@ -222,23 +222,20 @@ void kbase_ctx_sched_release_ctx_lock(struct kbase_context *kctx); #if MALI_USE_CSF /** - * kbase_ctx_sched_refcount_mmu_flush - Refcount the context for the MMU flush - * operation. + * kbase_ctx_sched_inc_refcount_if_as_valid - Refcount the context if it has GPU + * address space slot assigned to it. * - * @kctx: Context to be refcounted. - * @sync: Flag passed to the caller function kbase_mmu_flush_invalidate(). + * @kctx: Context to be refcounted * - * This function takes a reference on the context for the MMU flush operation. 
- * The refcount is taken only if the context is busy/active. - * If the context isn't active but has a GPU address space slot assigned to it - * then a flag is set to indicate that MMU flush operation is pending, which - * will be performed when the context becomes active. + * This function takes a reference on the context if it has a GPU address space + * slot assigned to it. The address space slot will not be available for + * re-assignment until the reference is released. * * Return: true if refcount succeeded and the address space slot will not be - * reassigned, false if the refcount failed (because the context was inactive) + * reassigned, false if the refcount failed (because the address space slot + * was not assigned). */ -bool kbase_ctx_sched_refcount_mmu_flush(struct kbase_context *kctx, - bool sync); +bool kbase_ctx_sched_inc_refcount_if_as_valid(struct kbase_context *kctx); #endif #endif /* _KBASE_CTX_SCHED_H_ */ diff --git a/mali_kbase/mali_kbase_debug_job_fault.c b/mali_kbase/mali_kbase_debug_job_fault.c index 6902ded..7dfdff1 100644 --- a/mali_kbase/mali_kbase_debug_job_fault.c +++ b/mali_kbase/mali_kbase_debug_job_fault.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2012-2016, 2018-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2012-2016, 2018-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -549,6 +549,14 @@ void kbase_debug_job_fault_kctx_unblock(struct kbase_context *kctx) { WARN_ON(!kbase_ctx_flag(kctx, KCTX_DYING)); + /* Return early if the job fault part of the kbase_device is not + * initialized yet. An error can happen during the device probe after + * the privileged Kbase context was created for the HW counter dumping + * but before the job fault part is initialized. + */ + if (!kctx->kbdev->job_fault_resume_workq) + return; + kbase_ctx_remove_pending_event(kctx); } diff --git a/mali_kbase/mali_kbase_defs.h b/mali_kbase/mali_kbase_defs.h index d813f2f..5b7591c 100644 --- a/mali_kbase/mali_kbase_defs.h +++ b/mali_kbase/mali_kbase_defs.h @@ -48,9 +48,6 @@ #include <linux/file.h> #include <linux/sizes.h> -#ifdef CONFIG_MALI_BUSLOG -#include <linux/bus_logger.h> -#endif #if defined(CONFIG_SYNC) #include <sync.h> @@ -554,7 +551,6 @@ struct kbase_mmu_mode { unsigned long flags; }; -struct kbase_mmu_mode const *kbase_mmu_mode_get_lpae(void); struct kbase_mmu_mode const *kbase_mmu_mode_get_aarch64(void); #define DEVNAME_SIZE 16 @@ -624,8 +620,8 @@ struct kbase_process { * issues present in the GPU. * @hw_quirks_mmu: Configuration to be used for the MMU as per the HW * issues present in the GPU. - * @hw_quirks_jm: Configuration to be used for the Job Manager as per - * the HW issues present in the GPU. + * @hw_quirks_gpu: Configuration to be used for the Job Manager or CSF/MCU + * subsystems as per the HW issues present in the GPU. * @entry: Links the device instance to the global list of GPU * devices. The list would have as many entries as there * are GPU device instances. @@ -710,6 +706,8 @@ struct kbase_process { * @nr_hw_address_spaces: Number of address spaces actually available in the * GPU, remains constant after driver initialisation. * @nr_user_address_spaces: Number of address spaces available to user contexts + * @hwcnt_backend_csf_if_fw: Firmware interface to access CSF GPU performance + * counters. 
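For the kbase_ctx_sched_inc_refcount_if_as_valid() rework described just above, the behaviour is easier to see in isolation: a reference is taken only while the context still owns its address-space slot, and holding that reference keeps the slot from being re-assigned until the reference is released. The toy types below, and the locking that is deliberately omitted, are stand-ins rather than the driver's real ones:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define AS_INVALID (-1)

/* Toy model: take a reference only while the AS slot is still ours. */
struct toy_ctx {
        int as_nr;              /* assigned address-space slot, or AS_INVALID */
        atomic_int refcount;
};

static struct toy_ctx *as_to_ctx[8];    /* slot -> owning context */

static bool inc_refcount_if_as_valid(struct toy_ctx *ctx)
{
        /* The real code performs this check under hwaccess_lock. */
        if (ctx->as_nr == AS_INVALID || as_to_ctx[ctx->as_nr] != ctx)
                return false;   /* slot lost or never assigned: no ref taken */

        atomic_fetch_add(&ctx->refcount, 1);
        return true;            /* slot stays ours until the ref is released */
}

int main(void)
{
        struct toy_ctx ctx = { .as_nr = 3, .refcount = 0 };

        as_to_ctx[3] = &ctx;
        printf("got ref: %d\n", inc_refcount_if_as_valid(&ctx));  /* 1 */

        as_to_ctx[3] = NULL;    /* slot handed to another context */
        printf("got ref: %d\n", inc_refcount_if_as_valid(&ctx));  /* 0 */
        return 0;
}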
* @hwcnt: Structure used for instrumentation and HW counters * dumping * @hwcnt.lock: The lock should be used when accessing any of the @@ -754,6 +752,8 @@ struct kbase_process { * including any contexts that might be created for * hardware counters. * @kctx_list_lock: Lock protecting concurrent accesses to @kctx_list. + * @group_max_uid_in_devices: Max value of any queue group UID in any kernel + * context in the kbase device. * @devfreq_profile: Describes devfreq profile for the Mali GPU device, passed * to devfreq_add_device() to add devfreq feature to Mali * GPU device. @@ -918,7 +918,7 @@ struct kbase_device { u32 hw_quirks_sc; u32 hw_quirks_tiler; u32 hw_quirks_mmu; - u32 hw_quirks_jm; + u32 hw_quirks_gpu; struct list_head entry; struct device *dev; @@ -1016,6 +1016,7 @@ struct kbase_device { struct list_head kctx_list; struct mutex kctx_list_lock; + atomic_t group_max_uid_in_devices; #ifdef CONFIG_MALI_DEVFREQ struct devfreq_dev_profile devfreq_profile; @@ -1120,9 +1121,6 @@ struct kbase_device { struct work_struct protected_mode_hwcnt_disable_work; -#ifdef CONFIG_MALI_BUSLOG - struct bus_logger_client *buslogger; -#endif bool irq_reset_flush; @@ -1225,7 +1223,7 @@ struct kbase_file { unsigned long api_version; atomic_t setup_state; }; - +#if MALI_JIT_PRESSURE_LIMIT_BASE /** * enum kbase_context_flags - Flags for kbase contexts * @@ -1285,6 +1283,9 @@ struct kbase_file { * refcount for the context drops to 0 or on when the address spaces are * re-enabled on GPU reset or power cycle. * + * @KCTX_JPL_ENABLED: Set when JIT physical page limit is less than JIT virtual + * address page limit, so we must take care to not exceed the physical limit + * * All members need to be separate bits. This enum is intended for use in a * bitmask where multiple values get OR-ed together. */ @@ -1305,38 +1306,90 @@ enum kbase_context_flags { KCTX_PULLED_SINCE_ACTIVE_JS1 = 1U << 13, KCTX_PULLED_SINCE_ACTIVE_JS2 = 1U << 14, KCTX_AS_DISABLED_ON_FAULT = 1U << 15, -#if MALI_JIT_PRESSURE_LIMIT_BASE - /* - * Set when JIT physical page limit is less than JIT virtual address - * page limit, so we must take care to not exceed the physical limit - */ KCTX_JPL_ENABLED = 1U << 16, -#endif /* !MALI_JIT_PRESSURE_LIMIT_BASE */ }; - -#if MALI_USE_CSF +#else /** - * enum kbase_ctx_mmu_flush_pending_state - State for the pending mmu flush - * operation for a kbase context. + * enum kbase_context_flags - Flags for kbase contexts + * + * @KCTX_COMPAT: Set when the context process is a compat process, 32-bit + * process on a 64-bit kernel. + * + * @KCTX_RUNNABLE_REF: Set when context is counted in + * kbdev->js_data.nr_contexts_runnable. Must hold queue_mutex when accessing. + * + * @KCTX_ACTIVE: Set when the context is active. + * + * @KCTX_PULLED: Set when last kick() caused atoms to be pulled from this + * context. + * + * @KCTX_MEM_PROFILE_INITIALIZED: Set when the context's memory profile has been + * initialized. + * + * @KCTX_INFINITE_CACHE: Set when infinite cache is to be enabled for new + * allocations. Existing allocations will not change. + * + * @KCTX_SUBMIT_DISABLED: Set to prevent context from submitting any jobs. + * + * @KCTX_PRIVILEGED:Set if the context uses an address space and should be kept + * scheduled in. + * + * @KCTX_SCHEDULED: Set when the context is scheduled on the Run Pool. + * This is only ever updated whilst the jsctx_mutex is held. + * + * @KCTX_DYING: Set when the context process is in the process of being evicted. 
+ * + * @KCTX_NO_IMPLICIT_SYNC: Set when explicit Android fences are in use on this + * context, to disable use of implicit dma-buf fences. This is used to avoid + * potential synchronization deadlocks. + * + * @KCTX_FORCE_SAME_VA: Set when BASE_MEM_SAME_VA should be forced on memory + * allocations. For 64-bit clients it is enabled by default, and disabled by + * default on 32-bit clients. Being able to clear this flag is only used for + * testing purposes of the custom zone allocation on 64-bit user-space builds, + * where we also require more control than is available through e.g. the JIT + * allocation mechanism. However, the 64-bit user-space client must still + * reserve a JIT region using KBASE_IOCTL_MEM_JIT_INIT + * + * @KCTX_PULLED_SINCE_ACTIVE_JS0: Set when the context has had an atom pulled + * from it for job slot 0. This is reset when the context first goes active or + * is re-activated on that slot. + * + * @KCTX_PULLED_SINCE_ACTIVE_JS1: Set when the context has had an atom pulled + * from it for job slot 1. This is reset when the context first goes active or + * is re-activated on that slot. * - * @KCTX_MMU_FLUSH_NOT_PEND: Set when there is no MMU flush operation pending - * for a kbase context or deferred flush operation - * is performed. + * @KCTX_PULLED_SINCE_ACTIVE_JS2: Set when the context has had an atom pulled + * from it for job slot 2. This is reset when the context first goes active or + * is re-activated on that slot. * - * @KCTX_MMU_FLUSH_PEND_NO_SYNC: Set when the MMU flush operation is deferred - * for a kbase context when it is inactive and - * the sync flag passed is 0. + * @KCTX_AS_DISABLED_ON_FAULT: Set when the GPU address space is disabled for + * the context due to unhandled page(or bus) fault. It is cleared when the + * refcount for the context drops to 0 or on when the address spaces are + * re-enabled on GPU reset or power cycle. * - * @KCTX_MMU_FLUSH_PEND_SYNC: Set when the MMU flush operation is deferred - * for a kbase context when it is inactive and - * the sync flag passed is 1. + * All members need to be separate bits. This enum is intended for use in a + * bitmask where multiple values get OR-ed together. */ -enum kbase_ctx_mmu_flush_pending_state { - KCTX_MMU_FLUSH_NOT_PEND, - KCTX_MMU_FLUSH_PEND_NO_SYNC, - KCTX_MMU_FLUSH_PEND_SYNC, +enum kbase_context_flags { + KCTX_COMPAT = 1U << 0, + KCTX_RUNNABLE_REF = 1U << 1, + KCTX_ACTIVE = 1U << 2, + KCTX_PULLED = 1U << 3, + KCTX_MEM_PROFILE_INITIALIZED = 1U << 4, + KCTX_INFINITE_CACHE = 1U << 5, + KCTX_SUBMIT_DISABLED = 1U << 6, + KCTX_PRIVILEGED = 1U << 7, + KCTX_SCHEDULED = 1U << 8, + KCTX_DYING = 1U << 9, + KCTX_NO_IMPLICIT_SYNC = 1U << 10, + KCTX_FORCE_SAME_VA = 1U << 11, + KCTX_PULLED_SINCE_ACTIVE_JS0 = 1U << 12, + KCTX_PULLED_SINCE_ACTIVE_JS1 = 1U << 13, + KCTX_PULLED_SINCE_ACTIVE_JS2 = 1U << 14, + KCTX_AS_DISABLED_ON_FAULT = 1U << 15, }; -#endif +#endif /* MALI_JIT_PRESSURE_LIMIT_BASE */ struct kbase_sub_alloc { struct list_head link; @@ -1616,12 +1669,8 @@ struct kbase_reg_zone { * @kinstr_jm: Kernel job manager instrumentation context handle * @tl_kctx_list_node: List item into the device timeline's list of * contexts, for timeline summarization. - * @mmu_flush_pend_state: Tracks if the MMU flush operations are pending for the - * context. The flush required due to unmap is also - * tracked. It is supposed to be in - * KCTX_MMU_FLUSH_NOT_PEND state whilst a context is - * active and shall be updated with mmu_hw_mutex lock - * held. 
+ * @limited_core_mask: The mask that is applied to the affinity in case of atoms + * marked with BASE_JD_REQ_LIMITED_CORE_MASK. * * A kernel base context is an entity among which the GPU is scheduled. * Each context has its own GPU address space. @@ -1769,9 +1818,7 @@ struct kbase_context { #endif struct list_head tl_kctx_list_node; -#if MALI_USE_CSF - enum kbase_ctx_mmu_flush_pending_state mmu_flush_pend_state; -#endif + u64 limited_core_mask; }; #ifdef CONFIG_MALI_CINSTR_GWT diff --git a/mali_kbase/mali_kbase_event.c b/mali_kbase/mali_kbase_event.c index 04687ee..25a379d 100644 --- a/mali_kbase/mali_kbase_event.c +++ b/mali_kbase/mali_kbase_event.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2010-2016,2018-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2016,2018-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -42,7 +42,7 @@ static struct base_jd_udata kbase_event_process(struct kbase_context *kctx, stru KBASE_TLSTREAM_TL_DEL_ATOM(kbdev, katom); katom->status = KBASE_JD_ATOM_STATE_UNUSED; - dev_dbg(kbdev->dev, "Atom %p status to unused\n", (void *)katom); + dev_dbg(kbdev->dev, "Atom %pK status to unused\n", (void *)katom); wake_up(&katom->completed); return data; @@ -79,7 +79,7 @@ int kbase_event_dequeue(struct kbase_context *ctx, struct base_jd_event_v2 *ueve mutex_unlock(&ctx->event_mutex); - dev_dbg(ctx->kbdev->dev, "event dequeuing %p\n", (void *)atom); + dev_dbg(ctx->kbdev->dev, "event dequeuing %pK\n", (void *)atom); uevent->event_code = atom->event_code; uevent->atom_number = (atom - ctx->jctx.atoms); @@ -164,11 +164,11 @@ void kbase_event_post(struct kbase_context *ctx, struct kbase_jd_atom *atom) { struct kbase_device *kbdev = ctx->kbdev; - dev_dbg(kbdev->dev, "Posting event for atom %p\n", (void *)atom); + dev_dbg(kbdev->dev, "Posting event for atom %pK\n", (void *)atom); if (WARN_ON(atom->status != KBASE_JD_ATOM_STATE_COMPLETED)) { dev_warn(kbdev->dev, - "%s: Atom %d (%p) not completed (status %d)\n", + "%s: Atom %d (%pK) not completed (status %d)\n", __func__, kbase_jd_atom_id(atom->kctx, atom), atom->kctx, diff --git a/mali_kbase/mali_kbase_gpu_memory_debugfs.c b/mali_kbase/mali_kbase_gpu_memory_debugfs.c index 45ce740..a10b2bb 100644 --- a/mali_kbase/mali_kbase_gpu_memory_debugfs.c +++ b/mali_kbase/mali_kbase_gpu_memory_debugfs.c @@ -56,7 +56,7 @@ static int kbasep_gpu_memory_seq_show(struct seq_file *sfile, void *data) /* output the memory usage and cap for each kctx * opened on this device */ - seq_printf(sfile, " %s-0x%p %10u\n", + seq_printf(sfile, " %s-0x%pK %10u\n", "kctx", kctx, atomic_read(&(kctx->used_pages))); diff --git a/mali_kbase/mali_kbase_gpuprops.c b/mali_kbase/mali_kbase_gpuprops.c index 9da0b00..49f96f6 100644 --- a/mali_kbase/mali_kbase_gpuprops.c +++ b/mali_kbase/mali_kbase_gpuprops.c @@ -28,7 +28,7 @@ #include <mali_kbase_gpuprops.h> #include <mali_kbase_hwaccess_gpuprops.h> #include <mali_kbase_config_defaults.h> -#include "mali_kbase_ioctl.h" +#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> #include <linux/clk.h> #include <mali_kbase_pm_internal.h> #include <linux/of_platform.h> @@ -104,6 +104,71 @@ static void kbase_gpuprops_construct_coherent_groups( } /** + * kbase_gpuprops_get_curr_config_props - Get the current allocated resources + * @kbdev: The &struct kbase_device structure for the device + * @curr_config: The &struct curr_config_props structure to 
receive the result + * + * Fill the &struct curr_config_props structure with values from the GPU + * configuration registers. + * + * Return: Zero on success, Linux error code on failure + */ +int kbase_gpuprops_get_curr_config_props(struct kbase_device *kbdev, + struct curr_config_props * const curr_config) +{ + struct kbase_current_config_regdump curr_config_regdump; + int err; + + if (WARN_ON(!kbdev) || WARN_ON(!curr_config)) + return -EINVAL; + + /* If update not needed just return. */ + if (!curr_config->update_needed) + return 0; + + /* Dump relevant registers */ + err = kbase_backend_gpuprops_get_curr_config(kbdev, + &curr_config_regdump); + if (err) + return err; + + curr_config->l2_slices = + KBASE_UBFX32(curr_config_regdump.mem_features, 8U, 4) + 1; + + curr_config->l2_present = + ((u64) curr_config_regdump.l2_present_hi << 32) + + curr_config_regdump.l2_present_lo; + + curr_config->shader_present = + ((u64) curr_config_regdump.shader_present_hi << 32) + + curr_config_regdump.shader_present_lo; + + curr_config->num_cores = hweight64(curr_config->shader_present); + + curr_config->update_needed = false; + + return 0; +} + +/** + * kbase_gpuprops_req_curr_config_update - Request Current Config Update + * @kbdev: The &struct kbase_device structure for the device + * + * Requests the current configuration to be updated next time the + * kbase_gpuprops_get_curr_config_props() is called. + * + * Return: Zero on success, Linux error code on failure + */ +int kbase_gpuprops_req_curr_config_update(struct kbase_device *kbdev) +{ + if (WARN_ON(!kbdev)) + return -EINVAL; + + kbdev->gpu_props.curr_config.update_needed = true; + return 0; +} + +/** * kbase_gpuprops_get_props - Get the GPU configuration * @gpu_props: The &struct base_gpu_props structure * @kbdev: The &struct kbase_device structure for the device @@ -183,6 +248,59 @@ void kbase_gpuprops_update_core_props_gpu_id( } /** + * kbase_gpuprops_update_max_config_props - Updates the max config properties in + * the base_gpu_props. + * @base_props: The &struct base_gpu_props structure + * @kbdev: The &struct kbase_device structure for the device + * + * Updates the &struct base_gpu_props structure with the max config properties. + */ +static void kbase_gpuprops_update_max_config_props( + struct base_gpu_props * const base_props, struct kbase_device *kbdev) +{ + int l2_n = 0; + + if (WARN_ON(!kbdev) || WARN_ON(!base_props)) + return; + + /* return if the max_config is not set during arbif initialization */ + if (kbdev->gpu_props.max_config.core_mask == 0) + return; + + /* + * Set the base_props with the maximum config values to ensure that the + * user space will always be based on the maximum resources available. + */ + base_props->l2_props.num_l2_slices = + kbdev->gpu_props.max_config.l2_slices; + base_props->raw_props.shader_present = + kbdev->gpu_props.max_config.core_mask; + /* + * Update l2_present in the raw data to be consistent with the + * max_config.l2_slices number. + */ + base_props->raw_props.l2_present = 0; + for (l2_n = 0; l2_n < base_props->l2_props.num_l2_slices; l2_n++) { + base_props->raw_props.l2_present <<= 1; + base_props->raw_props.l2_present |= 0x1; + } + /* + * Update the coherency_info data using just one core group. For + * architectures where the max_config is provided by the arbiter it is + * not necessary to split the shader core groups in different coherent + * groups. 
+ */ + base_props->coherency_info.coherency = + base_props->raw_props.mem_features; + base_props->coherency_info.num_core_groups = 1; + base_props->coherency_info.num_groups = 1; + base_props->coherency_info.group[0].core_mask = + kbdev->gpu_props.max_config.core_mask; + base_props->coherency_info.group[0].num_cores = + hweight32(kbdev->gpu_props.max_config.core_mask); +} + +/** * kbase_gpuprops_calculate_props - Calculate the derived properties * @gpu_props: The &struct base_gpu_props structure * @kbdev: The &struct kbase_device structure for the device @@ -297,8 +415,30 @@ static void kbase_gpuprops_calculate_props( gpu_props->thread_props.max_task_queue = THREAD_MTQ_DEFAULT; gpu_props->thread_props.max_thread_group_split = THREAD_MTGS_DEFAULT; } - /* Initialize the coherent_group structure for each group */ - kbase_gpuprops_construct_coherent_groups(gpu_props); + + /* + * If the maximum resources allocated information is available it is + * necessary to update the base_gpu_props with the max_config info to + * the userspace. This is applicable to systems that receive this + * information from the arbiter. + */ + if (kbdev->gpu_props.max_config.core_mask) + /* Update the max config properties in the base_gpu_props */ + kbase_gpuprops_update_max_config_props(gpu_props, + kbdev); + else + /* Initialize the coherent_group structure for each group */ + kbase_gpuprops_construct_coherent_groups(gpu_props); +} + +void kbase_gpuprops_set_max_config(struct kbase_device *kbdev, + const struct max_config_props *max_config) +{ + if (WARN_ON(!kbdev) || WARN_ON(!max_config)) + return; + + kbdev->gpu_props.max_config.l2_slices = max_config->l2_slices; + kbdev->gpu_props.max_config.core_mask = max_config->core_mask; } void kbase_gpuprops_set(struct kbase_device *kbdev) @@ -306,7 +446,8 @@ void kbase_gpuprops_set(struct kbase_device *kbdev) struct kbase_gpu_props *gpu_props; struct gpu_raw_gpu_props *raw; - KBASE_DEBUG_ASSERT(kbdev != NULL); + if (WARN_ON(!kbdev)) + return; gpu_props = &kbdev->gpu_props; raw = &gpu_props->props.raw_props; @@ -326,9 +467,19 @@ void kbase_gpuprops_set(struct kbase_device *kbdev) gpu_props->mmu.pa_bits = KBASE_UBFX32(raw->mmu_features, 8U, 8); gpu_props->num_cores = hweight64(raw->shader_present); - gpu_props->num_core_groups = hweight64(raw->l2_present); + gpu_props->num_core_groups = + gpu_props->props.coherency_info.num_core_groups; gpu_props->num_address_spaces = hweight32(raw->as_present); gpu_props->num_job_slots = hweight32(raw->js_present); + + /* + * Current configuration is used on HW interactions so that the maximum + * config is just used for user space avoiding interactions with parts + * of the hardware that might not be allocated to the kbase instance at + * that moment. 
+ */
+	kbase_gpuprops_req_curr_config_update(kbdev);
+	kbase_gpuprops_get_curr_config_props(kbdev, &gpu_props->curr_config);
 }
 
 int kbase_gpuprops_set_features(struct kbase_device *kbdev)
@@ -494,7 +645,10 @@ int kbase_gpuprops_update_l2_features(struct kbase_device *kbdev)
 			goto exit;
 
 		dev_info(kbdev->dev, "Reflected L2_FEATURES is 0x%x\n",
-				regdump.l2_features);
+			 regdump.l2_features);
+		dev_info(kbdev->dev, "Reflected L2_CONFIG is 0x%08x\n",
+			 regdump.l2_config);
+
 		/* Update gpuprops with reflected L2_FEATURES */
 		gpu_props->raw_props.l2_features = regdump.l2_features;
 
diff --git a/mali_kbase/mali_kbase_gpuprops.h b/mali_kbase/mali_kbase_gpuprops.h
index 7c7b123..72f76c3 100644
--- a/mali_kbase/mali_kbase_gpuprops.h
+++ b/mali_kbase/mali_kbase_gpuprops.h
@@ -115,4 +115,38 @@ int kbase_device_populate_max_freq(struct kbase_device *kbdev);
 void kbase_gpuprops_update_core_props_gpu_id(
 	struct base_gpu_props * const gpu_props);
 
+/**
+ * kbase_gpuprops_set_max_config - Set the max config information
+ * @kbdev: Device pointer
+ * @max_config: Maximum configuration data to be updated
+ *
+ * This function sets max_config in the kbase_gpu_props.
+ */
+void kbase_gpuprops_set_max_config(struct kbase_device *kbdev,
+		const struct max_config_props *max_config);
+
+/**
+ * kbase_gpuprops_get_curr_config_props - Get the current allocated resources
+ * @kbdev: The &struct kbase_device structure for the device
+ * @curr_config: The &struct curr_config_props structure to receive the result
+ *
+ * Fill the &struct curr_config_props structure with values from the GPU
+ * configuration registers.
+ *
+ * Return: Zero on success, Linux error code on failure
+ */
+int kbase_gpuprops_get_curr_config_props(struct kbase_device *kbdev,
+		struct curr_config_props * const curr_config);
+
+/**
+ * kbase_gpuprops_req_curr_config_update - Request Current Config Update
+ * @kbdev: The &struct kbase_device structure for the device
+ *
+ * Requests the current configuration to be updated next time the
+ * kbase_gpuprops_get_curr_config_props() is called.
+ *
+ * Return: Zero on success, Linux error code on failure
+ */
+int kbase_gpuprops_req_curr_config_update(struct kbase_device *kbdev);
+
 #endif /* _KBASE_GPUPROPS_H_ */
diff --git a/mali_kbase/mali_kbase_gpuprops_types.h b/mali_kbase/mali_kbase_gpuprops_types.h
index 8ecb54f..8b37b88 100644
--- a/mali_kbase/mali_kbase_gpuprops_types.h
+++ b/mali_kbase/mali_kbase_gpuprops_types.h
@@ -26,7 +26,7 @@
 #ifndef _KBASE_GPUPROPS_TYPES_H_
 #define _KBASE_GPUPROPS_TYPES_H_
 
-#include "mali_base_kernel.h"
+#include <uapi/gpu/arm/midgard/mali_base_kernel.h>
 
 #define KBASE_GPU_SPEED_MHZ 123
 #define KBASE_GPU_PC_SIZE_LOG2 24U
@@ -34,6 +34,7 @@ struct kbase_gpuprops_regdump {
 	u32 gpu_id;
 	u32 l2_features;
+	u32 l2_config;
 	u32 core_features;
 	u32 tiler_features;
 	u32 mem_features;
@@ -60,6 +61,28 @@ struct kbase_gpuprops_regdump {
 	u32 gpu_features_hi;
 };
 
+/**
+ * struct kbase_current_config_regdump - Register dump for current resources
+ * allocated to the GPU.
+ * @mem_features: Memory system features. Contains information about the
+ * features of the memory system. Used here to get the L2 slice
+ * count.
+ * @shader_present_lo: Shader core present bitmap. Low word.
+ * @shader_present_hi: Shader core present bitmap. High word.
+ * @l2_present_lo: L2 cache present bitmap. Low word.
+ * @l2_present_hi: L2 cache present bitmap. High word.
+ *
+ * Register dump structure used to store the register data related to the
+ * current resources allocated to the GPU.
+ */
+struct kbase_current_config_regdump {
+	u32 mem_features;
+	u32 shader_present_lo;
+	u32 shader_present_hi;
+	u32 l2_present_lo;
+	u32 l2_present_hi;
+};
+
 struct kbase_gpu_cache_props {
 	u8 associativity;
 	u8 external_bus_width;
@@ -74,6 +97,50 @@ struct kbase_gpu_mmu_props {
 	u8 pa_bits;
 };
 
+/**
+ * struct max_config_props - Properties based on the maximum resources
+ * available.
+ * @l2_slices: Maximum number of L2 slices that can be assigned to the GPU
+ * during runtime.
+ * @padding: Padding to a multiple of 64 bits.
+ * @core_mask: Largest core mask bitmap that can be assigned to the GPU during
+ * runtime.
+ *
+ * Properties based on the maximum resources available (not necessarily
+ * allocated at that moment). Used to provide the maximum configuration to the
+ * userspace allowing the applications to allocate enough resources in case the
+ * real allocated resources change.
+ */
+struct max_config_props {
+	u8 l2_slices;
+	u8 padding[3];
+	u32 core_mask;
+};
+
+/**
+ * struct curr_config_props - Properties based on the current resources
+ * allocated to the GPU.
+ * @l2_present: Current L2 present bitmap that is allocated to the GPU.
+ * @shader_present: Current shader present bitmap that is allocated to the GPU.
+ * @num_cores: Current number of shader cores allocated to the GPU.
+ * @l2_slices: Current number of L2 slices allocated to the GPU.
+ * @update_needed: Defines if it is necessary to re-read the registers to
+ * update the current allocated resources.
+ * @padding: Padding to a multiple of 64 bits.
+ *
+ * Properties based on the current resource available. Used for operations with
+ * hardware interactions to avoid using userspace data that can be based on
+ * the maximum resource available.
+ */
+struct curr_config_props {
+	u64 l2_present;
+	u64 shader_present;
+	u16 num_cores;
+	u8 l2_slices;
+	bool update_needed;
+	u8 padding[4];
+};
+
 struct kbase_gpu_props {
 	/* kernel-only properties */
 	u8 num_cores;
@@ -86,6 +153,12 @@ struct kbase_gpu_props {
 	struct kbase_gpu_mem_props mem;
 	struct kbase_gpu_mmu_props mmu;
 
+	/* Properties based on the current resource available */
+	struct curr_config_props curr_config;
+
+	/* Properties based on the maximum resource available */
+	struct max_config_props max_config;
+
 	/* Properties shared with userspace */
 	struct base_gpu_props props;
 
diff --git a/mali_kbase/mali_kbase_gwt.h b/mali_kbase/mali_kbase_gwt.h
index f349d8f..32b0f5f 100644
--- a/mali_kbase/mali_kbase_gwt.h
+++ b/mali_kbase/mali_kbase_gwt.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
 *
- * (C) COPYRIGHT 2010-2017, 2020 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2017, 2020-2021 ARM Limited. All rights reserved.
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -23,7 +23,7 @@ #define _KBASE_GWT_H #include <mali_kbase.h> -#include <mali_kbase_ioctl.h> +#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> /** * kbase_gpu_gwt_start - Start the GPU write tracking diff --git a/mali_kbase/mali_kbase_hw.c b/mali_kbase/mali_kbase_hw.c index d2063bb..b1758d7 100644 --- a/mali_kbase/mali_kbase_hw.c +++ b/mali_kbase/mali_kbase_hw.c @@ -126,91 +126,91 @@ static const enum base_hw_issue *kbase_hw_get_issues_for_new_id( }; static const struct base_hw_product base_hw_products[] = { - {GPU_ID2_PRODUCT_TMIX, - {{GPU_ID2_VERSION_MAKE(0, 0, 1), - base_hw_issues_tMIx_r0p0_05dev0}, - {GPU_ID2_VERSION_MAKE(0, 0, 2), base_hw_issues_tMIx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tMIx_r0p1}, - {U32_MAX /* sentinel value */, NULL} } }, - - {GPU_ID2_PRODUCT_THEX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tHEx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 0, 1), base_hw_issues_tHEx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tHEx_r0p1}, - {GPU_ID2_VERSION_MAKE(0, 1, 1), base_hw_issues_tHEx_r0p1}, - {GPU_ID2_VERSION_MAKE(0, 2, 0), base_hw_issues_tHEx_r0p2}, - {GPU_ID2_VERSION_MAKE(0, 3, 0), base_hw_issues_tHEx_r0p3}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_TSIX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tSIx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 0, 1), base_hw_issues_tSIx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tSIx_r0p1}, - {GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_tSIx_r1p0}, - {GPU_ID2_VERSION_MAKE(1, 1, 0), base_hw_issues_tSIx_r1p1}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_TDVX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tDVx_r0p0}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_TNOX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tNOx_r0p0}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_TGOX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tGOx_r0p0}, - {GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_tGOx_r1p0}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_TTRX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tTRx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 0, 3), base_hw_issues_tTRx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tTRx_r0p1}, - {GPU_ID2_VERSION_MAKE(0, 1, 1), base_hw_issues_tTRx_r0p1}, - {GPU_ID2_VERSION_MAKE(0, 2, 0), base_hw_issues_tTRx_r0p2}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_TNAX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tNAx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 0, 3), base_hw_issues_tNAx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 0, 4), base_hw_issues_tNAx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 0, 5), base_hw_issues_tNAx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tNAx_r0p1}, - {GPU_ID2_VERSION_MAKE(0, 1, 1), base_hw_issues_tNAx_r0p1}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_LBEX, - {{GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_lBEx_r1p0}, - {GPU_ID2_VERSION_MAKE(1, 1, 0), base_hw_issues_lBEx_r1p1}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_TBEX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tBEx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 0, 3), base_hw_issues_tBEx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tBEx_r0p1}, - {GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_tBEx_r1p0}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_TBAX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tBAx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 0, 3), base_hw_issues_tBAx_r0p0}, - {GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_tBAx_r1p0}, 
- {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_TDUX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tDUx_r0p0}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_TODX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tODx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 0, 4), base_hw_issues_tODx_r0p0}, - {GPU_ID2_VERSION_MAKE(0, 0, 5), base_hw_issues_tODx_r0p0}, - {U32_MAX, NULL} } }, - - {GPU_ID2_PRODUCT_LODX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tODx_r0p0}, - {U32_MAX, NULL} } }, + { GPU_ID2_PRODUCT_TMIX, + { { GPU_ID2_VERSION_MAKE(0, 0, 1), + base_hw_issues_tMIx_r0p0_05dev0 }, + { GPU_ID2_VERSION_MAKE(0, 0, 2), base_hw_issues_tMIx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tMIx_r0p1 }, + { U32_MAX /* sentinel value */, NULL } } }, + + { GPU_ID2_PRODUCT_THEX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tHEx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 0, 1), base_hw_issues_tHEx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tHEx_r0p1 }, + { GPU_ID2_VERSION_MAKE(0, 1, 1), base_hw_issues_tHEx_r0p1 }, + { GPU_ID2_VERSION_MAKE(0, 2, 0), base_hw_issues_tHEx_r0p2 }, + { GPU_ID2_VERSION_MAKE(0, 3, 0), base_hw_issues_tHEx_r0p3 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TSIX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tSIx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 0, 1), base_hw_issues_tSIx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tSIx_r0p1 }, + { GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_tSIx_r1p0 }, + { GPU_ID2_VERSION_MAKE(1, 1, 0), base_hw_issues_tSIx_r1p1 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TDVX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tDVx_r0p0 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TNOX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tNOx_r0p0 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TGOX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tGOx_r0p0 }, + { GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_tGOx_r1p0 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TTRX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tTRx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 0, 3), base_hw_issues_tTRx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tTRx_r0p1 }, + { GPU_ID2_VERSION_MAKE(0, 1, 1), base_hw_issues_tTRx_r0p1 }, + { GPU_ID2_VERSION_MAKE(0, 2, 0), base_hw_issues_tTRx_r0p2 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TNAX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tNAx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 0, 3), base_hw_issues_tNAx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 0, 4), base_hw_issues_tNAx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 0, 5), base_hw_issues_tNAx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tNAx_r0p1 }, + { GPU_ID2_VERSION_MAKE(0, 1, 1), base_hw_issues_tNAx_r0p1 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_LBEX, + { { GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_lBEx_r1p0 }, + { GPU_ID2_VERSION_MAKE(1, 1, 0), base_hw_issues_lBEx_r1p1 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TBEX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tBEx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 0, 3), base_hw_issues_tBEx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 1, 0), base_hw_issues_tBEx_r0p1 }, + { GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_tBEx_r1p0 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TBAX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tBAx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 0, 1), base_hw_issues_tBAx_r0p0 }, + { GPU_ID2_VERSION_MAKE(0, 0, 2), base_hw_issues_tBAx_r0p0 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TDUX, + { { 
GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tDUx_r0p0 },
+	    { U32_MAX, NULL } } },
+
+	{ GPU_ID2_PRODUCT_TODX,
+	  { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tODx_r0p0 },
+	    { GPU_ID2_VERSION_MAKE(0, 0, 4), base_hw_issues_tODx_r0p0 },
+	    { GPU_ID2_VERSION_MAKE(0, 0, 5), base_hw_issues_tODx_r0p0 },
+	    { U32_MAX, NULL } } },
+
+	{ GPU_ID2_PRODUCT_LODX,
+	  { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tODx_r0p0 },
+	    { U32_MAX, NULL } } },
 	};
 
 	u32 gpu_id = kbdev->gpu_props.props.raw_props.gpu_id;
diff --git a/mali_kbase/mali_kbase_hwaccess_gpuprops.h b/mali_kbase/mali_kbase_hwaccess_gpuprops.h
index 5e5f9dc..0fca83e 100644
--- a/mali_kbase/mali_kbase_hwaccess_gpuprops.h
+++ b/mali_kbase/mali_kbase_hwaccess_gpuprops.h
@@ -40,6 +40,23 @@ int kbase_backend_gpuprops_get(struct kbase_device *kbdev,
 		struct kbase_gpuprops_regdump *regdump);
 
 /**
+ * kbase_backend_gpuprops_get_curr_config() - Fill @curr_config_regdump with
+ * relevant GPU properties read from
+ * the GPU registers.
+ * @kbdev: Device pointer.
+ * @curr_config_regdump: Pointer to struct kbase_current_config_regdump
+ * structure.
+ *
+ * The caller should ensure that the GPU remains powered-on during this function
+ * and the caller must ensure this function returns success before using the
+ * values returned in the curr_config_regdump in any part of the kernel.
+ *
+ * Return: Zero for success or a Linux error code
+ */
+int kbase_backend_gpuprops_get_curr_config(struct kbase_device *kbdev,
+		struct kbase_current_config_regdump *curr_config_regdump);
+
+/**
 * kbase_backend_gpuprops_get_features - Fill @regdump with GPU properties read
 *				       from GPU
 * @kbdev: Device pointer
diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf.c b/mali_kbase/mali_kbase_hwcnt_backend_csf.c
index c1bc7fc..4bc62c1 100644
--- a/mali_kbase/mali_kbase_hwcnt_backend_csf.c
+++ b/mali_kbase/mali_kbase_hwcnt_backend_csf.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
 *
- * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@@ -127,41 +127,31 @@ enum kbase_hwcnt_backend_csf_enable_state {
 * struct kbase_hwcnt_backend_csf_info - Information used to create an instance
 *                                       of a CSF hardware counter backend.
 * @backend:                      Pointer to access CSF backend.
- * @lock:                         Spinlock protecting backend and its internal
- *                                states.
 * @fw_in_protected_mode:         True if FW is running in protected mode, else
 *                                false.
 * @unrecoverable_error_happened: True if an unrecoverable error happened, else
 *                                false.
- * @csf_if:                       CSF interface object pointer. Functions inside
- *                                this interface MUST never be called while
- *                                holding the spin lock, as that could cause
- *                                deadlocks.
+ * @csf_if:                       CSF interface object pointer.
 * @ring_buf_cnt:                 Dump buffer count in the ring buffer.
 * @counter_set:                  The performance counter set to use.
 * @metadata:                     Hardware counter metadata.
- * @dump_bytes:                   Bytes of GPU memory required to perform a
- *                                hardware counter dump.
- * @gpu_info:                     GPU information to initialise HWC dump memory
- *                                layout.
+ * @prfcnt_info:                  Performance counter information.
*/ struct kbase_hwcnt_backend_csf_info { struct kbase_hwcnt_backend_csf *backend; - spinlock_t lock; bool fw_in_protected_mode; bool unrecoverable_error_happened; struct kbase_hwcnt_backend_csf_if *csf_if; u32 ring_buf_cnt; enum kbase_hwcnt_set counter_set; const struct kbase_hwcnt_metadata *metadata; - size_t dump_bytes; - struct kbase_hwcnt_gpu_info gpu_info; + struct kbase_hwcnt_backend_csf_if_prfcnt_info prfcnt_info; }; /** * struct kbase_hwcnt_csf_physical_layout - HWC sample memory physical layout * information. - * @fe_cnt: FroneEnd block count. + * @fe_cnt: Front end block count. * @tiler_cnt: Tiler block count. * @mmu_l2_cnt: Memory system(MMU and L2 cache) block count. * @shader_cnt: Shader Core block count. @@ -207,7 +197,7 @@ struct kbase_hwcnt_csf_physical_layout { * count for sample period. * @phys_layout: Physical memory layout information of HWC * sample buffer. - * @dump_completed: Completion signalled by the dump worker when + * @dump_completed: Completion signaled by the dump worker when * it is completed accumulating up to the * insert_index_to_accumulate. * Should be initialized to the "complete" state. @@ -242,7 +232,7 @@ bool kbasep_hwcnt_backend_csf_backend_exists( struct kbase_hwcnt_backend_csf_info *csf_info) { WARN_ON(!csf_info); - lockdep_assert_held(&csf_info->lock); + csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); return (csf_info->backend != NULL); } @@ -280,6 +270,9 @@ kbasep_hwcnt_backend_csf_cc_update(struct kbase_hwcnt_backend_csf *backend_csf) u64 cycle_counts[BASE_MAX_NR_CLOCKS_REGULATORS]; size_t clk; + backend_csf->info->csf_if->assert_lock_held( + backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->get_gpu_cycle_count( backend_csf->info->csf_if->ctx, cycle_counts, backend_csf->clk_enable_map); @@ -310,10 +303,9 @@ kbasep_hwcnt_backend_csf_timestamp_ns(struct kbase_hwcnt_backend *backend) } /** kbasep_hwcnt_backend_csf_process_enable_map() - Process the enable_map to - * guarantee the header is - * enabled, the header will be - * used when do the samples - * delta calculation. + * guarantee headers are + * enabled if any counter is + * required. *@phys_enable_map: HWC physical enable map to be processed. 
*/ static void kbasep_hwcnt_backend_csf_process_enable_map( @@ -338,21 +330,21 @@ static void kbasep_hwcnt_backend_csf_process_enable_map( } static void kbasep_hwcnt_backend_csf_init_layout( - const struct kbase_hwcnt_gpu_info *gpu_info, + const struct kbase_hwcnt_backend_csf_if_prfcnt_info *prfcnt_info, struct kbase_hwcnt_csf_physical_layout *phys_layout) { - WARN_ON(!gpu_info); + WARN_ON(!prfcnt_info); WARN_ON(!phys_layout); phys_layout->fe_cnt = 1; phys_layout->tiler_cnt = 1; - phys_layout->mmu_l2_cnt = gpu_info->l2_count; - phys_layout->shader_cnt = fls64(gpu_info->core_mask); + phys_layout->mmu_l2_cnt = prfcnt_info->l2_count; + phys_layout->shader_cnt = fls64(prfcnt_info->core_mask); phys_layout->block_cnt = phys_layout->fe_cnt + phys_layout->tiler_cnt + phys_layout->mmu_l2_cnt + phys_layout->shader_cnt; - phys_layout->shader_avail_mask = gpu_info->core_mask; + phys_layout->shader_avail_mask = prfcnt_info->core_mask; phys_layout->headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; phys_layout->counters_per_block = KBASE_HWCNT_V5_COUNTERS_PER_BLOCK; @@ -363,9 +355,12 @@ static void kbasep_hwcnt_backend_csf_init_layout( static void kbasep_hwcnt_backend_csf_reset_internal_buffers( struct kbase_hwcnt_backend_csf *backend_csf) { - memset(backend_csf->to_user_buf, 0, backend_csf->info->dump_bytes); - memset(backend_csf->accum_buf, 0, backend_csf->info->dump_bytes); - memset(backend_csf->old_sample_buf, 0, backend_csf->info->dump_bytes); + memset(backend_csf->to_user_buf, 0, + backend_csf->info->prfcnt_info.dump_bytes); + memset(backend_csf->accum_buf, 0, + backend_csf->info->prfcnt_info.dump_bytes); + memset(backend_csf->old_sample_buf, 0, + backend_csf->info->prfcnt_info.dump_bytes); } static void kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header( @@ -389,12 +384,12 @@ static void kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header( u32 idx; u32 *sample; char *cpu_dump_base; + size_t dump_bytes = backend_csf->info->prfcnt_info.dump_bytes; cpu_dump_base = (char *)backend_csf->ring_buf_cpu_base; for (idx = 0; idx < backend_csf->info->ring_buf_cnt; idx++) { - sample = (u32 *)&cpu_dump_base[idx * - backend_csf->info->dump_bytes]; + sample = (u32 *)&cpu_dump_base[idx * dump_bytes]; kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header( backend_csf, sample); } @@ -405,19 +400,20 @@ static void kbasep_hwcnt_backend_csf_update_user_sample( { /* Copy the data into the sample and wait for the user to get it. */ memcpy(backend_csf->to_user_buf, backend_csf->accum_buf, - backend_csf->info->dump_bytes); + backend_csf->info->prfcnt_info.dump_bytes); /* After copied data into user sample, clear the accumulator values to * prepare for the next accumulator, such as the next request or * threshold. 
*/ - memset(backend_csf->accum_buf, 0, backend_csf->info->dump_bytes); + memset(backend_csf->accum_buf, 0, + backend_csf->info->prfcnt_info.dump_bytes); } static void kbasep_hwcnt_backend_csf_accumulate_sample( const struct kbase_hwcnt_csf_physical_layout *phys_layout, size_t dump_bytes, u32 *accum_buf, const u32 *old_sample_buf, - const u32 *new_sample_buf) + const u32 *new_sample_buf, bool clearing_samples) { size_t block_idx, ctr_idx; const u32 *old_block = old_sample_buf; @@ -425,6 +421,8 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( u32 *acc_block = accum_buf; for (block_idx = 0; block_idx < phys_layout->block_cnt; block_idx++) { + const u32 old_enable_mask = + old_block[phys_layout->offset_enable_mask]; const u32 new_enable_mask = new_block[phys_layout->offset_enable_mask]; @@ -442,11 +440,63 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( phys_layout->headers_per_block * KBASE_HWCNT_VALUE_BYTES); - /* Accumulate the counters. */ - for (ctr_idx = phys_layout->headers_per_block; - ctr_idx < phys_layout->values_per_block; - ctr_idx++) { - acc_block[ctr_idx] += new_block[ctr_idx]; + /* Accumulate counter samples + * + * When accumulating samples we need to take into + * account whether the counter sampling method involves + * clearing counters back to zero after each sample is + * taken. + * + * The intention for CSF was that all HW should use + * counters which wrap to zero when their maximum value + * is reached. This, combined with non-clearing + * sampling, enables multiple concurrent users to + * request samples without interfering with each other. + * + * However some early HW may not support wrapping + * counters, for these GPUs counters must be cleared on + * sample to avoid loss of data due to counters + * saturating at their maximum value. + */ + if (!clearing_samples) { + if (old_enable_mask == 0) { + /* Hardware block was previously + * unavailable. Accumulate the new + * counters only, as we know previous + * values are zeroes. + */ + for (ctr_idx = + phys_layout + ->headers_per_block; + ctr_idx < + phys_layout->values_per_block; + ctr_idx++) { + acc_block[ctr_idx] += + new_block[ctr_idx]; + } + } else { + /* Hardware block was previously + * available. Accumulate the delta + * between old and new counter values. + */ + for (ctr_idx = + phys_layout + ->headers_per_block; + ctr_idx < + phys_layout->values_per_block; + ctr_idx++) { + acc_block[ctr_idx] += + new_block[ctr_idx] - + old_block[ctr_idx]; + } + } + } else { + for (ctr_idx = phys_layout->headers_per_block; + ctr_idx < phys_layout->values_per_block; + ctr_idx++) { + acc_block[ctr_idx] += + new_block[ctr_idx]; + } } } old_block += phys_layout->values_per_block; @@ -467,9 +517,11 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples( u32 insert_index_to_stop) { u32 raw_idx; + unsigned long flags; u8 *cpu_dump_base = (u8 *)backend_csf->ring_buf_cpu_base; const size_t ring_buf_cnt = backend_csf->info->ring_buf_cnt; - const size_t buf_dump_bytes = backend_csf->info->dump_bytes; + const size_t buf_dump_bytes = backend_csf->info->prfcnt_info.dump_bytes; + bool clearing_samples = backend_csf->info->prfcnt_info.clearing_samples; u32 *old_sample_buf = backend_csf->old_sample_buf; u32 *new_sample_buf; @@ -478,9 +530,10 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples( return; /* Sync all the buffers to CPU side before read the data. 
*/ - backend_csf->info->csf_if->ring_buf_sync( - backend_csf->info->csf_if->ctx, backend_csf->ring_buf, - extract_index_to_start, (insert_index_to_stop - 1), true); + backend_csf->info->csf_if->ring_buf_sync(backend_csf->info->csf_if->ctx, + backend_csf->ring_buf, + extract_index_to_start, + insert_index_to_stop, true); /* Consider u32 wrap case, '!=' is used here instead of '<' operator */ for (raw_idx = extract_index_to_start; raw_idx != insert_index_to_stop; @@ -495,7 +548,8 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples( kbasep_hwcnt_backend_csf_accumulate_sample( &backend_csf->phys_layout, buf_dump_bytes, - backend_csf->accum_buf, old_sample_buf, new_sample_buf); + backend_csf->accum_buf, old_sample_buf, new_sample_buf, + clearing_samples); old_sample_buf = new_sample_buf; } @@ -514,23 +568,28 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples( } /* Sync zeroed buffers to avoid coherency issues on future use. */ - backend_csf->info->csf_if->ring_buf_sync( - backend_csf->info->csf_if->ctx, backend_csf->ring_buf, - extract_index_to_start, (insert_index_to_stop - 1), false); + backend_csf->info->csf_if->ring_buf_sync(backend_csf->info->csf_if->ctx, + backend_csf->ring_buf, + extract_index_to_start, + insert_index_to_stop, false); /* After consuming all samples between extract_idx and insert_idx, * set the raw extract index to insert_idx so that the sample buffers * can be released back to the ring buffer pool. */ + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); backend_csf->info->csf_if->set_extract_index( backend_csf->info->csf_if->ctx, insert_index_to_stop); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); } static void kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( struct kbase_hwcnt_backend_csf *backend_csf, enum kbase_hwcnt_backend_csf_enable_state new_state) { - lockdep_assert_held(&backend_csf->info->lock); + backend_csf->info->csf_if->assert_lock_held( + backend_csf->info->csf_if->ctx); if (backend_csf->enable_state != new_state) { backend_csf->enable_state = new_state; @@ -558,21 +617,19 @@ static void kbasep_hwcnt_backend_csf_dump_worker(struct work_struct *work) WARN_ON(!work); backend_csf = container_of(work, struct kbase_hwcnt_backend_csf, hwc_dump_work); - - spin_lock_irqsave(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); /* Assert the backend is not destroyed. */ WARN_ON(backend_csf != backend_csf->info->backend); /* The backend was disabled or had an error while the worker was being * launched. */ - if (backend_csf->enable_state != - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED && - backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) { + if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) { WARN_ON(backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE); WARN_ON(!completion_done(&backend_csf->dump_completed)); - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock( + backend_csf->info->csf_if->ctx, flags); return; } @@ -581,12 +638,14 @@ static void kbasep_hwcnt_backend_csf_dump_worker(struct work_struct *work) backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_ACCUMULATING; insert_index_to_acc = backend_csf->insert_index_to_accumulate; - spin_unlock_irqrestore(&backend_csf->info->lock, flags); /* Read the raw extract and insert indexes from the CSF interface. 
*/ backend_csf->info->csf_if->get_indexes(backend_csf->info->csf_if->ctx, &extract_index, &insert_index); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); + /* Accumulate up to the insert we grabbed at the prfcnt request * interrupt. */ @@ -599,19 +658,18 @@ static void kbasep_hwcnt_backend_csf_dump_worker(struct work_struct *work) kbasep_hwcnt_backend_csf_update_user_sample(backend_csf); /* Dump done, set state back to COMPLETED for next request. */ - spin_lock_irqsave(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); /* Assert the backend is not destroyed. */ WARN_ON(backend_csf != backend_csf->info->backend); /* The backend was disabled or had an error while we were accumulating. */ - if (backend_csf->enable_state != - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED && - backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) { + if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) { WARN_ON(backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE); WARN_ON(!completion_done(&backend_csf->dump_completed)); - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock( + backend_csf->info->csf_if->ctx, flags); return; } @@ -621,7 +679,8 @@ static void kbasep_hwcnt_backend_csf_dump_worker(struct work_struct *work) /* Our work here is done - set the wait object and unblock waiters. */ backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED; complete_all(&backend_csf->dump_completed); - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); } /** @@ -643,20 +702,21 @@ static void kbasep_hwcnt_backend_csf_threshold_worker(struct work_struct *work) backend_csf = container_of(work, struct kbase_hwcnt_backend_csf, hwc_threshold_work); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); + + /* Assert the backend is not destroyed. */ + WARN_ON(backend_csf != backend_csf->info->backend); /* Read the raw extract and insert indexes from the CSF interface. */ backend_csf->info->csf_if->get_indexes(backend_csf->info->csf_if->ctx, &extract_index, &insert_index); - spin_lock_irqsave(&backend_csf->info->lock, flags); - /* Assert the backend is not destroyed. */ - WARN_ON(backend_csf != backend_csf->info->backend); - /* The backend was disabled or had an error while the worker was being * launched. */ if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) { - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock( + backend_csf->info->csf_if->ctx, flags); return; } @@ -667,14 +727,19 @@ static void kbasep_hwcnt_backend_csf_threshold_worker(struct work_struct *work) if ((backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE) && (backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED)) { - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock( + backend_csf->info->csf_if->ctx, flags); return; } - spin_unlock_irqrestore(&backend_csf->info->lock, flags); - - /* Accumulate everything we possibly can. We grabbed offsets before the - * spin lock, so we know it is not possible for a concurrent dump's - * insert_to_accumulate to exceed the insert we grabbed. + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); + + /* Accumulate everything we possibly can. 
We grabbed the insert index + * immediately after we acquired the lock but before we checked whether + * a concurrent dump was triggered. This ensures that if a concurrent + * dump was triggered between releasing the lock and now, we know for a + * fact that our insert will not exceed the concurrent dump's + * insert_to_accumulate, so we don't risk accumulating too much data. */ kbasep_hwcnt_backend_csf_accumulate_samples(backend_csf, extract_index, insert_index); @@ -685,45 +750,31 @@ static void kbasep_hwcnt_backend_csf_threshold_worker(struct work_struct *work) static void kbase_hwcnt_backend_csf_submit_dump_worker( struct kbase_hwcnt_backend_csf_info *csf_info) { - unsigned long flags; u32 extract_index; - u32 insert_index; WARN_ON(!csf_info); - - csf_info->csf_if->get_indexes(csf_info->csf_if->ctx, &extract_index, - &insert_index); - - spin_lock_irqsave(&csf_info->lock, flags); - - /* Make sure the backend exists and is in the correct state. - * A lot of things could have happened to it in the period before we - * acquired the lock. + csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); + + WARN_ON(!kbasep_hwcnt_backend_csf_backend_exists(csf_info)); + WARN_ON(csf_info->backend->enable_state != + KBASE_HWCNT_BACKEND_CSF_ENABLED); + WARN_ON(csf_info->backend->dump_state != + KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT); + + /* Save insert index now so that the dump worker only accumulates the + * HWC data associated with this request. Extract index is not stored + * as that needs to be checked when accumulating to prevent re-reading + * buffers that have already been read and returned to the GPU. */ - if (kbasep_hwcnt_backend_csf_backend_exists(csf_info) && - (csf_info->backend->enable_state == - KBASE_HWCNT_BACKEND_CSF_ENABLED || - csf_info->backend->enable_state == - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) && - csf_info->backend->dump_state == - KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT) { - csf_info->backend->insert_index_to_accumulate = insert_index; - csf_info->backend->dump_state = - KBASE_HWCNT_BACKEND_CSF_DUMP_WORKER_LAUNCHED; - - /* Submit the accumulator task into the work queue. */ - while (true != queue_work(csf_info->backend->hwc_dump_workq, - &csf_info->backend->hwc_dump_work)) { - /* Spin until we have guaranteed the work has been - * submitted. - * Without this there is a potential race where a prior - * submission of the work may still technically be on - * the queue, even though all of its work is complete. - */ - } - } - - spin_unlock_irqrestore(&csf_info->lock, flags); + csf_info->csf_if->get_indexes( + csf_info->csf_if->ctx, &extract_index, + &csf_info->backend->insert_index_to_accumulate); + csf_info->backend->dump_state = + KBASE_HWCNT_BACKEND_CSF_DUMP_WORKER_LAUNCHED; + + /* Submit the accumulator task into the work queue. 
*/ + queue_work(csf_info->backend->hwc_dump_workq, + &csf_info->backend->hwc_dump_work); } static void kbasep_hwcnt_backend_csf_get_physical_enable( @@ -753,59 +804,36 @@ static void kbasep_hwcnt_backend_csf_get_physical_enable( enable->clk_enable_map = enable_map->clk_enable_map; } -static int kbasep_hwcnt_backend_csf_dump_enable_impl( +/* CSF backend implementation of kbase_hwcnt_backend_dump_enable_nolock_fn */ +static int kbasep_hwcnt_backend_csf_dump_enable_nolock( struct kbase_hwcnt_backend *backend, - const struct kbase_hwcnt_enable_map *enable_map, - struct kbase_hwcnt_backend_csf_if_enable *out_enable) + const struct kbase_hwcnt_enable_map *enable_map) { - unsigned long flags; struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; struct kbase_hwcnt_backend_csf_if_enable enable; - WARN_ON(!out_enable); - if (!backend_csf || !enable_map || (enable_map->metadata != backend_csf->info->metadata)) return -EINVAL; + backend_csf->info->csf_if->assert_lock_held( + backend_csf->info->csf_if->ctx); + kbasep_hwcnt_backend_csf_get_physical_enable(backend_csf, enable_map, &enable); - spin_lock_irqsave(&backend_csf->info->lock, flags); /* enable_state should be DISABLED before we transfer it to enabled */ - if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_DISABLED) { - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_DISABLED) return -EIO; - } backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE; WARN_ON(!completion_done(&backend_csf->dump_completed)); kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( backend_csf, KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED); - spin_unlock_irqrestore(&backend_csf->info->lock, flags); - *out_enable = enable; - return 0; -} - -/* CSF backend implementation of kbase_hwcnt_backend_dump_enable_nolock_fn */ -static int kbasep_hwcnt_backend_csf_dump_enable_nolock( - struct kbase_hwcnt_backend *backend, - const struct kbase_hwcnt_enable_map *enable_map) -{ - int errcode; - struct kbase_hwcnt_backend_csf *backend_csf = - (struct kbase_hwcnt_backend_csf *)backend; - struct kbase_hwcnt_backend_csf_if_enable enable; - - errcode = kbasep_hwcnt_backend_csf_dump_enable_impl(backend, enable_map, - &enable); - if (errcode) - return errcode; - - backend_csf->info->csf_if->dump_enable_nolock( - backend_csf->info->csf_if->ctx, backend_csf->ring_buf, &enable); + backend_csf->info->csf_if->dump_enable(backend_csf->info->csf_if->ctx, + backend_csf->ring_buf, &enable); kbasep_hwcnt_backend_csf_cc_initial_sample(backend_csf, enable_map); @@ -818,33 +846,33 @@ static int kbasep_hwcnt_backend_csf_dump_enable( const struct kbase_hwcnt_enable_map *enable_map) { int errcode; + unsigned long flags; struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; - struct kbase_hwcnt_backend_csf_if_enable enable; - - errcode = kbasep_hwcnt_backend_csf_dump_enable_impl(backend, enable_map, - &enable); - if (errcode) - return errcode; - - backend_csf->info->csf_if->dump_enable(backend_csf->info->csf_if->ctx, - backend_csf->ring_buf, &enable); - kbasep_hwcnt_backend_csf_cc_initial_sample(backend_csf, enable_map); + if (!backend_csf) + return -EINVAL; - return 0; + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); + errcode = kbasep_hwcnt_backend_csf_dump_enable_nolock(backend, + enable_map); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); + return errcode; } static void 
kbasep_hwcnt_backend_csf_wait_enable_transition_complete( struct kbase_hwcnt_backend_csf *backend_csf, unsigned long *lock_flags) { - lockdep_assert_held(&backend_csf->info->lock); + backend_csf->info->csf_if->assert_lock_held( + backend_csf->info->csf_if->ctx); while ((backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) || (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED)) { - spin_unlock_irqrestore(&backend_csf->info->lock, *lock_flags); + backend_csf->info->csf_if->unlock( + backend_csf->info->csf_if->ctx, *lock_flags); wait_event( backend_csf->enable_state_waitq, @@ -853,7 +881,8 @@ static void kbasep_hwcnt_backend_csf_wait_enable_transition_complete( (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED)); - spin_lock_irqsave(&backend_csf->info->lock, *lock_flags); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, + lock_flags); } } @@ -868,7 +897,7 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) WARN_ON(!backend_csf); - spin_lock_irqsave(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); /* Make sure we wait until any previous enable or disable have completed * before doing anything. @@ -882,7 +911,8 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) /* If we are already disabled or in an unrecoverable error * state, there is nothing for us to do. */ - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock( + backend_csf->info->csf_if->ctx, flags); return; } @@ -901,7 +931,8 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) WARN_ON(backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE); WARN_ON(!completion_done(&backend_csf->dump_completed)); - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); /* Block until any async work has completed. We have transitioned out of * the ENABLED state so we can guarantee no new work will concurrently @@ -909,23 +940,16 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) */ flush_workqueue(backend_csf->hwc_dump_workq); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); + if (do_disable) backend_csf->info->csf_if->dump_disable( backend_csf->info->csf_if->ctx); - spin_lock_irqsave(&backend_csf->info->lock, flags); - kbasep_hwcnt_backend_csf_wait_enable_transition_complete(backend_csf, &flags); switch (backend_csf->enable_state) { - case KBASE_HWCNT_BACKEND_CSF_DISABLED: - case KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED: - case KBASE_HWCNT_BACKEND_CSF_ENABLED: - case KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED: - case KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR: - WARN_ON(true); - break; case KBASE_HWCNT_BACKEND_CSF_DISABLED_WAIT_FOR_WORKER: kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( backend_csf, KBASE_HWCNT_BACKEND_CSF_DISABLED); @@ -935,9 +959,13 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) backend_csf, KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR); break; + default: + WARN_ON(true); + break; } - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); /* After disable, zero the header of all buffers in the ring buffer back * to 0 to prepare for the next enable. 
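With the backend's private spinlock gone, all synchronisation in this file goes through the CSF interface's lock()/unlock() callbacks, so kbasep_hwcnt_backend_csf_wait_enable_transition_complete() has to release that lock before sleeping on enable_state_waitq and re-take it before re-checking the state. A minimal sketch of this drop-wait-retake pattern follows; the names hwc_if and hwc_wait_transition are illustrative stand-ins, not real kbase symbols.

/* Sketch only: illustrates the locking pattern, not actual kbase code. */
#include <linux/spinlock.h>
#include <linux/wait.h>

enum hwc_state { HWC_DISABLED, HWC_TRANSITIONING, HWC_ENABLED };

struct hwc_if {
	spinlock_t lock;		/* stands in for csf_if->lock()/unlock() */
	wait_queue_head_t waitq;	/* stands in for enable_state_waitq */
	enum hwc_state state;
};

/* Called with ifc->lock held (irqsave); returns with it held again. */
static void hwc_wait_transition(struct hwc_if *ifc, unsigned long *flags)
{
	while (ifc->state == HWC_TRANSITIONING) {
		/* Never sleep while holding the lock. */
		spin_unlock_irqrestore(&ifc->lock, *flags);
		wait_event(ifc->waitq, ifc->state != HWC_TRANSITIONING);
		/* Re-take the lock and loop to re-check the state. */
		spin_lock_irqsave(&ifc->lock, *flags);
	}
}

The same discipline is why kbasep_hwcnt_backend_csf_dump_disable() drops the interface lock around flush_workqueue() and only re-takes it afterwards to inspect enable_state.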
@@ -947,7 +975,7 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) /* Sync zeroed buffers to avoid coherency issues on future use. */ backend_csf->info->csf_if->ring_buf_sync( backend_csf->info->csf_if->ctx, backend_csf->ring_buf, 0, - (backend_csf->info->ring_buf_cnt - 1), false); + backend_csf->info->ring_buf_cnt, false); /* Reset accumulator, old_sample_buf and user_sample to all-0 to prepare * for next enable. @@ -968,12 +996,27 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, if (!backend_csf) return -EINVAL; - spin_lock_irqsave(&backend_csf->info->lock, flags); - /* Make sure we are enabled or becoming enabled. */ - if ((backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) && - (backend_csf->enable_state != - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED)) { - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); + + /* If we're transitioning to enabled there's nothing to accumulate, and + * the user dump buffer is already zeroed. We can just short circuit to + * the DUMP_COMPLETED state. + */ + if (backend_csf->enable_state == + KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) { + backend_csf->dump_state = + KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED; + *dump_time_ns = kbasep_hwcnt_backend_csf_timestamp_ns(backend); + kbasep_hwcnt_backend_csf_cc_update(backend_csf); + backend_csf->info->csf_if->unlock( + backend_csf->info->csf_if->ctx, flags); + return 0; + } + + /* Otherwise, make sure we're already enabled. */ + if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) { + backend_csf->info->csf_if->unlock( + backend_csf->info->csf_if->ctx, flags); return -EIO; } @@ -983,27 +1026,14 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, if ((backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE) && (backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED)) { - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock( + backend_csf->info->csf_if->ctx, flags); /* HWC is disabled or another dump is ongoing, or we are on * fault. */ return -EIO; } - /* If we are transitioning to enabled there is nothing to accumulate, - * and the user dump buffer is already zeroed. - * We can just short circuit to the DUMP_COMPLETED state. - */ - if (backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) { - backend_csf->dump_state = - KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED; - spin_unlock_irqrestore(&backend_csf->info->lock, flags); - *dump_time_ns = kbasep_hwcnt_backend_csf_timestamp_ns(backend); - kbasep_hwcnt_backend_csf_cc_update(backend_csf); - return 0; - } - /* Reset the completion so dump_wait() has something to wait on. */ reinit_completion(&backend_csf->dump_completed); @@ -1022,7 +1052,6 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT; } - spin_unlock_irqrestore(&backend_csf->info->lock, flags); /* CSF firmware might enter protected mode now, but still call request. 
* That is fine, as we changed state while holding the lock, so the @@ -1036,13 +1065,14 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, *dump_time_ns = kbasep_hwcnt_backend_csf_timestamp_ns(backend); kbasep_hwcnt_backend_csf_cc_update(backend_csf); - if (do_request) { + if (do_request) backend_csf->info->csf_if->dump_request( backend_csf->info->csf_if->ctx); - } else { + else kbase_hwcnt_backend_csf_submit_dump_worker(backend_csf->info); - } + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); return 0; } @@ -1060,13 +1090,14 @@ kbasep_hwcnt_backend_csf_dump_wait(struct kbase_hwcnt_backend *backend) wait_for_completion(&backend_csf->dump_completed); - spin_lock_irqsave(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); /* Make sure the last dump actually succeeded. */ errcode = (backend_csf->dump_state == KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED) ? 0 : -EIO; - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); return errcode; } @@ -1144,10 +1175,8 @@ kbasep_hwcnt_backend_csf_destroy(struct kbase_hwcnt_backend_csf *backend_csf) destroy_workqueue(backend_csf->hwc_dump_workq); - if (backend_csf->info->csf_if->ring_buf_free) { - backend_csf->info->csf_if->ring_buf_free( - backend_csf->info->csf_if->ctx, backend_csf->ring_buf); - } + backend_csf->info->csf_if->ring_buf_free(backend_csf->info->csf_if->ctx, + backend_csf->ring_buf); kfree(backend_csf->accum_buf); backend_csf->accum_buf = NULL; @@ -1183,18 +1212,21 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, goto alloc_error; backend_csf->info = csf_info; - kbasep_hwcnt_backend_csf_init_layout(&csf_info->gpu_info, + kbasep_hwcnt_backend_csf_init_layout(&csf_info->prfcnt_info, &backend_csf->phys_layout); - backend_csf->accum_buf = kzalloc(csf_info->dump_bytes, GFP_KERNEL); + backend_csf->accum_buf = + kzalloc(csf_info->prfcnt_info.dump_bytes, GFP_KERNEL); if (!backend_csf->accum_buf) goto err_alloc_acc_buf; - backend_csf->old_sample_buf = kzalloc(csf_info->dump_bytes, GFP_KERNEL); + backend_csf->old_sample_buf = + kzalloc(csf_info->prfcnt_info.dump_bytes, GFP_KERNEL); if (!backend_csf->old_sample_buf) goto err_alloc_pre_sample_buf; - backend_csf->to_user_buf = kzalloc(csf_info->dump_bytes, GFP_KERNEL); + backend_csf->to_user_buf = + kzalloc(csf_info->prfcnt_info.dump_bytes, GFP_KERNEL); if (!backend_csf->to_user_buf) goto err_alloc_user_sample_buf; @@ -1210,7 +1242,7 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, /* Sync zeroed buffers to avoid coherency issues on use. */ backend_csf->info->csf_if->ring_buf_sync( backend_csf->info->csf_if->ctx, backend_csf->ring_buf, 0, - (backend_csf->info->ring_buf_cnt - 1), false); + backend_csf->info->ring_buf_cnt, false); init_completion(&backend_csf->dump_completed); @@ -1278,17 +1310,17 @@ kbasep_hwcnt_backend_csf_init(const struct kbase_hwcnt_backend_info *info, /* If it was not created before, attach it to csf_info. * Use spin lock to avoid concurrent initialization. 
*/ - spin_lock_irqsave(&csf_info->lock, flags); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); if (csf_info->backend == NULL) { csf_info->backend = backend_csf; *out_backend = (struct kbase_hwcnt_backend *)backend_csf; success = true; - if (csf_info->unrecoverable_error_happened) { + if (csf_info->unrecoverable_error_happened) backend_csf->enable_state = KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR; - } } - spin_unlock_irqrestore(&csf_info->lock, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); /* Destroy the new created backend if the backend has already created * before. In normal case, this won't happen if the client call init() @@ -1317,9 +1349,10 @@ static void kbasep_hwcnt_backend_csf_term(struct kbase_hwcnt_backend *backend) /* Set the backend in csf_info to NULL so we won't handle any external * notification anymore since we are terminating. */ - spin_lock_irqsave(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); backend_csf->info->backend = NULL; - spin_unlock_irqrestore(&backend_csf->info->lock, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, + flags); kbasep_hwcnt_backend_csf_destroy(backend_csf); } @@ -1370,8 +1403,6 @@ static int kbasep_hwcnt_backend_csf_info_create( if (!info) return -ENOMEM; - spin_lock_init(&info->lock); - #if defined(CONFIG_MALI_PRFCNT_SET_SECONDARY) info->counter_set = KBASE_HWCNT_SET_SECONDARY; #elif defined(CONFIG_MALI_PRFCNT_SET_TERTIARY) @@ -1405,11 +1436,12 @@ kbasep_hwcnt_backend_csf_metadata(const struct kbase_hwcnt_backend_info *info) } static void kbasep_hwcnt_backend_csf_handle_unrecoverable_error( - struct kbase_hwcnt_backend_csf *backend_csf, unsigned long *lock_flags) + struct kbase_hwcnt_backend_csf *backend_csf) { bool do_disable = false; - lockdep_assert_held(&backend_csf->info->lock); + backend_csf->info->csf_if->assert_lock_held( + backend_csf->info->csf_if->ctx); /* We are already in or transitioning to the unrecoverable error state. * Early out. @@ -1451,18 +1483,16 @@ static void kbasep_hwcnt_backend_csf_handle_unrecoverable_error( * disabled, - we don't want to disable twice if an unrecoverable error * happens while we are disabling. */ - if (do_disable) { - spin_unlock_irqrestore(&backend_csf->info->lock, *lock_flags); + if (do_disable) backend_csf->info->csf_if->dump_disable( backend_csf->info->csf_if->ctx); - spin_lock_irqsave(&backend_csf->info->lock, *lock_flags); - } } static void kbasep_hwcnt_backend_csf_handle_recoverable_error( - struct kbase_hwcnt_backend_csf *backend_csf, unsigned long *lock_flags) + struct kbase_hwcnt_backend_csf *backend_csf) { - lockdep_assert_held(&backend_csf->info->lock); + backend_csf->info->csf_if->assert_lock_held( + backend_csf->info->csf_if->ctx); switch (backend_csf->enable_state) { case KBASE_HWCNT_BACKEND_CSF_DISABLED: @@ -1478,8 +1508,8 @@ static void kbasep_hwcnt_backend_csf_handle_recoverable_error( /* A seemingly recoverable error that occurs while we are * transitioning to enabled is probably unrecoverable. */ - kbasep_hwcnt_backend_csf_handle_unrecoverable_error(backend_csf, - lock_flags); + kbasep_hwcnt_backend_csf_handle_unrecoverable_error( + backend_csf); return; case KBASE_HWCNT_BACKEND_CSF_ENABLED: /* Start transitioning to the disabled state. 
We can't wait for @@ -1496,14 +1526,8 @@ static void kbasep_hwcnt_backend_csf_handle_recoverable_error( backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE; complete_all(&backend_csf->dump_completed); - /* Unlock spin lock before we call csf_if disable(). */ - spin_unlock_irqrestore(&backend_csf->info->lock, *lock_flags); - backend_csf->info->csf_if->dump_disable( backend_csf->info->csf_if->ctx); - - /* Lock spin lock again to match the spin lock pairs. */ - spin_lock_irqsave(&backend_csf->info->lock, *lock_flags); return; } } @@ -1511,44 +1535,27 @@ static void kbasep_hwcnt_backend_csf_handle_recoverable_error( void kbase_hwcnt_backend_csf_protm_entered( struct kbase_hwcnt_backend_interface *iface) { - unsigned long flags; - struct kbase_hwcnt_backend_csf_info *csf_info; - struct kbase_hwcnt_backend_csf *backend_csf; - - csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; + struct kbase_hwcnt_backend_csf_info *csf_info = + (struct kbase_hwcnt_backend_csf_info *)iface->info; - spin_lock_irqsave(&csf_info->lock, flags); + csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); csf_info->fw_in_protected_mode = true; - /* Early out if the backend does not exist. */ - if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) { - spin_unlock_irqrestore(&csf_info->lock, flags); - return; - } - - backend_csf = csf_info->backend; - /* If we are not in REQUESTED state, we don't need to do the dumping. */ - if (backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_REQUESTED) { - spin_unlock_irqrestore(&csf_info->lock, flags); - return; - } - backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT; - - spin_unlock_irqrestore(&csf_info->lock, flags); - kbase_hwcnt_backend_csf_submit_dump_worker(csf_info); + /* Call on_prfcnt_sample() to trigger collection of the protected mode + * entry auto-sample if there is currently a pending dump request. + */ + kbase_hwcnt_backend_csf_on_prfcnt_sample(iface); } void kbase_hwcnt_backend_csf_protm_exited( struct kbase_hwcnt_backend_interface *iface) { - unsigned long flags; struct kbase_hwcnt_backend_csf_info *csf_info; csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; - spin_lock_irqsave(&csf_info->lock, flags); + csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); csf_info->fw_in_protected_mode = false; - spin_unlock_irqrestore(&csf_info->lock, flags); } void kbase_hwcnt_backend_csf_on_unrecoverable_error( @@ -1559,18 +1566,17 @@ void kbase_hwcnt_backend_csf_on_unrecoverable_error( csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; - spin_lock_irqsave(&csf_info->lock, flags); + csf_info->csf_if->lock(csf_info->csf_if->ctx, &flags); csf_info->unrecoverable_error_happened = true; /* Early out if the backend does not exist. 
*/ if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) { - spin_unlock_irqrestore(&csf_info->lock, flags); + csf_info->csf_if->unlock(csf_info->csf_if->ctx, flags); return; } - kbasep_hwcnt_backend_csf_handle_unrecoverable_error(csf_info->backend, - &flags); + kbasep_hwcnt_backend_csf_handle_unrecoverable_error(csf_info->backend); - spin_unlock_irqrestore(&csf_info->lock, flags); + csf_info->csf_if->unlock(csf_info->csf_if->ctx, flags); } void kbase_hwcnt_backend_csf_on_before_reset( @@ -1582,11 +1588,11 @@ void kbase_hwcnt_backend_csf_on_before_reset( csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; - spin_lock_irqsave(&csf_info->lock, flags); + csf_info->csf_if->lock(csf_info->csf_if->ctx, &flags); csf_info->unrecoverable_error_happened = false; /* Early out if the backend does not exist. */ if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) { - spin_unlock_irqrestore(&csf_info->lock, flags); + csf_info->csf_if->unlock(csf_info->csf_if->ctx, flags); return; } backend_csf = csf_info->backend; @@ -1605,7 +1611,7 @@ void kbase_hwcnt_backend_csf_on_before_reset( * really matter, the power is being pulled. */ kbasep_hwcnt_backend_csf_handle_unrecoverable_error( - csf_info->backend, &flags); + csf_info->backend); } /* A reset is the only way to exit the unrecoverable error state */ @@ -1615,81 +1621,66 @@ void kbase_hwcnt_backend_csf_on_before_reset( backend_csf, KBASE_HWCNT_BACKEND_CSF_DISABLED); } - spin_unlock_irqrestore(&csf_info->lock, flags); + csf_info->csf_if->unlock(csf_info->csf_if->ctx, flags); } void kbase_hwcnt_backend_csf_on_prfcnt_sample( struct kbase_hwcnt_backend_interface *iface) { - unsigned long flags; struct kbase_hwcnt_backend_csf_info *csf_info; struct kbase_hwcnt_backend_csf *backend_csf; csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; + csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); - spin_lock_irqsave(&csf_info->lock, flags); /* Early out if the backend does not exist. */ - if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) { - spin_unlock_irqrestore(&csf_info->lock, flags); + if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) return; - } backend_csf = csf_info->backend; /* If the current state is not REQUESTED, this HWC sample will be * skipped and processed in next dump_request. */ - if (backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_REQUESTED) { - spin_unlock_irqrestore(&csf_info->lock, flags); + if (backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_REQUESTED) return; - } backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT; - spin_unlock_irqrestore(&csf_info->lock, flags); kbase_hwcnt_backend_csf_submit_dump_worker(csf_info); } void kbase_hwcnt_backend_csf_on_prfcnt_threshold( struct kbase_hwcnt_backend_interface *iface) { - unsigned long flags; struct kbase_hwcnt_backend_csf_info *csf_info; struct kbase_hwcnt_backend_csf *backend_csf; csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; + csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); - spin_lock_irqsave(&csf_info->lock, flags); /* Early out if the backend does not exist. 
*/ - if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) { - spin_unlock_irqrestore(&csf_info->lock, flags); + if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) return; - } backend_csf = csf_info->backend; - if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_ENABLED) { + if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_ENABLED) /* Submit the threshold work into the work queue to consume the * available samples. */ queue_work(backend_csf->hwc_dump_workq, &backend_csf->hwc_threshold_work); - } - - spin_unlock_irqrestore(&csf_info->lock, flags); } void kbase_hwcnt_backend_csf_on_prfcnt_overflow( struct kbase_hwcnt_backend_interface *iface) { - unsigned long flags; struct kbase_hwcnt_backend_csf_info *csf_info; csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; + csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); - spin_lock_irqsave(&csf_info->lock, flags); /* Early out if the backend does not exist. */ - if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) { - spin_unlock_irqrestore(&csf_info->lock, flags); + if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) return; - } /* Called when an overflow occurs. We treat this as a recoverable error, * so we start transitioning to the disabled state. @@ -1698,27 +1689,21 @@ void kbase_hwcnt_backend_csf_on_prfcnt_overflow( * complex recovery code when we can just turn ourselves off instead for * a while. */ - kbasep_hwcnt_backend_csf_handle_recoverable_error(csf_info->backend, - &flags); - - spin_unlock_irqrestore(&csf_info->lock, flags); + kbasep_hwcnt_backend_csf_handle_recoverable_error(csf_info->backend); } void kbase_hwcnt_backend_csf_on_prfcnt_enable( struct kbase_hwcnt_backend_interface *iface) { - unsigned long flags; struct kbase_hwcnt_backend_csf_info *csf_info; struct kbase_hwcnt_backend_csf *backend_csf; csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; + csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); - spin_lock_irqsave(&csf_info->lock, flags); /* Early out if the backend does not exist. */ - if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) { - spin_unlock_irqrestore(&csf_info->lock, flags); + if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) return; - } backend_csf = csf_info->backend; if (backend_csf->enable_state == @@ -1735,27 +1720,22 @@ void kbase_hwcnt_backend_csf_on_prfcnt_enable( * we reset. */ kbasep_hwcnt_backend_csf_handle_unrecoverable_error( - csf_info->backend, &flags); + csf_info->backend); } - - spin_unlock_irqrestore(&csf_info->lock, flags); } void kbase_hwcnt_backend_csf_on_prfcnt_disable( struct kbase_hwcnt_backend_interface *iface) { - unsigned long flags; struct kbase_hwcnt_backend_csf_info *csf_info; struct kbase_hwcnt_backend_csf *backend_csf; csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; + csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); - spin_lock_irqsave(&csf_info->lock, flags); /* Early out if the backend does not exist. */ - if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) { - spin_unlock_irqrestore(&csf_info->lock, flags); + if (!kbasep_hwcnt_backend_csf_backend_exists(csf_info)) return; - } backend_csf = csf_info->backend; if (backend_csf->enable_state == @@ -1773,10 +1753,8 @@ void kbase_hwcnt_backend_csf_on_prfcnt_disable( * we reset. 
*/ kbasep_hwcnt_backend_csf_handle_unrecoverable_error( - csf_info->backend, &flags); + csf_info->backend); } - - spin_unlock_irqrestore(&csf_info->lock, flags); } int kbase_hwcnt_backend_csf_metadata_init( @@ -1784,28 +1762,29 @@ int kbase_hwcnt_backend_csf_metadata_init( { int errcode; struct kbase_hwcnt_backend_csf_info *csf_info; + struct kbase_hwcnt_gpu_info gpu_info; if (!iface) return -EINVAL; csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; - WARN_ON(!csf_info->csf_if->get_gpu_info); - csf_info->csf_if->get_gpu_info(csf_info->csf_if->ctx, - &csf_info->dump_bytes, - &csf_info->gpu_info.l2_count, - &csf_info->gpu_info.core_mask, - &csf_info->gpu_info.clk_cnt); + WARN_ON(!csf_info->csf_if->get_prfcnt_info); + + csf_info->csf_if->get_prfcnt_info(csf_info->csf_if->ctx, + &csf_info->prfcnt_info); /* The clock domain counts should not exceed the number of maximum * number of clock regulators. */ - if (csf_info->gpu_info.clk_cnt > BASE_MAX_NR_CLOCKS_REGULATORS) + if (csf_info->prfcnt_info.clk_cnt > BASE_MAX_NR_CLOCKS_REGULATORS) return -EIO; - errcode = kbase_hwcnt_csf_metadata_create(&csf_info->gpu_info, - csf_info->counter_set, - &csf_info->metadata); + gpu_info.l2_count = csf_info->prfcnt_info.l2_count; + gpu_info.core_mask = csf_info->prfcnt_info.core_mask; + gpu_info.clk_cnt = csf_info->prfcnt_info.clk_cnt; + errcode = kbase_hwcnt_csf_metadata_create( + &gpu_info, csf_info->counter_set, &csf_info->metadata); if (errcode) return errcode; @@ -1813,7 +1792,8 @@ int kbase_hwcnt_backend_csf_metadata_init( * Dump abstraction size should be exactly the same size and layout as * the physical dump size, for backwards compatibility. */ - WARN_ON(csf_info->dump_bytes != csf_info->metadata->dump_buf_bytes); + WARN_ON(csf_info->prfcnt_info.dump_bytes != + csf_info->metadata->dump_buf_bytes); return 0; } @@ -1868,8 +1848,7 @@ int kbase_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_if *csf_if, return 0; } -void kbase_hwcnt_backend_csf_destroy( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_destroy(struct kbase_hwcnt_backend_interface *iface) { if (!iface) return; diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf.h b/mali_kbase/mali_kbase_hwcnt_backend_csf.h index 93938f0..7506274 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_csf.h +++ b/mali_kbase/mali_kbase_hwcnt_backend_csf.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -40,7 +40,7 @@ * @iface: Non-NULL pointer to backend interface structure that is filled * in on creation success. * - * Calls to iface->dump_enable_nolock() require kbdev->hwaccess_lock held. + * Calls to iface->dump_enable_nolock() require the CSF Scheduler IRQ lock. * * Return: 0 on success, else error code. */ @@ -77,7 +77,7 @@ void kbase_hwcnt_backend_csf_destroy( struct kbase_hwcnt_backend_interface *iface); /** - * kbase_hwcnt_backend_csf_protm_entered() - CSf HWC backend function to receive + * kbase_hwcnt_backend_csf_protm_entered() - CSF HWC backend function to receive * notification that protected mode * has been entered. * @iface: Non-NULL pointer to HWC backend interface. 
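The hunks above turn the protected-mode and performance-counter notifications into lock-asserting helpers, so surrounding code is expected to hold the CSF scheduler IRQ spinlock when calling them. A hedged sketch of such a call site, using the scheduler spinlock helpers that appear elsewhere in this change; notify_protm_entered() is a hypothetical wrapper, not driver code.

	/* Hypothetical call site: the notifier no longer takes a lock itself,
	 * so the caller brackets it with the scheduler IRQ spinlock.
	 */
	static void notify_protm_entered(struct kbase_device *kbdev,
					 struct kbase_hwcnt_backend_interface *iface)
	{
		unsigned long flags;

		kbase_csf_scheduler_spin_lock(kbdev, &flags);
		kbase_hwcnt_backend_csf_protm_entered(iface);
		kbase_csf_scheduler_spin_unlock(kbdev, flags);
	}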
@@ -86,7 +86,7 @@ void kbase_hwcnt_backend_csf_protm_entered( struct kbase_hwcnt_backend_interface *iface); /** - * kbase_hwcnt_backend_csf_protm_exited() - CSf HWC backend function to receive + * kbase_hwcnt_backend_csf_protm_exited() - CSF HWC backend function to receive * notification that protected mode has * been exited. * @iface: Non-NULL pointer to HWC backend interface. @@ -95,22 +95,20 @@ void kbase_hwcnt_backend_csf_protm_exited( struct kbase_hwcnt_backend_interface *iface); /** - * kbase_hwcnt_backend_csf_on_unrecoverable_error() - CSf HWC backend function - * to be called when an - * unrecoverable error - * occurs, such as the - * firmware has died or bus - * error, this puts us into - * the unrecoverable error - * state, which we can only - * get out of by a reset. + * kbase_hwcnt_backend_csf_on_unrecoverable_error() - CSF HWC backend function + * called when unrecoverable + * errors are detected. * @iface: Non-NULL pointer to HWC backend interface. + * + * This should be called on encountering errors that can only be recovered from + * with reset, or that may put HWC logic in state that could result in hang. For + * example, on bus error, or when FW becomes unresponsive. */ void kbase_hwcnt_backend_csf_on_unrecoverable_error( struct kbase_hwcnt_backend_interface *iface); /** - * kbase_hwcnt_backend_csf_on_before_reset() - CSf HWC backend function to be + * kbase_hwcnt_backend_csf_on_before_reset() - CSF HWC backend function to be * called immediately before a * reset. Takes us out of the * unrecoverable error state, if we diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf_if.h b/mali_kbase/mali_kbase_hwcnt_backend_csf_if.h index e86d240..b4ddd31 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_csf_if.h +++ b/mali_kbase/mali_kbase_hwcnt_backend_csf_if.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -61,19 +61,63 @@ struct kbase_hwcnt_backend_csf_if_enable { }; /** - * typedef kbase_hwcnt_backend_csf_if_get_gpu_info_fn - Get GPU information - * @ctx: Non-NULL pointer to a CSF context. - * @dump_size: Non-NULL pointer to where the dump size of performance counter - * sample is stored. - * @l2_count: Non-NULL pointer to where the MMU L2 cache count is stored. - * @core_mask: Non-NULL pointer to where shader core mask is stored. + * struct kbase_hwcnt_backend_csf_if_prfcnt_info - Performance counter + * information. + * @dump_bytes: Bytes of GPU memory required to perform a performance + * counter dump. + * @l2_count: The MMU L2 cache count. + * @core_mask: Shader core mask. + * @clk_cnt: Clock domain count in the system. + * @clearing_samples: Indicates whether counters are cleared after each sample + * is taken. + */ +struct kbase_hwcnt_backend_csf_if_prfcnt_info { + size_t dump_bytes; + size_t l2_count; + u64 core_mask; + u8 clk_cnt; + bool clearing_samples; +}; + +/** + * typedef kbase_hwcnt_backend_csf_if_assert_lock_held_fn - Assert that the + * backend spinlock is + * held. + * @ctx: Non-NULL pointer to a CSF context. + */ +typedef void (*kbase_hwcnt_backend_csf_if_assert_lock_held_fn)( + struct kbase_hwcnt_backend_csf_if_ctx *ctx); + +/** + * typedef kbase_hwcnt_backend_csf_if_lock_fn - Acquire backend spinlock. + * + * @ctx: Non-NULL pointer to a CSF context. 
+ * @flags: Pointer to the memory location that would store the previous + * interrupt state. + */ +typedef void (*kbase_hwcnt_backend_csf_if_lock_fn)( + struct kbase_hwcnt_backend_csf_if_ctx *ctx, unsigned long *flags); + +/** + * typedef kbase_hwcnt_backend_csf_if_unlock_fn - Release backend spinlock. * - * @clk_cnt: Non-NULL pointer to where clock domain count in the system is - * stored. + * @ctx: Non-NULL pointer to a CSF context. + * @flags: Previously stored interrupt state when Scheduler interrupt + * spinlock was acquired. */ -typedef void (*kbase_hwcnt_backend_csf_if_get_gpu_info_fn)( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, size_t *dump_size, - size_t *l2_count, u64 *core_mask, u8 *clk_cnt); +typedef void (*kbase_hwcnt_backend_csf_if_unlock_fn)( + struct kbase_hwcnt_backend_csf_if_ctx *ctx, unsigned long flags); + +/** + * typedef kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn - Get performance + * counter information. + * @ctx: Non-NULL pointer to a CSF context. + * @prfcnt_info: Non-NULL pointer to struct where performance counter + * information should be stored. + */ +typedef void (*kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn)( + struct kbase_hwcnt_backend_csf_if_ctx *ctx, + struct kbase_hwcnt_backend_csf_if_prfcnt_info *prfcnt_info); /** * typedef kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn - Allocate a ring buffer @@ -105,14 +149,13 @@ typedef int (*kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn)( * inclusive. * @buf_index_last: The last buffer index in the ring buffer to be synced, * exclusive. - * @for_cpu: The direction of sync to be applied. - * It is set to true when CPU cache needs to be invalidated - * before reading the ring buffer contents. And set to false - * when CPU cache needs to be flushed after writing to the - * ring buffer. + * @for_cpu: The direction of sync to be applied, set to true when CPU + * cache needs invalidating before reading the buffer, and set + * to false after CPU writes to flush these before this memory + * is overwritten by the GPU. * - * After HWC sample request is done in GPU side, we need to sync the dump memory - * to CPU side to access the HWC data. + * Flush cached HWC dump buffer data to ensure that all writes from GPU and CPU + * are correctly observed. */ typedef void (*kbase_hwcnt_backend_csf_if_ring_buf_sync_fn)( struct kbase_hwcnt_backend_csf_if_ctx *ctx, @@ -147,25 +190,10 @@ typedef u64 (*kbase_hwcnt_backend_csf_if_timestamp_ns_fn)( * @ctx: Non-NULL pointer to a CSF interface context. * @ring_buf: Non-NULL pointer to the ring buffer which used to setup the HWC. * @enable: Non-NULL pointer to the enable map of HWC. - */ -typedef void (*kbase_hwcnt_backend_csf_if_dump_enable_fn)( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, - struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, - struct kbase_hwcnt_backend_csf_if_enable *enable); - -/** - * typedef kbase_hwcnt_backend_csf_if_dump_enable_nolock_fn - Setup and enable - * hardware counter - * in CSF interface. - * @ctx: Non-NULL pointer to a CSF interface context. - * @ring_buf: Non-NULL pointer to the ring buffer which used to setup the HWC. - * @enable: Non-NULL pointer to the enable map of HWC. * - * Exactly the same as kbase_hwcnt_backend_csf_if_dump_enable_fn(), except must - * be called in an atomic context with the spinlock documented by the specific - * backend interface held. + * Requires lock to be taken before calling. 
*/ -typedef void (*kbase_hwcnt_backend_csf_if_dump_enable_nolock_fn)( +typedef void (*kbase_hwcnt_backend_csf_if_dump_enable_fn)( struct kbase_hwcnt_backend_csf_if_ctx *ctx, struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, struct kbase_hwcnt_backend_csf_if_enable *enable); @@ -174,13 +202,18 @@ typedef void (*kbase_hwcnt_backend_csf_if_dump_enable_nolock_fn)( * typedef kbase_hwcnt_backend_csf_if_dump_disable_fn - Disable hardware counter * in CSF interface. * @ctx: Non-NULL pointer to a CSF interface context. + * + * Requires lock to be taken before calling. */ typedef void (*kbase_hwcnt_backend_csf_if_dump_disable_fn)( struct kbase_hwcnt_backend_csf_if_ctx *ctx); /** * typedef kbase_hwcnt_backend_csf_if_dump_request_fn - Request a HWC dump. + * * @ctx: Non-NULL pointer to the interface context. + * + * Requires lock to be taken before calling. */ typedef void (*kbase_hwcnt_backend_csf_if_dump_request_fn)( struct kbase_hwcnt_backend_csf_if_ctx *ctx); @@ -189,9 +222,12 @@ typedef void (*kbase_hwcnt_backend_csf_if_dump_request_fn)( * typedef kbase_hwcnt_backend_csf_if_get_indexes_fn - Get current extract and * insert indexes of the * ring buffer. + * * @ctx: Non-NULL pointer to a CSF interface context. * @extract_index: Non-NULL pointer where current extract index to be saved. * @insert_index: Non-NULL pointer where current insert index to be saved. + * + * Requires lock to be taken before calling. */ typedef void (*kbase_hwcnt_backend_csf_if_get_indexes_fn)( struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 *extract_index, @@ -201,8 +237,11 @@ typedef void (*kbase_hwcnt_backend_csf_if_get_indexes_fn)( * typedef kbase_hwcnt_backend_csf_if_set_extract_index_fn - Update the extract * index of the ring * buffer. + * * @ctx: Non-NULL pointer to a CSF interface context. * @extract_index: New extract index to be set. + * + * Requires lock to be taken before calling. */ typedef void (*kbase_hwcnt_backend_csf_if_set_extract_index_fn)( struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 extract_index); @@ -213,9 +252,11 @@ typedef void (*kbase_hwcnt_backend_csf_if_set_extract_index_fn)( * @ctx: Non-NULL pointer to a CSF interface context. * @cycle_counts: Non-NULL pointer to an array where cycle counts to be saved, * the array size should be at least as big as the number of - * clock domains returned by get_gpu_info interface. + * clock domains returned by get_prfcnt_info interface. * @clk_enable_map: An array of bitfields, each bit specifies an enabled clock * domain. + * + * Requires lock to be taken before calling. */ typedef void (*kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn)( struct kbase_hwcnt_backend_csf_if_ctx *ctx, u64 *cycle_counts, @@ -225,7 +266,11 @@ typedef void (*kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn)( * struct kbase_hwcnt_backend_csf_if - Hardware counter backend CSF virtual * interface. * @ctx: CSF interface context. - * @get_gpu_info: Function ptr to get HWC related information. + * @assert_lock_held: Function ptr to assert backend spinlock is held. + * @lock: Function ptr to acquire backend spinlock. + * @unlock: Function ptr to release backend spinlock. + * @get_prfcnt_info: Function ptr to get performance counter related + * information. * @ring_buf_alloc: Function ptr to allocate ring buffer for CSF HWC. * @ring_buf_sync: Function ptr to sync ring buffer to CPU. * @ring_buf_free: Function ptr to free ring buffer for CSF HWC. 
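To illustrate the query interface that replaces get_gpu_info: a provider now fills a single kbase_hwcnt_backend_csf_if_prfcnt_info structure in one call instead of returning each property through separate out-pointers. The sketch below uses made-up example values and a placeholder function name; only the struct fields are taken from this change.

	/* Illustrative provider, example values only. */
	static void example_get_prfcnt_info(
		struct kbase_hwcnt_backend_csf_if_ctx *ctx,
		struct kbase_hwcnt_backend_csf_if_prfcnt_info *prfcnt_info)
	{
		prfcnt_info->dump_bytes = 32 * 1024;  /* bytes needed per dump */
		prfcnt_info->l2_count = 2;            /* L2 slice count */
		prfcnt_info->core_mask = 0xFF;        /* shader core bitmap */
		prfcnt_info->clk_cnt = 1;             /* clock domain count */
		prfcnt_info->clearing_samples = true; /* counters reset per sample */
	}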
@@ -243,13 +288,15 @@ typedef void (*kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn)( */ struct kbase_hwcnt_backend_csf_if { struct kbase_hwcnt_backend_csf_if_ctx *ctx; - kbase_hwcnt_backend_csf_if_get_gpu_info_fn get_gpu_info; + kbase_hwcnt_backend_csf_if_assert_lock_held_fn assert_lock_held; + kbase_hwcnt_backend_csf_if_lock_fn lock; + kbase_hwcnt_backend_csf_if_unlock_fn unlock; + kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn get_prfcnt_info; kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn ring_buf_alloc; kbase_hwcnt_backend_csf_if_ring_buf_sync_fn ring_buf_sync; kbase_hwcnt_backend_csf_if_ring_buf_free_fn ring_buf_free; kbase_hwcnt_backend_csf_if_timestamp_ns_fn timestamp_ns; kbase_hwcnt_backend_csf_if_dump_enable_fn dump_enable; - kbase_hwcnt_backend_csf_if_dump_enable_nolock_fn dump_enable_nolock; kbase_hwcnt_backend_csf_if_dump_disable_fn dump_disable; kbase_hwcnt_backend_csf_if_dump_request_fn dump_request; kbase_hwcnt_backend_csf_if_get_indexes_fn get_indexes; diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c b/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c index 7a3b239..67ca4cb 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c +++ b/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -28,7 +28,7 @@ #include <device/mali_kbase_device.h> #include "mali_kbase_hwcnt_gpu.h" #include "mali_kbase_hwcnt_types.h" -#include "csf/mali_gpu_csf_registers.h" +#include <uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h> #include "csf/mali_kbase_csf_firmware.h" #include "mali_kbase_hwcnt_backend_csf_if_fw.h" @@ -88,6 +88,50 @@ struct kbase_hwcnt_backend_csf_if_fw_ctx { struct kbase_ccswe ccswe_shader_cores; }; +static void kbasep_hwcnt_backend_csf_if_fw_assert_lock_held( + struct kbase_hwcnt_backend_csf_if_ctx *ctx) +{ + struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx; + struct kbase_device *kbdev; + + WARN_ON(!ctx); + + fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; + kbdev = fw_ctx->kbdev; + + kbase_csf_scheduler_spin_lock_assert_held(kbdev); +} + +static void +kbasep_hwcnt_backend_csf_if_fw_lock(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + unsigned long *flags) +{ + struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx; + struct kbase_device *kbdev; + + WARN_ON(!ctx); + + fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; + kbdev = fw_ctx->kbdev; + + kbase_csf_scheduler_spin_lock(kbdev, flags); +} + +static void kbasep_hwcnt_backend_csf_if_fw_unlock( + struct kbase_hwcnt_backend_csf_if_ctx *ctx, unsigned long flags) +{ + struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx; + struct kbase_device *kbdev; + + WARN_ON(!ctx); + + fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; + kbdev = fw_ctx->kbdev; + + kbase_csf_scheduler_spin_lock_assert_held(kbdev); + kbase_csf_scheduler_spin_unlock(kbdev, flags); +} + /** * kbasep_hwcnt_backend_csf_if_fw_on_freq_change() - On freq change callback * @@ -170,16 +214,18 @@ static void kbasep_hwcnt_backend_csf_if_fw_cc_disable( rtm, &fw_ctx->rate_listener); } -static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_info( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, size_t *dump_size, - size_t *l2_count, u64 *core_mask, u8 *clk_cnt) +static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( + 
struct kbase_hwcnt_backend_csf_if_ctx *ctx, + struct kbase_hwcnt_backend_csf_if_prfcnt_info *prfcnt_info) { #ifdef CONFIG_MALI_NO_MALI - *l2_count = KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS; - *core_mask = (1ull << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1; - *dump_size = KBASE_DUMMY_MODEL_MAX_NUM_PERF_BLOCKS * - KBASE_DUMMY_MODEL_BLOCK_SIZE; - *clk_cnt = 1; + prfcnt_info->l2_count = KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS; + prfcnt_info->core_mask = + (1ull << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1; + prfcnt_info->dump_bytes = KBASE_DUMMY_MODEL_MAX_NUM_PERF_BLOCKS * + KBASE_DUMMY_MODEL_BLOCK_SIZE; + prfcnt_info->clk_cnt = 1; + prfcnt_info->clearing_samples = false; #else struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx; struct kbase_device *kbdev; @@ -188,10 +234,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_info( u32 prfcnt_fw_size = 0; WARN_ON(!ctx); - WARN_ON(!dump_size); - WARN_ON(!l2_count); - WARN_ON(!core_mask); - WARN_ON(!clk_cnt); + WARN_ON(!prfcnt_info); fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; kbdev = fw_ctx->kbdev; @@ -199,12 +242,14 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_info( prfcnt_hw_size = (prfcnt_size & 0xFF) << 8; prfcnt_fw_size = (prfcnt_size >> 16) << 8; fw_ctx->buf_bytes = prfcnt_hw_size + prfcnt_fw_size; - *dump_size = fw_ctx->buf_bytes; + prfcnt_info->dump_bytes = fw_ctx->buf_bytes; - *l2_count = kbdev->gpu_props.props.l2_props.num_l2_slices; - *core_mask = kbdev->gpu_props.props.coherency_info.group[0].core_mask; + prfcnt_info->l2_count = kbdev->gpu_props.props.l2_props.num_l2_slices; + prfcnt_info->core_mask = + kbdev->gpu_props.props.coherency_info.group[0].core_mask; - *clk_cnt = fw_ctx->clk_cnt; + prfcnt_info->clk_cnt = fw_ctx->clk_cnt; + prfcnt_info->clearing_samples = true; #endif } @@ -331,9 +376,14 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync( WARN_ON(!ctx); WARN_ON(!ring_buf); - /* Get the buffer indexes in the ring buffer. */ + /* The index arguments for this function form an inclusive, exclusive + * range. + * However, when masking back to the available buffers we will make this + * inclusive at both ends so full flushes are not 0 -> 0. + */ ring_buf_index_first = buf_index_first & (fw_ring_buf->buf_count - 1); - ring_buf_index_last = buf_index_last & (fw_ring_buf->buf_count - 1); + ring_buf_index_last = + (buf_index_last - 1) & (fw_ring_buf->buf_count - 1); /* The start address is the offset of the first buffer. */ start_address = fw_ctx->buf_bytes * ring_buf_index_first; @@ -348,6 +398,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync( /* sync the first part to the end of ring buffer. 
*/ for (i = pg_first; i < fw_ring_buf->num_pages; i++) { struct page *pg = as_page(fw_ring_buf->phys[i]); + if (for_cpu) { kbase_sync_single_for_cpu(fw_ctx->kbdev, kbase_dma_addr(pg), @@ -367,6 +418,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync( for (i = pg_first; i <= pg_last; i++) { struct page *pg = as_page(fw_ring_buf->phys[i]); + if (for_cpu) { kbase_sync_single_for_cpu(fw_ctx->kbdev, kbase_dma_addr(pg), PAGE_SIZE, @@ -420,12 +472,11 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_free( } } -static void kbasep_hwcnt_backend_csf_if_fw_dump_enable_nolock( +static void kbasep_hwcnt_backend_csf_if_fw_dump_enable( struct kbase_hwcnt_backend_csf_if_ctx *ctx, struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, struct kbase_hwcnt_backend_csf_if_enable *enable) { - unsigned long flags; u32 prfcnt_config; struct kbase_device *kbdev; struct kbase_csf_global_iface *global_iface; @@ -437,18 +488,15 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_enable_nolock( WARN_ON(!ctx); WARN_ON(!ring_buf); WARN_ON(!enable); + kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(ctx); kbdev = fw_ctx->kbdev; global_iface = &kbdev->csf.global_iface; - lockdep_assert_held(&kbdev->hwaccess_lock); - /* Configure */ prfcnt_config = fw_ring_buf->buf_count; prfcnt_config |= enable->counter_set << PRFCNT_CONFIG_SETSELECT_SHIFT; - kbase_csf_scheduler_spin_lock(kbdev, &flags); - /* Configure the ring buffer base address */ kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_JASID, fw_ring_buf->as_nr); @@ -503,52 +551,25 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_enable_nolock( prfcnt_config = kbase_csf_firmware_global_input_read(global_iface, GLB_PRFCNT_CONFIG); - kbase_csf_scheduler_spin_unlock(kbdev, flags); - kbasep_hwcnt_backend_csf_if_fw_cc_enable(fw_ctx, enable->clk_enable_map); } -static void kbasep_hwcnt_backend_csf_if_fw_dump_enable( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, - struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, - struct kbase_hwcnt_backend_csf_if_enable *enable) -{ - unsigned long flags; - struct kbase_device *kbdev; - struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = - (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; - - WARN_ON(!ctx); - WARN_ON(!ring_buf); - WARN_ON(!enable); - - kbdev = fw_ctx->kbdev; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - - kbasep_hwcnt_backend_csf_if_fw_dump_enable_nolock(ctx, ring_buf, - enable); - - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); -} - static void kbasep_hwcnt_backend_csf_if_fw_dump_disable( struct kbase_hwcnt_backend_csf_if_ctx *ctx) { - unsigned long flags; struct kbase_device *kbdev; struct kbase_csf_global_iface *global_iface; struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; WARN_ON(!ctx); + kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(ctx); kbdev = fw_ctx->kbdev; global_iface = &kbdev->csf.global_iface; /* Disable the HWC */ - kbase_csf_scheduler_spin_lock(kbdev, &flags); kbdev->csf.hwcnt.enable_pending = true; kbase_csf_firmware_global_input_mask(global_iface, GLB_REQ, 0, GLB_REQ_PRFCNT_ENABLE_MASK); @@ -569,7 +590,6 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_disable( * happens. 
*/ kbdev->csf.hwcnt.request_pending = false; - kbase_csf_scheduler_spin_unlock(kbdev, flags); kbasep_hwcnt_backend_csf_if_fw_cc_disable(fw_ctx); } @@ -577,7 +597,6 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_disable( static void kbasep_hwcnt_backend_csf_if_fw_dump_request( struct kbase_hwcnt_backend_csf_if_ctx *ctx) { - unsigned long flags; u32 glb_req; struct kbase_device *kbdev; struct kbase_csf_global_iface *global_iface; @@ -585,57 +604,52 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_request( (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; WARN_ON(!ctx); + kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(ctx); kbdev = fw_ctx->kbdev; global_iface = &kbdev->csf.global_iface; /* Trigger dumping */ - kbase_csf_scheduler_spin_lock(kbdev, &flags); kbdev->csf.hwcnt.request_pending = true; glb_req = kbase_csf_firmware_global_input_read(global_iface, GLB_REQ); glb_req ^= GLB_REQ_PRFCNT_SAMPLE_MASK; kbase_csf_firmware_global_input_mask(global_iface, GLB_REQ, glb_req, GLB_REQ_PRFCNT_SAMPLE_MASK); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); - kbase_csf_scheduler_spin_unlock(kbdev, flags); } static void kbasep_hwcnt_backend_csf_if_fw_get_indexes( struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 *extract_index, u32 *insert_index) { - unsigned long flags; struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; WARN_ON(!ctx); WARN_ON(!extract_index); WARN_ON(!insert_index); + kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(ctx); - kbase_csf_scheduler_spin_lock(fw_ctx->kbdev, &flags); *extract_index = kbase_csf_firmware_global_input_read( &fw_ctx->kbdev->csf.global_iface, GLB_PRFCNT_EXTRACT); *insert_index = kbase_csf_firmware_global_output( &fw_ctx->kbdev->csf.global_iface, GLB_PRFCNT_INSERT); - kbase_csf_scheduler_spin_unlock(fw_ctx->kbdev, flags); } static void kbasep_hwcnt_backend_csf_if_fw_set_extract_index( struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 extract_idx) { - unsigned long flags; struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; WARN_ON(!ctx); + kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(ctx); /* Set the raw extract index to release the buffer back to the ring * buffer. 
*/ - kbase_csf_scheduler_spin_lock(fw_ctx->kbdev, &flags); kbase_csf_firmware_global_input(&fw_ctx->kbdev->csf.global_iface, GLB_PRFCNT_EXTRACT, extract_idx); - kbase_csf_scheduler_spin_unlock(fw_ctx->kbdev, flags); } static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count( @@ -649,6 +663,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count( WARN_ON(!ctx); WARN_ON(!cycle_counts); + kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(ctx); for (clk = 0; clk < fw_ctx->clk_cnt; clk++) { if (!(clk_enable_map & (1ull << clk))) @@ -749,14 +764,16 @@ int kbase_hwcnt_backend_csf_if_fw_create( return errcode; if_fw->ctx = (struct kbase_hwcnt_backend_csf_if_ctx *)ctx; - if_fw->get_gpu_info = kbasep_hwcnt_backend_csf_if_fw_get_gpu_info; + if_fw->assert_lock_held = + kbasep_hwcnt_backend_csf_if_fw_assert_lock_held; + if_fw->lock = kbasep_hwcnt_backend_csf_if_fw_lock; + if_fw->unlock = kbasep_hwcnt_backend_csf_if_fw_unlock; + if_fw->get_prfcnt_info = kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info; if_fw->ring_buf_alloc = kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc; if_fw->ring_buf_sync = kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync; if_fw->ring_buf_free = kbasep_hwcnt_backend_csf_if_fw_ring_buf_free; if_fw->timestamp_ns = kbasep_hwcnt_backend_csf_if_fw_timestamp_ns; if_fw->dump_enable = kbasep_hwcnt_backend_csf_if_fw_dump_enable; - if_fw->dump_enable_nolock = - kbasep_hwcnt_backend_csf_if_fw_dump_enable_nolock; if_fw->dump_disable = kbasep_hwcnt_backend_csf_if_fw_dump_disable; if_fw->dump_request = kbasep_hwcnt_backend_csf_if_fw_dump_request; if_fw->get_gpu_cycle_count = diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.h b/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.h index d72851e..f55efb6 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.h +++ b/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software diff --git a/mali_kbase/mali_kbase_hwcnt_backend_jm.c b/mali_kbase/mali_kbase_hwcnt_backend_jm.c index c6c672c..4168472 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_jm.c +++ b/mali_kbase/mali_kbase_hwcnt_backend_jm.c @@ -62,6 +62,8 @@ struct kbase_hwcnt_backend_jm_info { * @enabled: True if dumping has been enabled, else false. * @pm_core_mask: PM state sync-ed shaders core mask for the enabled * dumping. + * @curr_config: Current allocated hardware resources to correctly map the src + * raw dump buffer to the dst dump buffer. * @clk_enable_map: The enable map specifying enabled clock domains. * @cycle_count_elapsed: * Cycle count elapsed for a given sample period. @@ -81,6 +83,7 @@ struct kbase_hwcnt_backend_jm { struct kbase_vmap_struct *vmap; bool enabled; u64 pm_core_mask; + struct kbase_hwcnt_curr_config curr_config; u64 clk_enable_map; u64 cycle_count_elapsed[BASE_MAX_NR_CLOCKS_REGULATORS]; u64 prev_cycle_count[BASE_MAX_NR_CLOCKS_REGULATORS]; @@ -89,15 +92,16 @@ struct kbase_hwcnt_backend_jm { }; /** - * kbase_hwcnt_gpu_info_init() - Initialise an info structure used to create the - * hwcnt metadata. + * kbasep_hwcnt_backend_jm_gpu_info_init() - Initialise an info structure used + * to create the hwcnt metadata. * @kbdev: Non-NULL pointer to kbase device. * @info: Non-NULL pointer to data structure to be filled in. 
* * The initialised info struct will only be valid for use while kbdev is valid. */ -static int kbase_hwcnt_gpu_info_init(struct kbase_device *kbdev, - struct kbase_hwcnt_gpu_info *info) +static int +kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev, + struct kbase_hwcnt_gpu_info *info) { size_t clk; @@ -240,6 +244,37 @@ static void kbasep_hwcnt_backend_jm_cc_disable( } +/** + * kbasep_hwcnt_gpu_update_curr_config() - Update the destination buffer with + * current config information. + * @kbdev: Non-NULL pointer to kbase device. + * @curr_config: Non-NULL pointer to return the current configuration of + * hardware allocated to the GPU. + * + * The current configuration information is used for architectures where the + * max_config interface is available from the Arbiter. In this case the current + * allocated hardware is not always the same, so the current config information + * is used to correctly map the current allocated resources to the memory layout + * that is copied to the user space. + * + * Return: 0 on success, else error code. + */ +static int kbasep_hwcnt_gpu_update_curr_config( + struct kbase_device *kbdev, + struct kbase_hwcnt_curr_config *curr_config) +{ + if (WARN_ON(!kbdev) || WARN_ON(!curr_config)) + return -EINVAL; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + curr_config->num_l2_slices = + kbdev->gpu_props.curr_config.l2_slices; + curr_config->shader_present = + kbdev->gpu_props.curr_config.shader_present; + return 0; +} + /* JM backend implementation of kbase_hwcnt_backend_timestamp_ns_fn */ static u64 kbasep_hwcnt_backend_jm_timestamp_ns( struct kbase_hwcnt_backend *backend) @@ -287,11 +322,18 @@ static int kbasep_hwcnt_backend_jm_dump_enable_nolock( timestamp_ns = kbasep_hwcnt_backend_jm_timestamp_ns(backend); + /* Update the current configuration information. */ + errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev, + &backend_jm->curr_config); + if (errcode) + goto error; + errcode = kbase_instr_hwcnt_enable_internal(kbdev, kctx, &enable); if (errcode) goto error; backend_jm->pm_core_mask = kbase_pm_ca_get_instr_core_mask(kbdev); + backend_jm->enabled = true; kbasep_hwcnt_backend_jm_cc_enable(backend_jm, enable_map, timestamp_ns); @@ -372,7 +414,7 @@ static int kbasep_hwcnt_backend_jm_dump_request( size_t clk; int ret; - if (!backend_jm || !backend_jm->enabled) + if (!backend_jm || !backend_jm->enabled || !dump_time_ns) return -EINVAL; kbdev = backend_jm->kctx->kbdev; @@ -441,6 +483,11 @@ static int kbasep_hwcnt_backend_jm_dump_get( struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend; size_t clk; +#ifdef CONFIG_MALI_NO_MALI + struct kbase_device *kbdev; + unsigned long flags; + int errcode; +#endif if (!backend_jm || !dst || !dst_enable_map || (backend_jm->info->metadata != dst->metadata) || @@ -460,9 +507,24 @@ static int kbasep_hwcnt_backend_jm_dump_get( dst->clk_cnt_buf[clk] = backend_jm->cycle_count_elapsed[clk]; } +#ifdef CONFIG_MALI_NO_MALI + kbdev = backend_jm->kctx->kbdev; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + /* Update the current configuration information. 
*/ + errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev, + &backend_jm->curr_config); + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (errcode) + return errcode; +#endif + return kbase_hwcnt_jm_dump_get(dst, backend_jm->cpu_dump_va, dst_enable_map, backend_jm->pm_core_mask, - accumulate); + &backend_jm->curr_config, accumulate); } /** @@ -684,7 +746,7 @@ static int kbasep_hwcnt_backend_jm_info_create( WARN_ON(!kbdev); WARN_ON(!out_info); - errcode = kbase_hwcnt_gpu_info_init(kbdev, &hwcnt_gpu_info); + errcode = kbasep_hwcnt_backend_jm_gpu_info_init(kbdev, &hwcnt_gpu_info); if (errcode) return errcode; diff --git a/mali_kbase/mali_kbase_hwcnt_gpu.c b/mali_kbase/mali_kbase_hwcnt_gpu.c index 91d1f8c..4fba6b6 100644 --- a/mali_kbase/mali_kbase_hwcnt_gpu.c +++ b/mali_kbase/mali_kbase_hwcnt_gpu.c @@ -242,6 +242,13 @@ int kbase_hwcnt_jm_metadata_create( if (!gpu_info || !out_metadata || !out_dump_bytes) return -EINVAL; + /* + * For architectures where a max_config interface is available + * from the arbiter, the v5 dump bytes and the metadata v5 are + * based on the maximum possible allocation of the HW in the + * GPU cause it needs to be prepared for the worst case where + * all the available L2 cache and Shader cores are allocated. + */ dump_bytes = kbasep_hwcnt_backend_jm_dump_bytes(gpu_info); errcode = kbasep_hwcnt_backend_gpu_metadata_create( gpu_info, false, counter_set, &metadata); @@ -260,8 +267,7 @@ int kbase_hwcnt_jm_metadata_create( return 0; } -void kbase_hwcnt_jm_metadata_destroy( - const struct kbase_hwcnt_metadata *metadata) +void kbase_hwcnt_jm_metadata_destroy(const struct kbase_hwcnt_metadata *metadata) { if (!metadata) return; @@ -318,15 +324,41 @@ static bool is_block_type_shader( return is_shader; } +static bool is_block_type_l2_cache( + const u64 grp_type, + const u64 blk_type) +{ + bool is_l2_cache = false; + + switch (grp_type) { + case KBASE_HWCNT_GPU_GROUP_TYPE_V5: + if (blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS || + blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2) + is_l2_cache = true; + break; + default: + /* Warn on unknown group type */ + WARN_ON(true); + } + + return is_l2_cache; +} + int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, const struct kbase_hwcnt_enable_map *dst_enable_map, - u64 pm_core_mask, bool accumulate) + u64 pm_core_mask, + const struct kbase_hwcnt_curr_config *curr_config, + bool accumulate) { const struct kbase_hwcnt_metadata *metadata; const u32 *dump_src; size_t src_offset, grp, blk, blk_inst; u64 core_mask = pm_core_mask; + /* Variables to deal with the current configuration */ + int l2_count = 0; + bool hw_res_available = true; + if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata)) return -EINVAL; @@ -348,15 +380,43 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, const bool is_shader_core = is_block_type_shader( kbase_hwcnt_metadata_group_type(metadata, grp), blk_type, blk); + const bool is_l2_cache = is_block_type_l2_cache( + kbase_hwcnt_metadata_group_type(metadata, grp), + blk_type); + + /* + * If l2 blocks is greater than the current allocated number of + * L2 slices, there is no hw allocated to that block. 
+ */ + if (is_l2_cache) { + l2_count++; + if (l2_count > curr_config->num_l2_slices) + hw_res_available = false; + else + hw_res_available = true; + } + /* + * For the shader cores, the current shader_mask allocated is + * always a subgroup of the maximum shader_mask, so after + * jumping any L2 cache not available the available shader cores + * will always have a matching set of blk instances available to + * accumulate them. + */ + else { + hw_res_available = true; + } - /* Early out if no values in the dest block are enabled */ + /* + * Early out if no values in the dest block are enabled or if + * the resource target of the block is not available in the HW. + */ if (kbase_hwcnt_enable_map_block_enabled( dst_enable_map, grp, blk, blk_inst)) { u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); const u32 *src_blk = dump_src + src_offset; - if (!is_shader_core || (core_mask & 1)) { + if ((!is_shader_core || (core_mask & 1)) && hw_res_available) { if (accumulate) { kbase_hwcnt_dump_buffer_block_accumulate( dst_blk, src_blk, hdr_cnt, @@ -372,7 +432,9 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, } } - src_offset += (hdr_cnt + ctr_cnt); + /* Just increase the src_offset if the HW is available */ + if (hw_res_available) + src_offset += (hdr_cnt + ctr_cnt); if (is_shader_core) core_mask = core_mask >> 1; } @@ -380,10 +442,9 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, return 0; } -int kbase_hwcnt_csf_dump_get( - struct kbase_hwcnt_dump_buffer *dst, void *src, - const struct kbase_hwcnt_enable_map *dst_enable_map, - bool accumulate) +int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, + const struct kbase_hwcnt_enable_map *dst_enable_map, + bool accumulate) { const struct kbase_hwcnt_metadata *metadata; const u32 *dump_src; diff --git a/mali_kbase/mali_kbase_hwcnt_gpu.h b/mali_kbase/mali_kbase_hwcnt_gpu.h index 4ebff2d..9b846a9 100644 --- a/mali_kbase/mali_kbase_hwcnt_gpu.h +++ b/mali_kbase/mali_kbase_hwcnt_gpu.h @@ -128,6 +128,50 @@ struct kbase_hwcnt_gpu_info { }; /** + * struct kbase_hwcnt_curr_config - Current Configuration of HW allocated to the + * GPU. + * @num_l2_slices: Current number of L2 slices allocated to the GPU. + * @shader_present: Current shader present bitmap that is allocated to the GPU. + * + * For architectures with the max_config interface available from the Arbiter, + * the current resources allocated may change during runtime due to a + * re-partitioning (possible with partition manager). Thus, the HWC needs to be + * prepared to report any possible set of counters. For this reason the memory + * layout in the userspace is based on the maximum possible allocation. On the + * other hand, each partition has just the view of its currently allocated + * resources. Therefore, it is necessary to correctly map the dumped HWC values + * from the registers into this maximum memory layout so that it can be exposed + * to the userspace side correctly. + * + * For L2 cache just the number is enough once the allocated ones will be + * accumulated on the first L2 slots available in the destination buffer. + * + * For the correct mapping of the shader cores it is necessary to jump all the + * L2 cache slots in the destination buffer that are not allocated. But, it is + * not necessary to add any logic to map the shader cores bitmap into the memory + * layout because the shader_present allocated will always be a subset of the + * maximum shader_present. 
It is possible because: + 1 - Partitions are made of slices and they are always ordered from the ones + * with more shader cores to the ones with fewer. + 2 - The shader cores in a slice are always contiguous. + 3 - A partition can only have a contiguous set of slices allocated to it. + For example, suppose 4 slices are available in total: one with 4 cores, two + * with 3 cores and one with 2 cores. The maximum possible shader_present would be: + 0x0011|0111|0111|1111 -> note the order and that the shader cores are + contiguous in any slice. + Supposing that a partition takes the two slices in the middle, the current + config shader_present for this partition would be: + 0x0111|0111 -> note that this is a subset of the maximum above and the slices + are contiguous. + Therefore, by directly copying any subset of the maximum possible + shader_present, the mapping is already achieved. + */ +struct kbase_hwcnt_curr_config { + size_t num_l2_slices; + u64 shader_present; +}; + +/** * kbase_hwcnt_jm_metadata_create() - Create hardware counter metadata for the * JM GPUs. * @info: Non-NULL pointer to info struct. @@ -186,6 +230,8 @@ void kbase_hwcnt_csf_metadata_destroy( * kbase_hwcnt_jm_metadata_create. * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. * @pm_core_mask: PM state synchronized shaders core mask with the dump. + * @curr_config: Currently allocated hardware resources, used to correctly map + * the src raw dump buffer to the dst dump buffer. + * @accumulate: True if counters in src should be accumulated into dst, * rather than copied. * @@ -197,7 +243,9 @@ void kbase_hwcnt_csf_metadata_destroy( */ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, const struct kbase_hwcnt_enable_map *dst_enable_map, - const u64 pm_core_mask, bool accumulate); + const u64 pm_core_mask, + const struct kbase_hwcnt_curr_config *curr_config, + bool accumulate); /** * kbase_hwcnt_csf_dump_get() - Copy or accumulate enabled counters from the raw @@ -217,10 +265,9 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, * * Return: 0 on success, else error code. */ -int kbase_hwcnt_csf_dump_get( - struct kbase_hwcnt_dump_buffer *dst, void *src, - const struct kbase_hwcnt_enable_map *dst_enable_map, - bool accumulate) +int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, + const struct kbase_hwcnt_enable_map *dst_enable_map, + bool accumulate); /** * kbase_hwcnt_gpu_enable_map_to_physical() - Convert an enable map abstraction diff --git a/mali_kbase/mali_kbase_hwcnt_legacy.c b/mali_kbase/mali_kbase_hwcnt_legacy.c index bd523dd..e87dbbf 100644 --- a/mali_kbase/mali_kbase_hwcnt_legacy.c +++ b/mali_kbase/mali_kbase_hwcnt_legacy.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2018, 2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved.
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -23,7 +23,7 @@ #include "mali_kbase_hwcnt_virtualizer.h" #include "mali_kbase_hwcnt_types.h" #include "mali_kbase_hwcnt_gpu.h" -#include "mali_kbase_ioctl.h" +#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> #include <linux/slab.h> #include <linux/uaccess.h> diff --git a/mali_kbase/mali_kbase_jd.c b/mali_kbase/mali_kbase_jd.c index 1cf24a2..949c041 100644 --- a/mali_kbase/mali_kbase_jd.c +++ b/mali_kbase/mali_kbase_jd.c @@ -74,7 +74,7 @@ static void jd_mark_atom_complete(struct kbase_jd_atom *katom) { katom->status = KBASE_JD_ATOM_STATE_COMPLETED; kbase_kinstr_jm_atom_complete(katom); - dev_dbg(katom->kctx->kbdev->dev, "Atom %p status to completed\n", + dev_dbg(katom->kctx->kbdev->dev, "Atom %pK status to completed\n", (void *)katom); } @@ -89,7 +89,7 @@ static bool jd_run_atom(struct kbase_jd_atom *katom) { struct kbase_context *kctx = katom->kctx; - dev_dbg(kctx->kbdev->dev, "JD run atom %p in kctx %p\n", + dev_dbg(kctx->kbdev->dev, "JD run atom %pK in kctx %pK\n", (void *)katom, (void *)kctx); KBASE_DEBUG_ASSERT(katom->status != KBASE_JD_ATOM_STATE_UNUSED); @@ -99,23 +99,23 @@ static bool jd_run_atom(struct kbase_jd_atom *katom) trace_sysgraph(SGR_SUBMIT, kctx->id, kbase_jd_atom_id(katom->kctx, katom)); jd_mark_atom_complete(katom); - return 0; + return false; } else if (katom->core_req & BASE_JD_REQ_SOFT_JOB) { /* Soft-job */ if (katom->will_fail_event_code) { kbase_finish_soft_job(katom); jd_mark_atom_complete(katom); - return 0; + return false; } if (kbase_process_soft_job(katom) == 0) { kbase_finish_soft_job(katom); jd_mark_atom_complete(katom); } - return 0; + return false; } katom->status = KBASE_JD_ATOM_STATE_IN_JS; - dev_dbg(kctx->kbdev->dev, "Atom %p status to in JS\n", (void *)katom); + dev_dbg(kctx->kbdev->dev, "Atom %pK status to in JS\n", (void *)katom); /* Queue an action about whether we should try scheduling a context */ return kbasep_js_add_job(kctx, katom); } @@ -758,7 +758,7 @@ bool jd_done_nolock(struct kbase_jd_atom *katom, list_del(runnable_jobs.next); node->in_jd_list = false; - dev_dbg(kctx->kbdev->dev, "List node %p has status %d\n", + dev_dbg(kctx->kbdev->dev, "List node %pK has status %d\n", node, node->status); KBASE_DEBUG_ASSERT(node->status != KBASE_JD_ATOM_STATE_UNUSED); @@ -901,7 +901,7 @@ static bool jd_submit_atom(struct kbase_context *const kctx, unsigned long flags; enum kbase_jd_atom_state status; - dev_dbg(kbdev->dev, "User did JD submit atom %p\n", (void *)katom); + dev_dbg(kbdev->dev, "User did JD submit atom %pK\n", (void *)katom); /* Update the TOTAL number of jobs. This includes those not tracked by * the scheduler: 'not ready to run' and 'dependency-only' jobs. @@ -976,7 +976,7 @@ static bool jd_submit_atom(struct kbase_context *const kctx, katom->event_code = BASE_JD_EVENT_JOB_CONFIG_FAULT; katom->status = KBASE_JD_ATOM_STATE_COMPLETED; dev_dbg(kbdev->dev, - "Atom %p status to completed\n", + "Atom %pK status to completed\n", (void *)katom); /* Wrong dependency setup. Atom will be sent @@ -1019,7 +1019,7 @@ static bool jd_submit_atom(struct kbase_context *const kctx, /* Atom has completed, propagate the error code if any */ katom->event_code = dep_atom->event_code; katom->status = KBASE_JD_ATOM_STATE_QUEUED; - dev_dbg(kbdev->dev, "Atom %p status to queued\n", + dev_dbg(kbdev->dev, "Atom %pK status to queued\n", (void *)katom); /* This atom will be sent back to user space. 
@@ -1062,7 +1062,7 @@ static bool jd_submit_atom(struct kbase_context *const kctx, */ katom->event_code = BASE_JD_EVENT_DONE; katom->status = KBASE_JD_ATOM_STATE_QUEUED; - dev_dbg(kbdev->dev, "Atom %p status to queued\n", (void *)katom); + dev_dbg(kbdev->dev, "Atom %pK status to queued\n", (void *)katom); /* For invalid priority, be most lenient and choose the default */ sched_prio = kbasep_js_atom_prio_to_sched_prio(user_atom->prio); @@ -1199,7 +1199,7 @@ static bool jd_submit_atom(struct kbase_context *const kctx, bool need_to_try_schedule_context; katom->status = KBASE_JD_ATOM_STATE_IN_JS; - dev_dbg(kctx->kbdev->dev, "Atom %p status to in JS\n", + dev_dbg(kctx->kbdev->dev, "Atom %pK status to in JS\n", (void *)katom); need_to_try_schedule_context = kbasep_js_add_job(kctx, katom); @@ -1270,7 +1270,7 @@ int kbase_jd_submit(struct kbase_context *kctx, if (unlikely(jd_atom_is_v2)) { if (copy_from_user(&user_atom.jc, user_addr, sizeof(struct base_jd_atom_v2)) != 0) { - dev_err(kbdev->dev, + dev_dbg(kbdev->dev, "Invalid atom address %p passed to job_submit\n", user_addr); err = -EFAULT; @@ -1281,7 +1281,7 @@ int kbase_jd_submit(struct kbase_context *kctx, user_atom.seq_nr = 0; } else { if (copy_from_user(&user_atom, user_addr, stride) != 0) { - dev_err(kbdev->dev, + dev_dbg(kbdev->dev, "Invalid atom address %p passed to job_submit\n", user_addr); err = -EFAULT; @@ -1420,7 +1420,7 @@ void kbase_jd_done_worker(struct work_struct *data) js_kctx_info = &kctx->jctx.sched_info; js_devdata = &kbdev->js_data; - dev_dbg(kbdev->dev, "Enter atom %p done worker for kctx %p\n", + dev_dbg(kbdev->dev, "Enter atom %pK done worker for kctx %pK\n", (void *)katom, (void *)kctx); KBASE_KTRACE_ADD_JM(kbdev, JD_DONE_WORKER, kctx, katom, katom->jc, 0); @@ -1444,7 +1444,7 @@ void kbase_jd_done_worker(struct work_struct *data) if (katom->event_code == BASE_JD_EVENT_STOPPED) { unsigned long flags; - dev_dbg(kbdev->dev, "Atom %p has been promoted to stopped\n", + dev_dbg(kbdev->dev, "Atom %pK has been promoted to stopped\n", (void *)katom); mutex_unlock(&js_kctx_info->ctx.jsctx_mutex); mutex_unlock(&js_devdata->queue_mutex); @@ -1452,7 +1452,7 @@ void kbase_jd_done_worker(struct work_struct *data) spin_lock_irqsave(&kbdev->hwaccess_lock, flags); katom->status = KBASE_JD_ATOM_STATE_IN_JS; - dev_dbg(kctx->kbdev->dev, "Atom %p status to in JS\n", + dev_dbg(kctx->kbdev->dev, "Atom %pK status to in JS\n", (void *)katom); kbase_js_unpull(kctx, katom); @@ -1568,7 +1568,7 @@ void kbase_jd_done_worker(struct work_struct *data) KBASE_KTRACE_ADD_JM(kbdev, JD_DONE_WORKER_END, kctx, NULL, cache_jc, 0); - dev_dbg(kbdev->dev, "Leave atom %p done worker for kctx %p\n", + dev_dbg(kbdev->dev, "Leave atom %pK done worker for kctx %pK\n", (void *)katom, (void *)kctx); } @@ -1698,7 +1698,7 @@ void kbase_jd_cancel(struct kbase_device *kbdev, struct kbase_jd_atom *katom) kctx = katom->kctx; KBASE_DEBUG_ASSERT(kctx != NULL); - dev_dbg(kbdev->dev, "JD: cancelling atom %p\n", (void *)katom); + dev_dbg(kbdev->dev, "JD: cancelling atom %pK\n", (void *)katom); KBASE_KTRACE_ADD_JM(kbdev, JD_CANCEL, kctx, katom, katom->jc, 0); /* This should only be done from a context that is not scheduled */ diff --git a/mali_kbase/mali_kbase_jd_debugfs.c b/mali_kbase/mali_kbase_jd_debugfs.c index 940b920..f423758 100644 --- a/mali_kbase/mali_kbase_jd_debugfs.c +++ b/mali_kbase/mali_kbase_jd_debugfs.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2014-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2021 ARM Limited. 
All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -28,7 +28,7 @@ #if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) #include <mali_kbase_sync.h> #endif -#include <mali_kbase_ioctl.h> +#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> struct kbase_jd_debugfs_depinfo { u8 id; @@ -46,13 +46,13 @@ static void kbase_jd_debugfs_fence_info(struct kbase_jd_atom *atom, case BASE_JD_REQ_SOFT_FENCE_TRIGGER: res = kbase_sync_fence_out_info_get(atom, &info); if (res == 0) - seq_printf(sfile, "Sa([%p]%d) ", + seq_printf(sfile, "Sa([%pK]%d) ", info.fence, info.status); break; case BASE_JD_REQ_SOFT_FENCE_WAIT: res = kbase_sync_fence_in_info_get(atom, &info); if (res == 0) - seq_printf(sfile, "Wa([%p]%d) ", + seq_printf(sfile, "Wa([%pK]%d) ", info.fence, info.status); break; default: diff --git a/mali_kbase/mali_kbase_jm.c b/mali_kbase/mali_kbase_jm.c index be14b45..73e9905 100644 --- a/mali_kbase/mali_kbase_jm.c +++ b/mali_kbase/mali_kbase_jm.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2014-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -45,7 +45,7 @@ static bool kbase_jm_next_job(struct kbase_device *kbdev, int js, kctx = kbdev->hwaccess.active_kctx[js]; dev_dbg(kbdev->dev, - "Trying to run the next %d jobs in kctx %p (s:%d)\n", + "Trying to run the next %d jobs in kctx %pK (s:%d)\n", nr_jobs_to_submit, (void *)kctx, js); if (!kctx) @@ -117,7 +117,7 @@ void kbase_jm_idle_ctx(struct kbase_device *kbdev, struct kbase_context *kctx) for (js = 0; js < BASE_JM_MAX_NR_SLOTS; js++) { if (kbdev->hwaccess.active_kctx[js] == kctx) { - dev_dbg(kbdev->dev, "Marking kctx %p as inactive (s:%d)\n", + dev_dbg(kbdev->dev, "Marking kctx %pK as inactive (s:%d)\n", (void *)kctx, js); kbdev->hwaccess.active_kctx[js] = NULL; } @@ -129,7 +129,7 @@ struct kbase_jd_atom *kbase_jm_return_atom_to_js(struct kbase_device *kbdev, { lockdep_assert_held(&kbdev->hwaccess_lock); - dev_dbg(kbdev->dev, "Atom %p is returning with event code 0x%x\n", + dev_dbg(kbdev->dev, "Atom %pK is returning with event code 0x%x\n", (void *)katom, katom->event_code); if (katom->event_code != BASE_JD_EVENT_STOPPED && diff --git a/mali_kbase/mali_kbase_js.c b/mali_kbase/mali_kbase_js.c index ea317b2..6bb57e6 100644 --- a/mali_kbase/mali_kbase_js.c +++ b/mali_kbase/mali_kbase_js.c @@ -162,7 +162,7 @@ jsctx_rb_none_to_pull_prio(struct kbase_context *kctx, int js, int prio) none_to_pull = RB_EMPTY_ROOT(&rb->runnable_tree); dev_dbg(kctx->kbdev->dev, - "Slot %d (prio %d) is %spullable in kctx %p\n", + "Slot %d (prio %d) is %spullable in kctx %pK\n", js, prio, none_to_pull ? 
"not " : "", kctx); return none_to_pull; @@ -186,7 +186,7 @@ jsctx_rb_none_to_pull(struct kbase_context *kctx, int js) lockdep_assert_held(&kctx->kbdev->hwaccess_lock); - for (prio = KBASE_JS_ATOM_SCHED_PRIO_REALTIME; + for (prio = KBASE_JS_ATOM_SCHED_PRIO_FIRST; prio < KBASE_JS_ATOM_SCHED_PRIO_COUNT; prio++) { if (!jsctx_rb_none_to_pull_prio(kctx, js, prio)) return false; @@ -236,7 +236,7 @@ jsctx_queue_foreach_prio(struct kbase_context *kctx, int js, int prio, WARN_ON(!(entry->core_req & BASE_JD_REQ_END_RENDERPASS)); dev_dbg(kctx->kbdev->dev, - "Del runnable atom %p from X_DEP list\n", + "Del runnable atom %pK from X_DEP list\n", (void *)entry); list_del(&entry->queue); @@ -252,7 +252,7 @@ jsctx_queue_foreach_prio(struct kbase_context *kctx, int js, int prio, WARN_ON(!(entry->atom_flags & KBASE_KATOM_FLAG_JSCTX_IN_X_DEP_LIST)); dev_dbg(kctx->kbdev->dev, - "Del blocked atom %p from X_DEP list\n", + "Del blocked atom %pK from X_DEP list\n", (void *)entry); list_del(queue->x_dep_head.next); @@ -279,7 +279,7 @@ jsctx_queue_foreach(struct kbase_context *kctx, int js, { int prio; - for (prio = KBASE_JS_ATOM_SCHED_PRIO_REALTIME; + for (prio = KBASE_JS_ATOM_SCHED_PRIO_FIRST; prio < KBASE_JS_ATOM_SCHED_PRIO_COUNT; prio++) jsctx_queue_foreach_prio(kctx, js, prio, callback); } @@ -303,7 +303,7 @@ jsctx_rb_peek_prio(struct kbase_context *kctx, int js, int prio) lockdep_assert_held(&kctx->kbdev->hwaccess_lock); dev_dbg(kctx->kbdev->dev, - "Peeking runnable tree of kctx %p for prio %d (s:%d)\n", + "Peeking runnable tree of kctx %pK for prio %d (s:%d)\n", (void *)kctx, prio, js); node = rb_first(&rb->runnable_tree); @@ -335,7 +335,7 @@ jsctx_rb_peek(struct kbase_context *kctx, int js) lockdep_assert_held(&kctx->kbdev->hwaccess_lock); - for (prio = KBASE_JS_ATOM_SCHED_PRIO_REALTIME; + for (prio = KBASE_JS_ATOM_SCHED_PRIO_FIRST; prio < KBASE_JS_ATOM_SCHED_PRIO_COUNT; prio++) { struct kbase_jd_atom *katom; @@ -365,7 +365,7 @@ jsctx_rb_pull(struct kbase_context *kctx, struct kbase_jd_atom *katom) lockdep_assert_held(&kctx->kbdev->hwaccess_lock); - dev_dbg(kctx->kbdev->dev, "Erasing atom %p from runnable tree of kctx %p\n", + dev_dbg(kctx->kbdev->dev, "Erasing atom %pK from runnable tree of kctx %pK\n", (void *)katom, (void *)kctx); /* Atoms must be pulled in the correct order. 
*/ @@ -387,7 +387,7 @@ jsctx_tree_add(struct kbase_context *kctx, struct kbase_jd_atom *katom) lockdep_assert_held(&kctx->kbdev->hwaccess_lock); - dev_dbg(kbdev->dev, "Adding atom %p to runnable tree of kctx %p (s:%d)\n", + dev_dbg(kbdev->dev, "Adding atom %pK to runnable tree of kctx %pK (s:%d)\n", (void *)katom, (void *)kctx, js); while (*new) { @@ -542,7 +542,7 @@ int kbasep_js_devdata_init(struct kbase_device * const kbdev) sema_init(&jsdd->schedule_sem, 1); for (i = 0; i < kbdev->gpu_props.num_job_slots; ++i) { - for (j = 0; j < KBASE_JS_ATOM_SCHED_PRIO_COUNT; ++j) { + for (j = KBASE_JS_ATOM_SCHED_PRIO_FIRST; j < KBASE_JS_ATOM_SCHED_PRIO_COUNT; ++j) { INIT_LIST_HEAD(&jsdd->ctx_list_pullable[i][j]); INIT_LIST_HEAD(&jsdd->ctx_list_unpullable[i][j]); } @@ -610,7 +610,7 @@ int kbasep_js_kctx_init(struct kbase_context *const kctx) init_waitqueue_head(&js_kctx_info->ctx.is_scheduled_wait); - for (i = 0; i < KBASE_JS_ATOM_SCHED_PRIO_COUNT; i++) { + for (i = KBASE_JS_ATOM_SCHED_PRIO_FIRST; i < KBASE_JS_ATOM_SCHED_PRIO_COUNT; i++) { for (j = 0; j < BASE_JM_MAX_NR_SLOTS; j++) { INIT_LIST_HEAD(&kctx->jsctx_queue[i][j].x_dep_head); kctx->jsctx_queue[i][j].runnable_tree = RB_ROOT; @@ -684,7 +684,7 @@ static bool kbase_js_ctx_list_add_pullable_nolock(struct kbase_device *kbdev, bool ret = false; lockdep_assert_held(&kbdev->hwaccess_lock); - dev_dbg(kbdev->dev, "Add pullable tail kctx %p (s:%d)\n", + dev_dbg(kbdev->dev, "Add pullable tail kctx %pK (s:%d)\n", (void *)kctx, js); if (!list_empty(&kctx->jctx.sched_info.ctx.ctx_list_entry[js])) @@ -726,7 +726,7 @@ static bool kbase_js_ctx_list_add_pullable_head_nolock( bool ret = false; lockdep_assert_held(&kbdev->hwaccess_lock); - dev_dbg(kbdev->dev, "Add pullable head kctx %p (s:%d)\n", + dev_dbg(kbdev->dev, "Add pullable head kctx %pK (s:%d)\n", (void *)kctx, js); if (!list_empty(&kctx->jctx.sched_info.ctx.ctx_list_entry[js])) @@ -802,7 +802,7 @@ static bool kbase_js_ctx_list_add_unpullable_nolock(struct kbase_device *kbdev, bool ret = false; lockdep_assert_held(&kbdev->hwaccess_lock); - dev_dbg(kbdev->dev, "Add unpullable tail kctx %p (s:%d)\n", + dev_dbg(kbdev->dev, "Add unpullable tail kctx %pK (s:%d)\n", (void *)kctx, js); list_move_tail(&kctx->jctx.sched_info.ctx.ctx_list_entry[js], @@ -885,7 +885,7 @@ static struct kbase_context *kbase_js_ctx_list_pop_head_nolock( lockdep_assert_held(&kbdev->hwaccess_lock); - for (i = 0; i < KBASE_JS_ATOM_SCHED_PRIO_COUNT; i++) { + for (i = KBASE_JS_ATOM_SCHED_PRIO_FIRST; i < KBASE_JS_ATOM_SCHED_PRIO_COUNT; i++) { if (list_empty(&kbdev->js_data.ctx_list_pullable[js][i])) continue; @@ -895,7 +895,7 @@ static struct kbase_context *kbase_js_ctx_list_pop_head_nolock( list_del_init(&kctx->jctx.sched_info.ctx.ctx_list_entry[js]); dev_dbg(kbdev->dev, - "Popped %p from the pullable queue (s:%d)\n", + "Popped %pK from the pullable queue (s:%d)\n", (void *)kctx, js); return kctx; } @@ -949,25 +949,25 @@ static bool kbase_js_ctx_pullable(struct kbase_context *kctx, int js, if (is_scheduled) { if (!kbasep_js_is_submit_allowed(js_devdata, kctx)) { - dev_dbg(kbdev->dev, "JS: No submit allowed for kctx %p\n", + dev_dbg(kbdev->dev, "JS: No submit allowed for kctx %pK\n", (void *)kctx); return false; } } katom = jsctx_rb_peek(kctx, js); if (!katom) { - dev_dbg(kbdev->dev, "JS: No pullable atom in kctx %p (s:%d)\n", + dev_dbg(kbdev->dev, "JS: No pullable atom in kctx %pK (s:%d)\n", (void *)kctx, js); return false; /* No pullable atoms */ } if (kctx->blocked_js[js][katom->sched_priority]) { dev_dbg(kbdev->dev, - "JS: kctx %p is 
blocked from submitting atoms at priority %d (s:%d)\n", + "JS: kctx %pK is blocked from submitting atoms at priority %d (s:%d)\n", (void *)kctx, katom->sched_priority, js); return false; } if (atomic_read(&katom->blocked)) { - dev_dbg(kbdev->dev, "JS: Atom %p is blocked in js_ctx_pullable\n", + dev_dbg(kbdev->dev, "JS: Atom %pK is blocked in js_ctx_pullable\n", (void *)katom); return false; /* next atom blocked */ } @@ -976,20 +976,20 @@ static bool kbase_js_ctx_pullable(struct kbase_context *kctx, int js, KBASE_ATOM_GPU_RB_NOT_IN_SLOT_RB || katom->x_pre_dep->will_fail_event_code) { dev_dbg(kbdev->dev, - "JS: X pre-dep %p is not present in slot FIFO or will fail\n", + "JS: X pre-dep %pK is not present in slot FIFO or will fail\n", (void *)katom->x_pre_dep); return false; } if ((katom->atom_flags & KBASE_KATOM_FLAG_FAIL_BLOCKER) && kbase_backend_nr_atoms_on_slot(kctx->kbdev, js)) { dev_dbg(kbdev->dev, - "JS: Atom %p has cross-slot fail dependency and atoms on slot (s:%d)\n", + "JS: Atom %pK has cross-slot fail dependency and atoms on slot (s:%d)\n", (void *)katom, js); return false; } } - dev_dbg(kbdev->dev, "JS: Atom %p is pullable in kctx %p (s:%d)\n", + dev_dbg(kbdev->dev, "JS: Atom %pK is pullable in kctx %pK (s:%d)\n", (void *)katom, (void *)kctx, js); return true; @@ -1013,7 +1013,7 @@ static bool kbase_js_dep_validate(struct kbase_context *kctx, int dep_prio = dep_atom->sched_priority; dev_dbg(kbdev->dev, - "Checking dep %d of atom %p (s:%d) on %p (s:%d)\n", + "Checking dep %d of atom %pK (s:%d) on %pK (s:%d)\n", i, (void *)katom, js, (void *)dep_atom, dep_js); /* Dependent atom must already have been submitted */ @@ -1115,7 +1115,7 @@ static bool kbase_js_dep_validate(struct kbase_context *kctx, int dep_js = kbase_js_get_slot(kbdev, dep_atom); dev_dbg(kbdev->dev, - "Clearing dep %d of atom %p (s:%d) on %p (s:%d)\n", + "Clearing dep %d of atom %pK (s:%d) on %pK (s:%d)\n", i, (void *)katom, js, (void *)dep_atom, dep_js); @@ -1130,7 +1130,7 @@ static bool kbase_js_dep_validate(struct kbase_context *kctx, katom->atom_flags |= KBASE_KATOM_FLAG_X_DEP_BLOCKED; - dev_dbg(kbdev->dev, "Set X_DEP flag on atom %p\n", + dev_dbg(kbdev->dev, "Set X_DEP flag on atom %pK\n", (void *)katom); katom->x_pre_dep = dep_atom; @@ -1154,7 +1154,7 @@ static bool kbase_js_dep_validate(struct kbase_context *kctx, } } else { dev_dbg(kbdev->dev, - "Deps of atom %p (s:%d) could not be represented\n", + "Deps of atom %pK (s:%d) could not be represented\n", (void *)katom, js); } @@ -1195,7 +1195,7 @@ void kbase_js_update_ctx_priority(struct kbase_context *kctx) /* Determine the new priority for context, as per the priority * of currently in-use atoms. 
*/ - for (prio = KBASE_JS_ATOM_SCHED_PRIO_REALTIME; + for (prio = KBASE_JS_ATOM_SCHED_PRIO_FIRST; prio < KBASE_JS_ATOM_SCHED_PRIO_COUNT; prio++) { if (kctx->atoms_count[prio]) { new_priority = prio; @@ -1237,7 +1237,7 @@ static int js_add_start_rp(struct kbase_jd_atom *const start_katom) if (rp->state != KBASE_JD_RP_COMPLETE) return -EINVAL; - dev_dbg(kctx->kbdev->dev, "JS add start atom %p of RP %d\n", + dev_dbg(kctx->kbdev->dev, "JS add start atom %pK of RP %d\n", (void *)start_katom, start_katom->renderpass_id); /* The following members are read when updating the job slot @@ -1280,7 +1280,7 @@ static int js_add_end_rp(struct kbase_jd_atom *const end_katom) rp = &kctx->jctx.renderpasses[end_katom->renderpass_id]; - dev_dbg(kbdev->dev, "JS add end atom %p in state %d of RP %d\n", + dev_dbg(kbdev->dev, "JS add end atom %pK in state %d of RP %d\n", (void *)end_katom, (int)rp->state, end_katom->renderpass_id); if (rp->state == KBASE_JD_RP_COMPLETE) @@ -1347,7 +1347,7 @@ bool kbasep_js_add_job(struct kbase_context *kctx, /* Refcount ctx.nr_jobs */ KBASE_DEBUG_ASSERT(js_kctx_info->ctx.nr_jobs < U32_MAX); ++(js_kctx_info->ctx.nr_jobs); - dev_dbg(kbdev->dev, "Add atom %p to kctx %p; now %d in ctx\n", + dev_dbg(kbdev->dev, "Add atom %pK to kctx %pK; now %d in ctx\n", (void *)atom, (void *)kctx, js_kctx_info->ctx.nr_jobs); /* Lock for state available during IRQ */ @@ -1360,14 +1360,14 @@ bool kbasep_js_add_job(struct kbase_context *kctx, /* Dependencies could not be represented */ --(js_kctx_info->ctx.nr_jobs); dev_dbg(kbdev->dev, - "Remove atom %p from kctx %p; now %d in ctx\n", + "Remove atom %pK from kctx %pK; now %d in ctx\n", (void *)atom, (void *)kctx, js_kctx_info->ctx.nr_jobs); /* Setting atom status back to queued as it still has unresolved * dependencies */ atom->status = KBASE_JD_ATOM_STATE_QUEUED; - dev_dbg(kbdev->dev, "Atom %p status to queued\n", (void *)atom); + dev_dbg(kbdev->dev, "Atom %pK status to queued\n", (void *)atom); /* Undo the count, as the atom will get added again later but * leave the context priority adjusted or boosted, in case if @@ -1430,7 +1430,7 @@ bool kbasep_js_add_job(struct kbase_context *kctx, * context on the Queue */ KBASE_DEBUG_ASSERT(!kbase_ctx_flag(kctx, KCTX_SCHEDULED)); - dev_dbg(kbdev->dev, "JS: Enqueue Context %p", kctx); + dev_dbg(kbdev->dev, "JS: Enqueue Context %pK", kctx); /* Queue was updated - caller must try to schedule the * head context @@ -1439,7 +1439,7 @@ bool kbasep_js_add_job(struct kbase_context *kctx, } } out_unlock: - dev_dbg(kbdev->dev, "Enqueue of kctx %p is %srequired\n", + dev_dbg(kbdev->dev, "Enqueue of kctx %pK is %srequired\n", kctx, enqueue_required ? 
"" : "not "); mutex_unlock(&js_kctx_info->ctx.jsctx_mutex); @@ -1468,7 +1468,7 @@ void kbasep_js_remove_job(struct kbase_device *kbdev, KBASE_DEBUG_ASSERT(js_kctx_info->ctx.nr_jobs > 0); --(js_kctx_info->ctx.nr_jobs); dev_dbg(kbdev->dev, - "Remove atom %p from kctx %p; now %d in ctx\n", + "Remove atom %pK from kctx %pK; now %d in ctx\n", (void *)atom, (void *)kctx, js_kctx_info->ctx.nr_jobs); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); @@ -1660,7 +1660,7 @@ static kbasep_js_release_result kbasep_js_runpool_release_ctx_internal( /* Last reference, and we've been told to remove this context * from the Run Pool */ - dev_dbg(kbdev->dev, "JS: RunPool Remove Context %p because refcount=%d, jobs=%d, allowed=%d", + dev_dbg(kbdev->dev, "JS: RunPool Remove Context %pK because refcount=%d, jobs=%d, allowed=%d", kctx, new_ref_count, js_kctx_info->ctx.nr_jobs, kbasep_js_is_submit_allowed(js_devdata, kctx)); @@ -1670,7 +1670,7 @@ static kbasep_js_release_result kbasep_js_runpool_release_ctx_internal( for (slot = 0; slot < num_slots; slot++) { if (kbdev->hwaccess.active_kctx[slot] == kctx) { - dev_dbg(kbdev->dev, "Marking kctx %p as inactive (s:%d)\n", + dev_dbg(kbdev->dev, "Marking kctx %pK as inactive (s:%d)\n", (void *)kctx, slot); kbdev->hwaccess.active_kctx[slot] = NULL; } @@ -1773,7 +1773,7 @@ void kbasep_js_runpool_requeue_or_kill_ctx(struct kbase_device *kbdev, * happens asynchronously */ dev_dbg(kbdev->dev, - "JS: ** Killing Context %p on RunPool Remove **", kctx); + "JS: ** Killing Context %pK on RunPool Remove **", kctx); kbase_js_foreach_ctx_job(kctx, &kbase_jd_cancel); } } @@ -1879,7 +1879,7 @@ static bool kbasep_js_schedule_ctx(struct kbase_device *kbdev, bool kctx_suspended = false; int as_nr; - dev_dbg(kbdev->dev, "Scheduling kctx %p (s:%d)\n", kctx, js); + dev_dbg(kbdev->dev, "Scheduling kctx %pK (s:%d)\n", kctx, js); js_devdata = &kbdev->js_data; js_kctx_info = &kctx->jctx.sched_info; @@ -2025,7 +2025,7 @@ static bool kbase_js_use_ctx(struct kbase_device *kbdev, kbase_backend_use_ctx_sched(kbdev, kctx, js)) { dev_dbg(kbdev->dev, - "kctx %p already has ASID - mark as active (s:%d)\n", + "kctx %pK already has ASID - mark as active (s:%d)\n", (void *)kctx, js); if (kbdev->hwaccess.active_kctx[js] != kctx) { @@ -2200,7 +2200,7 @@ void kbasep_js_resume(struct kbase_device *kbdev) mutex_lock(&js_devdata->queue_mutex); for (js = 0; js < kbdev->gpu_props.num_job_slots; js++) { - for (prio = KBASE_JS_ATOM_SCHED_PRIO_REALTIME; + for (prio = KBASE_JS_ATOM_SCHED_PRIO_FIRST; prio < KBASE_JS_ATOM_SCHED_PRIO_COUNT; prio++) { struct kbase_context *kctx, *n; unsigned long flags; @@ -2336,7 +2336,7 @@ bool kbase_js_dep_resolved_submit(struct kbase_context *kctx, int js = katom->slot_nr; struct jsctx_queue *queue = &kctx->jsctx_queue[prio][js]; - dev_dbg(kctx->kbdev->dev, "Add atom %p to X_DEP list (s:%d)\n", + dev_dbg(kctx->kbdev->dev, "Add atom %pK to X_DEP list (s:%d)\n", (void *)katom, js); list_add_tail(&katom->queue, &queue->x_dep_head); @@ -2346,7 +2346,7 @@ bool kbase_js_dep_resolved_submit(struct kbase_context *kctx, add_required = false; } } else { - dev_dbg(kctx->kbdev->dev, "Atom %p not added to X_DEP list\n", + dev_dbg(kctx->kbdev->dev, "Atom %pK not added to X_DEP list\n", (void *)katom); } @@ -2360,7 +2360,7 @@ bool kbase_js_dep_resolved_submit(struct kbase_context *kctx, } dev_dbg(kctx->kbdev->dev, - "Enqueue of kctx %p is %srequired to submit atom %p\n", + "Enqueue of kctx %pK is %srequired to submit atom %pK\n", kctx, enqueue_required ? 
"" : "not ", katom); return enqueue_required; @@ -2387,7 +2387,7 @@ static void kbase_js_move_to_tree(struct kbase_jd_atom *katom) if (!kbase_js_atom_blocked_on_x_dep(katom)) { dev_dbg(kctx->kbdev->dev, - "Del atom %p from X_DEP list in js_move_to_tree\n", + "Del atom %pK from X_DEP list in js_move_to_tree\n", (void *)katom); list_del(&katom->queue); @@ -2405,7 +2405,7 @@ static void kbase_js_move_to_tree(struct kbase_jd_atom *katom) } } else { dev_dbg(kctx->kbdev->dev, - "Atom %p blocked on x-dep in js_move_to_tree\n", + "Atom %pK blocked on x-dep in js_move_to_tree\n", (void *)katom); break; } @@ -2449,7 +2449,7 @@ static void kbase_js_evict_deps(struct kbase_context *kctx, /* Remove dependency.*/ x_dep->atom_flags &= ~KBASE_KATOM_FLAG_X_DEP_BLOCKED; - dev_dbg(kctx->kbdev->dev, "Cleared X_DEP flag on atom %p\n", + dev_dbg(kctx->kbdev->dev, "Cleared X_DEP flag on atom %pK\n", (void *)x_dep); /* Fail if it had a data dependency. */ @@ -2471,14 +2471,14 @@ struct kbase_jd_atom *kbase_js_pull(struct kbase_context *kctx, int js) KBASE_DEBUG_ASSERT(kctx); kbdev = kctx->kbdev; - dev_dbg(kbdev->dev, "JS: pulling an atom from kctx %p (s:%d)\n", + dev_dbg(kbdev->dev, "JS: pulling an atom from kctx %pK (s:%d)\n", (void *)kctx, js); js_devdata = &kbdev->js_data; lockdep_assert_held(&kbdev->hwaccess_lock); if (!kbasep_js_is_submit_allowed(js_devdata, kctx)) { - dev_dbg(kbdev->dev, "JS: No submit allowed for kctx %p\n", + dev_dbg(kbdev->dev, "JS: No submit allowed for kctx %pK\n", (void *)kctx); return NULL; } @@ -2491,18 +2491,18 @@ struct kbase_jd_atom *kbase_js_pull(struct kbase_context *kctx, int js) katom = jsctx_rb_peek(kctx, js); if (!katom) { - dev_dbg(kbdev->dev, "JS: No pullable atom in kctx %p (s:%d)\n", + dev_dbg(kbdev->dev, "JS: No pullable atom in kctx %pK (s:%d)\n", (void *)kctx, js); return NULL; } if (kctx->blocked_js[js][katom->sched_priority]) { dev_dbg(kbdev->dev, - "JS: kctx %p is blocked from submitting atoms at priority %d (s:%d)\n", + "JS: kctx %pK is blocked from submitting atoms at priority %d (s:%d)\n", (void *)kctx, katom->sched_priority, js); return NULL; } if (atomic_read(&katom->blocked)) { - dev_dbg(kbdev->dev, "JS: Atom %p is blocked in js_pull\n", + dev_dbg(kbdev->dev, "JS: Atom %pK is blocked in js_pull\n", (void *)katom); return NULL; } @@ -2524,14 +2524,14 @@ struct kbase_jd_atom *kbase_js_pull(struct kbase_context *kctx, int js) KBASE_ATOM_GPU_RB_NOT_IN_SLOT_RB || katom->x_pre_dep->will_fail_event_code) { dev_dbg(kbdev->dev, - "JS: X pre-dep %p is not present in slot FIFO or will fail\n", + "JS: X pre-dep %pK is not present in slot FIFO or will fail\n", (void *)katom->x_pre_dep); return NULL; } if ((katom->atom_flags & KBASE_KATOM_FLAG_FAIL_BLOCKER) && kbase_backend_nr_atoms_on_slot(kbdev, js)) { dev_dbg(kbdev->dev, - "JS: Atom %p has cross-slot fail dependency and atoms on slot (s:%d)\n", + "JS: Atom %pK has cross-slot fail dependency and atoms on slot (s:%d)\n", (void *)katom, js); return NULL; } @@ -2556,7 +2556,7 @@ struct kbase_jd_atom *kbase_js_pull(struct kbase_context *kctx, int js) katom->ticks = 0; - dev_dbg(kbdev->dev, "JS: successfully pulled atom %p from kctx %p (s:%d)\n", + dev_dbg(kbdev->dev, "JS: successfully pulled atom %pK from kctx %pK (s:%d)\n", (void *)katom, (void *)kctx, js); return katom; @@ -2599,7 +2599,7 @@ static void js_return_of_start_rp(struct kbase_jd_atom *const start_katom) return; dev_dbg(kctx->kbdev->dev, - "JS return start atom %p in state %d of RP %d\n", + "JS return start atom %pK in state %d of RP %d\n", (void *)start_katom, 
(int)rp->state, start_katom->renderpass_id); @@ -2627,7 +2627,7 @@ static void js_return_of_start_rp(struct kbase_jd_atom *const start_katom) /* Prevent the tiler job being pulled for execution in the * job scheduler again. */ - dev_dbg(kbdev->dev, "Blocking start atom %p\n", + dev_dbg(kbdev->dev, "Blocking start atom %pK\n", (void *)start_katom); atomic_inc(&start_katom->blocked); @@ -2639,14 +2639,14 @@ static void js_return_of_start_rp(struct kbase_jd_atom *const start_katom) /* Was the fragment job chain submitted to kbase yet? */ end_katom = rp->end_katom; if (end_katom) { - dev_dbg(kctx->kbdev->dev, "JS return add end atom %p\n", + dev_dbg(kctx->kbdev->dev, "JS return add end atom %pK\n", (void *)end_katom); if (rp->state == KBASE_JD_RP_RETRY_OOM) { /* Allow the end of the renderpass to be pulled for * execution again to continue incremental rendering. */ - dev_dbg(kbdev->dev, "Unblocking end atom %p\n", + dev_dbg(kbdev->dev, "Unblocking end atom %pK\n", (void *)end_katom); atomic_dec(&end_katom->blocked); WARN_ON(!(end_katom->atom_flags & @@ -2708,7 +2708,7 @@ static void js_return_of_end_rp(struct kbase_jd_atom *const end_katom) return; dev_dbg(kctx->kbdev->dev, - "JS return end atom %p in state %d of RP %d\n", + "JS return end atom %pK in state %d of RP %d\n", (void *)end_katom, (int)rp->state, end_katom->renderpass_id); if (WARN_ON(rp->state != KBASE_JD_RP_OOM && @@ -2730,14 +2730,14 @@ static void js_return_of_end_rp(struct kbase_jd_atom *const end_katom) spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); dev_dbg(kbdev->dev, - "Reset backing to %zu pages for region %p\n", + "Reset backing to %zu pages for region %pK\n", reg->threshold_pages, (void *)reg); if (!WARN_ON(reg->flags & KBASE_REG_VA_FREED)) kbase_mem_shrink(kctx, reg, reg->threshold_pages); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - dev_dbg(kbdev->dev, "Deleting region %p from list\n", + dev_dbg(kbdev->dev, "Deleting region %pK from list\n", (void *)reg); list_del_init(®->link); kbase_va_region_alloc_put(kctx, reg); @@ -2755,7 +2755,7 @@ static void js_return_of_end_rp(struct kbase_jd_atom *const end_katom) */ start_katom = rp->start_katom; if (!WARN_ON(!start_katom)) { - dev_dbg(kbdev->dev, "Unblocking start atom %p\n", + dev_dbg(kbdev->dev, "Unblocking start atom %pK\n", (void *)start_katom); atomic_dec(&start_katom->blocked); (void)kbase_js_ctx_list_add_pullable_head_nolock(kbdev, kctx, @@ -2781,7 +2781,7 @@ static void js_return_worker(struct work_struct *data) unsigned long flags; base_jd_core_req core_req = katom->core_req; - dev_dbg(kbdev->dev, "%s for atom %p with event code 0x%x\n", + dev_dbg(kbdev->dev, "%s for atom %pK with event code 0x%x\n", __func__, (void *)katom, katom->event_code); if (katom->event_code != BASE_JD_EVENT_END_RP_DONE) @@ -2826,12 +2826,12 @@ static void js_return_worker(struct work_struct *data) if (!atomic_read(&kctx->atoms_pulled)) { dev_dbg(kbdev->dev, - "No atoms currently pulled from context %p\n", + "No atoms currently pulled from context %pK\n", (void *)kctx); if (!kctx->slots_pullable) { dev_dbg(kbdev->dev, - "Context %p %s counted as runnable\n", + "Context %pK %s counted as runnable\n", (void *)kctx, kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF) ? "is" : "isn't"); @@ -2867,7 +2867,7 @@ static void js_return_worker(struct work_struct *data) if (context_idle) { dev_dbg(kbdev->dev, - "Context %p %s counted as active\n", + "Context %pK %s counted as active\n", (void *)kctx, kbase_ctx_flag(kctx, KCTX_ACTIVE) ? 
"is" : "isn't"); @@ -2906,13 +2906,13 @@ static void js_return_worker(struct work_struct *data) kbase_backend_complete_wq_post_sched(kbdev, core_req); - dev_dbg(kbdev->dev, "Leaving %s for atom %p\n", + dev_dbg(kbdev->dev, "Leaving %s for atom %pK\n", __func__, (void *)katom); } void kbase_js_unpull(struct kbase_context *kctx, struct kbase_jd_atom *katom) { - dev_dbg(kctx->kbdev->dev, "Unpulling atom %p in kctx %p\n", + dev_dbg(kctx->kbdev->dev, "Unpulling atom %pK in kctx %pK\n", (void *)katom, (void *)kctx); lockdep_assert_held(&kctx->kbdev->hwaccess_lock); @@ -2967,7 +2967,7 @@ static bool js_complete_start_rp(struct kbase_context *kctx, return false; dev_dbg(kctx->kbdev->dev, - "Start atom %p is done in state %d of RP %d\n", + "Start atom %pK is done in state %d of RP %d\n", (void *)start_katom, (int)rp->state, start_katom->renderpass_id); @@ -2979,7 +2979,7 @@ static bool js_complete_start_rp(struct kbase_context *kctx, unsigned long flags; dev_dbg(kctx->kbdev->dev, - "Start atom %p completed before soft-stop\n", + "Start atom %pK completed before soft-stop\n", (void *)start_katom); kbase_gpu_vm_lock(kctx); @@ -2991,7 +2991,7 @@ static bool js_complete_start_rp(struct kbase_context *kctx, struct kbase_va_region, link); WARN_ON(reg->flags & KBASE_REG_VA_FREED); - dev_dbg(kctx->kbdev->dev, "Deleting region %p from list\n", + dev_dbg(kctx->kbdev->dev, "Deleting region %pK from list\n", (void *)reg); list_del_init(®->link); kbase_va_region_alloc_put(kctx, reg); @@ -3001,7 +3001,7 @@ static bool js_complete_start_rp(struct kbase_context *kctx, kbase_gpu_vm_unlock(kctx); } else { dev_dbg(kctx->kbdev->dev, - "Start atom %p did not exceed memory threshold\n", + "Start atom %pK did not exceed memory threshold\n", (void *)start_katom); WARN_ON(rp->state != KBASE_JD_RP_START && @@ -3018,7 +3018,7 @@ static bool js_complete_start_rp(struct kbase_context *kctx, /* Allow the end of the renderpass to be pulled for * execution again to continue incremental rendering. */ - dev_dbg(kbdev->dev, "Unblocking end atom %p!\n", + dev_dbg(kbdev->dev, "Unblocking end atom %pK!\n", (void *)end_katom); atomic_dec(&end_katom->blocked); @@ -3062,7 +3062,7 @@ static void js_complete_end_rp(struct kbase_context *kctx, if (WARN_ON(rp->end_katom != end_katom)) return; - dev_dbg(kbdev->dev, "End atom %p is done in state %d of RP %d\n", + dev_dbg(kbdev->dev, "End atom %pK is done in state %d of RP %d\n", (void *)end_katom, (int)rp->state, end_katom->renderpass_id); if (WARN_ON(rp->state == KBASE_JD_RP_COMPLETE) || @@ -3096,7 +3096,7 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, kbdev = kctx->kbdev; atom_slot = katom->slot_nr; - dev_dbg(kbdev->dev, "%s for atom %p (s:%d)\n", + dev_dbg(kbdev->dev, "%s for atom %pK (s:%d)\n", __func__, (void *)katom, atom_slot); /* Update the incremental rendering state machine. 
@@ -3115,7 +3115,7 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, spin_lock_irqsave(&kbdev->hwaccess_lock, flags); if (katom->atom_flags & KBASE_KATOM_FLAG_JSCTX_IN_TREE) { - dev_dbg(kbdev->dev, "Atom %p is in runnable_tree\n", + dev_dbg(kbdev->dev, "Atom %pK is in runnable_tree\n", (void *)katom); context_idle = !atomic_dec_return(&kctx->atoms_pulled); @@ -3136,7 +3136,7 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, if (!kctx->atoms_pulled_slot_pri[atom_slot][prio] && kctx->blocked_js[atom_slot][prio]) { dev_dbg(kbdev->dev, - "kctx %p is no longer blocked from submitting on slot %d at priority %d\n", + "kctx %pK is no longer blocked from submitting on slot %d at priority %d\n", (void *)kctx, atom_slot, prio); kctx->blocked_js[atom_slot][prio] = false; @@ -3190,7 +3190,7 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, * jd_done_worker(). */ if (context_idle) { - dev_dbg(kbdev->dev, "kctx %p is no longer active\n", + dev_dbg(kbdev->dev, "kctx %pK is no longer active\n", (void *)kctx); kbase_ctx_flag_clear(kctx, KCTX_ACTIVE); } @@ -3241,7 +3241,7 @@ static bool js_end_rp_is_complete(struct kbase_jd_atom *const end_katom) return true; dev_dbg(kbdev->dev, - "JS complete end atom %p in state %d of RP %d\n", + "JS complete end atom %pK in state %d of RP %d\n", (void *)end_katom, (int)rp->state, end_katom->renderpass_id); @@ -3270,7 +3270,7 @@ struct kbase_jd_atom *kbase_js_complete_atom(struct kbase_jd_atom *katom, struct kbase_jd_atom *x_dep = katom->x_post_dep; kbdev = kctx->kbdev; - dev_dbg(kbdev->dev, "Atom %p complete in kctx %p (post-dep %p)\n", + dev_dbg(kbdev->dev, "Atom %pK complete in kctx %pK (post-dep %pK)\n", (void *)katom, (void *)kctx, (void *)x_dep); lockdep_assert_held(&kctx->kbdev->hwaccess_lock); @@ -3286,7 +3286,7 @@ struct kbase_jd_atom *kbase_js_complete_atom(struct kbase_jd_atom *katom, katom->event_code = katom->will_fail_event_code; katom->status = KBASE_JD_ATOM_STATE_HW_COMPLETED; - dev_dbg(kbdev->dev, "Atom %p status to HW completed\n", (void *)katom); + dev_dbg(kbdev->dev, "Atom %pK status to HW completed\n", (void *)katom); if (katom->event_code != BASE_JD_EVENT_DONE) { kbase_js_evict_deps(kctx, katom, katom->slot_nr, @@ -3308,7 +3308,7 @@ struct kbase_jd_atom *kbase_js_complete_atom(struct kbase_jd_atom *katom, bool was_pullable = kbase_js_ctx_pullable(kctx, x_dep->slot_nr, false); x_dep->atom_flags &= ~KBASE_KATOM_FLAG_X_DEP_BLOCKED; - dev_dbg(kbdev->dev, "Cleared X_DEP flag on atom %p\n", + dev_dbg(kbdev->dev, "Cleared X_DEP flag on atom %pK\n", (void *)x_dep); kbase_js_move_to_tree(x_dep); @@ -3319,13 +3319,13 @@ struct kbase_jd_atom *kbase_js_complete_atom(struct kbase_jd_atom *katom, x_dep->slot_nr); if (x_dep->atom_flags & KBASE_KATOM_FLAG_JSCTX_IN_TREE) { - dev_dbg(kbdev->dev, "Atom %p is in runnable tree\n", + dev_dbg(kbdev->dev, "Atom %pK is in runnable tree\n", (void *)x_dep); return x_dep; } } else { dev_dbg(kbdev->dev, - "No cross-slot dep to unblock for atom %p\n", + "No cross-slot dep to unblock for atom %pK\n", (void *)katom); } @@ -3356,13 +3356,13 @@ bool kbase_js_atom_blocked_on_x_dep(struct kbase_jd_atom *const katom) if (!(katom->atom_flags & KBASE_KATOM_FLAG_X_DEP_BLOCKED)) { - dev_dbg(kbdev->dev, "Atom %p is not blocked on a cross-slot dependency", + dev_dbg(kbdev->dev, "Atom %pK is not blocked on a cross-slot dependency", (void *)katom); return false; } if (!(katom->core_req & BASE_JD_REQ_END_RENDERPASS)) { - dev_dbg(kbdev->dev, "Atom %p is blocked on a cross-slot dependency", + dev_dbg(kbdev->dev, 
"Atom %pK is blocked on a cross-slot dependency", (void *)katom); return true; } @@ -3388,12 +3388,12 @@ bool kbase_js_atom_blocked_on_x_dep(struct kbase_jd_atom *const katom) * if it only depends on the tiler job chain. */ if (katom->x_pre_dep != rp->start_katom) { - dev_dbg(kbdev->dev, "Dependency is on %p not start atom %p\n", + dev_dbg(kbdev->dev, "Dependency is on %pK not start atom %pK\n", (void *)katom->x_pre_dep, (void *)rp->start_katom); return true; } - dev_dbg(kbdev->dev, "Ignoring cross-slot dep on atom %p\n", + dev_dbg(kbdev->dev, "Ignoring cross-slot dep on atom %pK\n", (void *)katom->x_pre_dep); return false; @@ -3407,7 +3407,7 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) bool ctx_waiting[BASE_JM_MAX_NR_SLOTS]; int js; - dev_dbg(kbdev->dev, "%s kbdev %p mask 0x%x\n", + dev_dbg(kbdev->dev, "%s kbdev %pK mask 0x%x\n", __func__, (void *)kbdev, (unsigned int)js_mask); js_devdata = &kbdev->js_data; @@ -3442,7 +3442,7 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) context_idle = true; dev_dbg(kbdev->dev, - "kctx %p is not active (s:%d)\n", + "kctx %pK is not active (s:%d)\n", (void *)kctx, js); if (kbase_pm_context_active_handle_suspend( @@ -3472,7 +3472,7 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) &kctx->jctx.sched_info.ctx.jsctx_mutex); dev_dbg(kbdev->dev, - "kctx %p cannot be used at this time\n", + "kctx %pK cannot be used at this time\n", kctx); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); @@ -3514,7 +3514,7 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) bool pullable; dev_dbg(kbdev->dev, - "No atoms pulled from kctx %p (s:%d)\n", + "No atoms pulled from kctx %pK (s:%d)\n", (void *)kctx, js); pullable = kbase_js_ctx_pullable(kctx, js, @@ -3576,7 +3576,7 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) break; /* Could not run atoms on this slot */ } - dev_dbg(kbdev->dev, "Push kctx %p to back of list\n", + dev_dbg(kbdev->dev, "Push kctx %pK to back of list\n", (void *)kctx); if (kbase_js_ctx_pullable(kctx, js, true)) timer_sync |= @@ -3598,7 +3598,7 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) for (js = 0; js < BASE_JM_MAX_NR_SLOTS; js++) { if (kbdev->hwaccess.active_kctx[js] == last_active[js] && ctx_waiting[js]) { - dev_dbg(kbdev->dev, "Marking kctx %p as inactive (s:%d)\n", + dev_dbg(kbdev->dev, "Marking kctx %pK as inactive (s:%d)\n", (void *)last_active[js], js); kbdev->hwaccess.active_kctx[js] = NULL; } @@ -3629,7 +3629,7 @@ void kbase_js_zap_context(struct kbase_context *kctx) mutex_lock(&js_kctx_info->ctx.jsctx_mutex); kbase_ctx_flag_set(kctx, KCTX_DYING); - dev_dbg(kbdev->dev, "Zap: Try Evict Ctx %p", kctx); + dev_dbg(kbdev->dev, "Zap: Try Evict Ctx %pK", kctx); /* * At this point we know: @@ -3693,7 +3693,7 @@ void kbase_js_zap_context(struct kbase_context *kctx) KBASE_KTRACE_ADD_JM(kbdev, JM_ZAP_NON_SCHEDULED, kctx, NULL, 0u, kbase_ctx_flag(kctx, KCTX_SCHEDULED)); - dev_dbg(kbdev->dev, "Zap: Ctx %p scheduled=0", kctx); + dev_dbg(kbdev->dev, "Zap: Ctx %pK scheduled=0", kctx); /* Only cancel jobs when we evicted from the * queue. No Power Manager active reference was held. 
@@ -3714,7 +3714,7 @@ void kbase_js_zap_context(struct kbase_context *kctx) * Pool */ KBASE_KTRACE_ADD_JM(kbdev, JM_ZAP_SCHEDULED, kctx, NULL, 0u, kbase_ctx_flag(kctx, KCTX_SCHEDULED)); - dev_dbg(kbdev->dev, "Zap: Ctx %p is in RunPool", kctx); + dev_dbg(kbdev->dev, "Zap: Ctx %pK is in RunPool", kctx); /* Disable the ctx from submitting any more jobs */ spin_lock_irqsave(&kbdev->hwaccess_lock, flags); @@ -3732,7 +3732,7 @@ void kbase_js_zap_context(struct kbase_context *kctx) */ KBASE_DEBUG_ASSERT(was_retained); - dev_dbg(kbdev->dev, "Zap: Ctx %p Kill Any Running jobs", kctx); + dev_dbg(kbdev->dev, "Zap: Ctx %pK Kill Any Running jobs", kctx); /* Cancel any remaining running jobs for this kctx - if any. * Submit is disallowed which takes effect immediately, so no @@ -3745,7 +3745,7 @@ void kbase_js_zap_context(struct kbase_context *kctx) mutex_unlock(&js_devdata->queue_mutex); mutex_unlock(&kctx->jctx.lock); - dev_dbg(kbdev->dev, "Zap: Ctx %p Release (may or may not schedule out immediately)", + dev_dbg(kbdev->dev, "Zap: Ctx %pK Release (may or may not schedule out immediately)", kctx); kbasep_js_runpool_release_ctx(kbdev, kctx); diff --git a/mali_kbase/mali_kbase_kinstr_jm.c b/mali_kbase/mali_kbase_kinstr_jm.c index 76cff41..cc8dd86 100644 --- a/mali_kbase/mali_kbase_kinstr_jm.c +++ b/mali_kbase/mali_kbase_kinstr_jm.c @@ -25,7 +25,7 @@ */ #include "mali_kbase_kinstr_jm.h" -#include "mali_kbase_kinstr_jm_reader.h" +#include <uapi/gpu/arm/midgard/mali_kbase_kinstr_jm_reader.h> #include "mali_kbase.h" #include "mali_kbase_linux.h" diff --git a/mali_kbase/mali_kbase_kinstr_jm.h b/mali_kbase/mali_kbase_kinstr_jm.h index 74fe5cf..2b81636 100644 --- a/mali_kbase/mali_kbase_kinstr_jm.h +++ b/mali_kbase/mali_kbase_kinstr_jm.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2019, 2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -63,7 +63,7 @@ #ifndef _KBASE_KINSTR_JM_H_ #define _KBASE_KINSTR_JM_H_ -#include "mali_kbase_kinstr_jm_reader.h" +#include <uapi/gpu/arm/midgard/mali_kbase_kinstr_jm_reader.h> #ifdef __KERNEL__ #include <linux/version.h> diff --git a/mali_kbase/mali_kbase_mem.c b/mali_kbase/mali_kbase_mem.c index fd992e2..326917c 100644 --- a/mali_kbase/mali_kbase_mem.c +++ b/mali_kbase/mali_kbase_mem.c @@ -849,7 +849,7 @@ bool kbase_has_exec_va_zone(struct kbase_context *kctx) * * Return: true if any allocs exist on any zone, false otherwise */ -bool kbase_region_tracker_has_allocs(struct kbase_context *kctx) +static bool kbase_region_tracker_has_allocs(struct kbase_context *kctx) { unsigned int zone_idx; @@ -1393,7 +1393,7 @@ void kbase_free_alloced_region(struct kbase_va_region *reg) if (WARN_ON(kbase_is_region_invalid(reg))) return; - dev_dbg(kctx->kbdev->dev, "Freeing memory region %p\n", + dev_dbg(kctx->kbdev->dev, "Freeing memory region %pK\n", (void *)reg); #if MALI_USE_CSF if (reg->flags & KBASE_REG_CSF_EVENT) @@ -1916,7 +1916,7 @@ int kbase_mem_free_region(struct kbase_context *kctx, struct kbase_va_region *re KBASE_DEBUG_ASSERT(kctx != NULL); KBASE_DEBUG_ASSERT(reg != NULL); - dev_dbg(kctx->kbdev->dev, "%s %p in kctx %p\n", + dev_dbg(kctx->kbdev->dev, "%s %pK in kctx %pK\n", __func__, (void *)reg, (void *)kctx); lockdep_assert_held(&kctx->reg_lock); @@ -1975,7 +1975,7 @@ int kbase_mem_free(struct kbase_context *kctx, u64 gpu_addr) struct kbase_va_region *reg; KBASE_DEBUG_ASSERT(kctx != NULL); - dev_dbg(kctx->kbdev->dev, "%s 0x%llx in kctx %p\n", + dev_dbg(kctx->kbdev->dev, "%s 0x%llx in kctx %pK\n", __func__, gpu_addr, (void *)kctx); if ((gpu_addr & ~PAGE_MASK) && (gpu_addr >= PAGE_SIZE)) { @@ -2772,6 +2772,7 @@ void kbase_free_phy_pages_helper_locked(struct kbase_mem_phy_alloc *alloc, kbase_trace_gpu_mem_usage_dec(kctx->kbdev, kctx, freed); } } +KBASE_EXPORT_TEST_API(kbase_free_phy_pages_helper_locked); #if MALI_USE_CSF /** @@ -4233,8 +4234,11 @@ void kbase_jit_free(struct kbase_context *kctx, struct kbase_va_region *reg) div_u64(old_pages * (100 - kctx->trim_level), 100)); u64 delta = old_pages - new_size; - if (delta) + if (delta) { + mutex_lock(&kctx->reg_lock); kbase_mem_shrink(kctx, reg, old_pages - delta); + mutex_unlock(&kctx->reg_lock); + } } #if MALI_JIT_PRESSURE_LIMIT_BASE diff --git a/mali_kbase/mali_kbase_mem.h b/mali_kbase/mali_kbase_mem.h index cda6b57..d12ec31 100644 --- a/mali_kbase/mali_kbase_mem.h +++ b/mali_kbase/mali_kbase_mem.h @@ -31,7 +31,7 @@ #endif #include <linux/kref.h> -#include "mali_base_kernel.h" +#include <uapi/gpu/arm/midgard/mali_base_kernel.h> #include <mali_kbase_hw.h> #include "mali_kbase_pm.h" #include "mali_kbase_defs.h" @@ -549,7 +549,7 @@ static inline struct kbase_va_region *kbase_va_region_alloc_get( WARN_ON(!region->va_refcnt); /* non-atomic as kctx->reg_lock is held */ - dev_dbg(kctx->kbdev->dev, "va_refcnt %d before get %p\n", + dev_dbg(kctx->kbdev->dev, "va_refcnt %d before get %pK\n", region->va_refcnt, (void *)region); region->va_refcnt++; @@ -566,7 +566,7 @@ static inline struct kbase_va_region *kbase_va_region_alloc_put( /* non-atomic as kctx->reg_lock is held */ region->va_refcnt--; - dev_dbg(kctx->kbdev->dev, "va_refcnt %d after put %p\n", + dev_dbg(kctx->kbdev->dev, "va_refcnt %d after put %pK\n", region->va_refcnt, (void *)region); if (!region->va_refcnt) 
kbase_region_refcnt_free(region); diff --git a/mali_kbase/mali_kbase_mem_linux.c b/mali_kbase/mali_kbase_mem_linux.c index 7c9c08e..cc80927 100644 --- a/mali_kbase/mali_kbase_mem_linux.c +++ b/mali_kbase/mali_kbase_mem_linux.c @@ -42,7 +42,7 @@ #include <mali_kbase.h> #include <mali_kbase_mem_linux.h> #include <tl/mali_kbase_tracepoints.h> -#include <mali_kbase_ioctl.h> +#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> #include <mmu/mali_kbase_mmu.h> #include <mali_kbase_caps.h> #include <mali_kbase_trace_gpu_mem.h> @@ -1104,7 +1104,7 @@ int kbase_mem_do_sync_imported(struct kbase_context *kctx, dir); #endif /* KBASE_MEM_ION_SYNC_WORKAROUND */ break; - }; + } if (unlikely(ret)) dev_warn(kctx->kbdev->dev, @@ -2718,7 +2718,7 @@ int kbase_context_mmap(struct kbase_context *const kctx, { struct kbase_va_region *reg = NULL; void *kaddr = NULL; - size_t nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + size_t nr_pages = vma_pages(vma); int err = 0; int free_on_close = 0; struct device *dev = kctx->kbdev->dev; @@ -3333,7 +3333,7 @@ static int kbase_csf_cpu_mmap_user_io_pages(struct kbase_context *kctx, { unsigned long cookie = vma->vm_pgoff - PFN_DOWN(BASEP_MEM_CSF_USER_IO_PAGES_HANDLE); - size_t nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + size_t nr_pages = vma_pages(vma); struct kbase_queue *queue; int err = 0; diff --git a/mali_kbase/mali_kbase_mem_pool.c b/mali_kbase/mali_kbase_mem_pool.c index 9b5854a..1874a6f 100644 --- a/mali_kbase/mali_kbase_mem_pool.c +++ b/mali_kbase/mali_kbase_mem_pool.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2015-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2015-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -309,7 +309,7 @@ void kbase_mem_pool_set_max_size(struct kbase_mem_pool *pool, size_t max_size) kbase_mem_pool_unlock(pool); } - +KBASE_EXPORT_TEST_API(kbase_mem_pool_set_max_size); static unsigned long kbase_mem_pool_reclaim_count_objects(struct shrinker *s, struct shrink_control *sc) @@ -804,8 +804,8 @@ void kbase_mem_pool_free_pages_locked(struct kbase_mem_pool *pool, nr_to_pool = kbase_mem_pool_capacity(pool); nr_to_pool = min(nr_pages, nr_to_pool); - kbase_mem_pool_add_array_locked(pool, nr_pages, pages, false, - dirty); + kbase_mem_pool_add_array_locked(pool, nr_to_pool, pages, false, + dirty); i += nr_to_pool; } diff --git a/mali_kbase/mali_kbase_mipe_gen_header.h b/mali_kbase/mali_kbase_mipe_gen_header.h index 87eb65b..d1ea7ad 100644 --- a/mali_kbase/mali_kbase_mipe_gen_header.h +++ b/mali_kbase/mali_kbase_mipe_gen_header.h @@ -39,14 +39,14 @@ * defined. See documentation below: */ -/** +/* * The name of the variable where the result BLOB will be stored. */ #if !defined(MIPE_HEADER_BLOB_VAR_NAME) #error "MIPE_HEADER_BLOB_VAR_NAME must be defined!" #endif -/** +/* * A compiler attribute for the BLOB variable. * * e.g. __attribute__((section("my_section"))) @@ -77,7 +77,7 @@ #error "MIPE_HEADER_STREAM_ID must be defined!" #endif -/** +/* * MIPE packet class. * * See enum tl_packet_class. @@ -86,7 +86,7 @@ #error "MIPE_HEADER_PKT_CLASS must be defined!" #endif -/** +/* * The list of tracepoints to process. * * It should be defined as follows: @@ -105,14 +105,14 @@ #error "MIPE_HEADER_TRACEPOINT_LIST must be defined!" #endif -/** +/* * The number of entries in MIPE_HEADER_TRACEPOINT_LIST. 
*/ #if !defined(MIPE_HEADER_TRACEPOINT_LIST_SIZE) #error "MIPE_HEADER_TRACEPOINT_LIST_SIZE must be defined!" #endif -/** +/* * The list of enums to process. * * It should be defined as follows: @@ -129,7 +129,7 @@ */ #if defined(MIPE_HEADER_ENUM_LIST) -/** +/* * Tracepoint message ID used for enums declaration. */ #if !defined(MIPE_HEADER_ENUM_MSG_ID) diff --git a/mali_kbase/mali_kbase_pm.c b/mali_kbase/mali_kbase_pm.c index da09a97..3ded47b 100644 --- a/mali_kbase/mali_kbase_pm.c +++ b/mali_kbase/mali_kbase_pm.c @@ -256,9 +256,15 @@ void kbase_pm_driver_resume(struct kbase_device *kbdev, bool arb_gpu_start) kbase_pm_context_idle(kbdev); /* Re-enable GPU hardware counters */ +#if MALI_USE_CSF + kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + kbase_csf_scheduler_spin_unlock(kbdev, flags); +#else spin_lock_irqsave(&kbdev->hwaccess_lock, flags); kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +#endif /* Resume vinstr */ kbase_vinstr_resume(kbdev->vinstr_ctx); diff --git a/mali_kbase/mali_kbase_reset_gpu.h b/mali_kbase/mali_kbase_reset_gpu.h index 4f66972..cb8a082 100644 --- a/mali_kbase/mali_kbase_reset_gpu.h +++ b/mali_kbase/mali_kbase_reset_gpu.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -143,8 +143,16 @@ void kbase_reset_gpu_assert_prevented(struct kbase_device *kbdev); void kbase_reset_gpu_assert_failed_or_prevented(struct kbase_device *kbdev); /** + * Flags for kbase_prepare_to_reset_gpu + */ +#define RESET_FLAGS_NONE ((unsigned int)0) +/* This reset should be treated as an unrecoverable error by HW counter logic */ +#define RESET_FLAGS_HWC_UNRECOVERABLE_ERROR ((unsigned int)(1 << 0)) + +/** * kbase_prepare_to_reset_gpu_locked - Prepare for resetting the GPU. * @kbdev: Device pointer + * @flags: Bitfield indicating impact of reset (see flag defines) * * Caller is expected to hold the kbdev->hwaccess_lock. * @@ -153,18 +161,20 @@ void kbase_reset_gpu_assert_failed_or_prevented(struct kbase_device *kbdev); * - false - Another thread is performing a reset, kbase_reset_gpu should * not be called. */ -bool kbase_prepare_to_reset_gpu_locked(struct kbase_device *kbdev); +bool kbase_prepare_to_reset_gpu_locked(struct kbase_device *kbdev, + unsigned int flags); /** * kbase_prepare_to_reset_gpu - Prepare for resetting the GPU. * @kbdev: Device pointer - * + * @flags: Bitfield indicating impact of reset (see flag defines) + * Return: a boolean which should be interpreted as follows: * - true - Prepared for reset, kbase_reset_gpu should be called. * - false - Another thread is performing a reset, kbase_reset_gpu should * not be called. 
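 *
 * A minimal illustrative call sequence (sketch only, using the
 * RESET_FLAGS_NONE flag defined above):
 *
 *   if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE))
 *       kbase_reset_gpu(kbdev);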
*/ -bool kbase_prepare_to_reset_gpu(struct kbase_device *kbdev); +bool kbase_prepare_to_reset_gpu(struct kbase_device *kbdev, unsigned int flags); /** * kbase_reset_gpu - Reset the GPU diff --git a/mali_kbase/mali_kbase_softjobs.c b/mali_kbase/mali_kbase_softjobs.c index 654c029..e14a4be 100644 --- a/mali_kbase/mali_kbase_softjobs.c +++ b/mali_kbase/mali_kbase_softjobs.c @@ -27,7 +27,7 @@ #include <mali_kbase_sync.h> #endif #include <linux/dma-mapping.h> -#include <mali_base_kernel.h> +#include <uapi/gpu/arm/midgard/mali_base_kernel.h> #include <mali_kbase_hwaccess_time.h> #include <mali_kbase_kinstr_jm.h> #include <mali_kbase_mem_linux.h> @@ -145,6 +145,9 @@ static int kbase_dump_cpu_gpu_time(struct kbase_jd_atom *katom) * delay suspend until we process the atom (which may be at the end of a * long chain of dependencies */ +#ifdef CONFIG_MALI_ARBITER_SUPPORT + atomic_inc(&kctx->kbdev->pm.gpu_users_waiting); +#endif /* CONFIG_MALI_ARBITER_SUPPORT */ pm_active_err = kbase_pm_context_active_handle_suspend(kctx->kbdev, KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE); if (pm_active_err) { struct kbasep_js_device_data *js_devdata = &kctx->kbdev->js_data; @@ -162,6 +165,10 @@ static int kbase_dump_cpu_gpu_time(struct kbase_jd_atom *katom) return pm_active_err; } +#ifdef CONFIG_MALI_ARBITER_SUPPORT + else + atomic_dec(&kctx->kbdev->pm.gpu_users_waiting); +#endif /* CONFIG_MALI_ARBITER_SUPPORT */ kbase_backend_get_gpu_time(kctx->kbdev, &cycle_counter, &system_time, &ts); @@ -291,7 +298,7 @@ static void kbase_fence_debug_check_atom(struct kbase_jd_atom *katom) if (!kbase_sync_fence_in_info_get(dep, &info)) { dev_warn(dev, - "\tVictim trigger atom %d fence [%p] %s: %s\n", + "\tVictim trigger atom %d fence [%pK] %s: %s\n", kbase_jd_atom_id(kctx, dep), info.fence, info.name, @@ -320,11 +327,11 @@ static void kbase_fence_debug_wait_timeout(struct kbase_jd_atom *katom) return; } - dev_warn(dev, "ctx %d_%d: Atom %d still waiting for fence [%p] after %dms\n", + dev_warn(dev, "ctx %d_%d: Atom %d still waiting for fence [%pK] after %dms\n", kctx->tgid, kctx->id, kbase_jd_atom_id(kctx, katom), info.fence, timeout_ms); - dev_warn(dev, "\tGuilty fence [%p] %s: %s\n", + dev_warn(dev, "\tGuilty fence [%pK] %s: %s\n", info.fence, info.name, kbase_sync_status_string(info.status)); @@ -1422,41 +1429,27 @@ static int kbase_ext_res_prepare(struct kbase_jd_atom *katom) struct base_external_resource_list *ext_res; u64 count = 0; size_t copy_size; - int ret; user_ext_res = (__user struct base_external_resource_list *) (uintptr_t) katom->jc; /* Fail the job if there is no info structure */ - if (!user_ext_res) { - ret = -EINVAL; - goto fail; - } + if (!user_ext_res) + return -EINVAL; - if (copy_from_user(&count, &user_ext_res->count, sizeof(u64)) != 0) { - ret = -EINVAL; - goto fail; - } + if (copy_from_user(&count, &user_ext_res->count, sizeof(u64)) != 0) + return -EINVAL; /* Is the number of external resources in range? 
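 * Bounding count by BASE_EXT_RES_COUNT_MAX also keeps the copy_size
 * computation below from overflowing.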
*/ - if (!count || count > BASE_EXT_RES_COUNT_MAX) { - ret = -EINVAL; - goto fail; - } + if (!count || count > BASE_EXT_RES_COUNT_MAX) + return -EINVAL; /* Copy the information for safe access and future storage */ copy_size = sizeof(*ext_res); copy_size += sizeof(struct base_external_resource) * (count - 1); - ext_res = kzalloc(copy_size, GFP_KERNEL); - if (!ext_res) { - ret = -ENOMEM; - goto fail; - } - - if (copy_from_user(ext_res, user_ext_res, copy_size) != 0) { - ret = -EINVAL; - goto free_info; - } + ext_res = memdup_user(user_ext_res, copy_size); + if (IS_ERR(ext_res)) + return PTR_ERR(ext_res); /* * Overwrite the count with the first value incase it was changed @@ -1467,11 +1460,6 @@ static int kbase_ext_res_prepare(struct kbase_jd_atom *katom) katom->softjob_data = ext_res; return 0; - -free_info: - kfree(ext_res); -fail: - return ret; } static void kbase_ext_res_process(struct kbase_jd_atom *katom, bool map) @@ -1793,6 +1781,9 @@ void kbase_resume_suspended_soft_jobs(struct kbase_device *kbdev) if (kbase_process_soft_job(katom_iter) == 0) { kbase_finish_soft_job(katom_iter); resched |= jd_done_nolock(katom_iter, NULL); +#ifdef CONFIG_MALI_ARBITER_SUPPORT + atomic_dec(&kbdev->pm.gpu_users_waiting); +#endif /* CONFIG_MALI_ARBITER_SUPPORT */ } mutex_unlock(&kctx->jctx.lock); } diff --git a/mali_kbase/mali_kbase_sync_common.c b/mali_kbase/mali_kbase_sync_common.c index 2061f53..39a68c2 100644 --- a/mali_kbase/mali_kbase_sync_common.c +++ b/mali_kbase/mali_kbase_sync_common.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2012-2016, 2018-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2012-2016, 2018-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -20,7 +20,7 @@ */ /* - * @file mali_kbase_sync_common.c + * @file * * Common code for our explicit fence functionality */ diff --git a/mali_kbase/mali_kbase_vinstr.c b/mali_kbase/mali_kbase_vinstr.c index bc985cb..4ac0d0e 100644 --- a/mali_kbase/mali_kbase_vinstr.c +++ b/mali_kbase/mali_kbase_vinstr.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2011-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2011-2021 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -22,9 +22,9 @@ #include "mali_kbase_vinstr.h" #include "mali_kbase_hwcnt_virtualizer.h" #include "mali_kbase_hwcnt_types.h" -#include "mali_kbase_hwcnt_reader.h" +#include <uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h> #include "mali_kbase_hwcnt_gpu.h" -#include "mali_kbase_ioctl.h" +#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> #include "mali_malisw.h" #include "mali_kbase_debug.h" @@ -898,11 +898,12 @@ static long kbasep_vinstr_hwcnt_reader_ioctl_get_api_version( struct kbase_vinstr_client *cli, unsigned long arg, size_t size) { long ret = -EINVAL; - u8 clk_cnt = cli->vctx->metadata->clk_cnt; if (size == sizeof(u32)) { ret = put_user(HWCNT_READER_API, (u32 __user *)arg); } else if (size == sizeof(struct kbase_hwcnt_reader_api_version)) { + u8 clk_cnt = cli->vctx->metadata->clk_cnt; + unsigned long bytes = 0; struct kbase_hwcnt_reader_api_version api_version = { .version = HWCNT_READER_API, .features = KBASE_HWCNT_READER_API_VERSION_NO_FEATURE, @@ -915,8 +916,16 @@ static long kbasep_vinstr_hwcnt_reader_ioctl_get_api_version( api_version.features |= KBASE_HWCNT_READER_API_VERSION_FEATURE_CYCLES_SHADER_CORES; - ret = copy_to_user( + bytes = copy_to_user( (void __user *)arg, &api_version, sizeof(api_version)); + + /* copy_to_user returns zero in case of success. + * If it fails, it returns the number of bytes that could NOT be copied + */ + if (bytes == 0) + ret = 0; + else + ret = -EFAULT; } return ret; } @@ -1042,7 +1051,16 @@ static int kbasep_vinstr_hwcnt_reader_mmap( return -EINVAL; vm_size = vma->vm_end - vma->vm_start; - size = cli->dump_bufs.buf_cnt * cli->vctx->metadata->dump_buf_bytes; + + /* The mapping is allowed to span the entirety of the page allocation, + * not just the chunk where the dump buffers are allocated. + * This accommodates the corner case where the combined size of the + * dump buffers is smaller than a single page. + * This does not pose a security risk as the pages are zeroed on + * allocation, and anything out of bounds of the dump buffers is never + * written to. + */ + size = (1ull << cli->dump_bufs.page_order) * PAGE_SIZE; if (vma->vm_pgoff > (size >> PAGE_SHIFT)) return -EINVAL; diff --git a/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c b/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c index 6b7cb42..8240817 100644 --- a/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c +++ b/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c @@ -83,10 +83,19 @@ static void submit_work_pagefault(struct kbase_device *kbdev, u32 as_nr, .addr = fault->addr, }; - if (WARN_ON(!queue_work(as->pf_wq, &as->work_pagefault))) + /* + * A page fault work item could already be pending for the + * context's address space, when the page fault occurs for + * MCU's address space. + */ + if (!queue_work(as->pf_wq, &as->work_pagefault)) kbase_ctx_sched_release_ctx(kctx); - else + else { + dev_dbg(kbdev->dev, + "Page fault is already pending for as %u\n", + as_nr); atomic_inc(&kbdev->faults_pending); + } } spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } @@ -117,15 +126,9 @@ void kbase_mmu_report_mcu_as_fault_and_reset(struct kbase_device *kbdev, for (as_no = 1; as_no < kbdev->nr_hw_address_spaces; as_no++) submit_work_pagefault(kbdev, as_no, fault); - /* MCU AS fault could mean hardware counters will stop working. 
- * Put the backend into the unrecoverable error state to cause - * current and subsequent counter operations to immediately - * fail, avoiding the risk of a hang. - */ - kbase_hwcnt_backend_csf_on_unrecoverable_error(&kbdev->hwcnt_gpu_iface); - /* GPU reset is required to recover */ - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu(kbdev, + RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); } KBASE_EXPORT_TEST_API(kbase_mmu_report_mcu_as_fault_and_reset); diff --git a/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c b/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c index 18a74ab..ae334c1 100644 --- a/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c +++ b/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c @@ -206,7 +206,7 @@ static void kbase_mmu_interrupt_process(struct kbase_device *kbdev, lockdep_assert_held(&kbdev->hwaccess_lock); dev_dbg(kbdev->dev, - "Entering %s kctx %p, as %p\n", + "Entering %s kctx %pK, as %pK\n", __func__, (void *)kctx, (void *)as); if (!kctx) { @@ -255,14 +255,10 @@ static void kbase_mmu_interrupt_process(struct kbase_device *kbdev, */ kbasep_js_clear_submit_allowed(js_devdata, kctx); - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_AARCH64_MMU)) - dev_warn(kbdev->dev, - "Bus error in AS%d at VA=0x%016llx, IPA=0x%016llx\n", - as->number, fault->addr, - fault->extra_addr); - else - dev_warn(kbdev->dev, "Bus error in AS%d at 0x%016llx\n", - as->number, fault->addr); + dev_warn(kbdev->dev, + "Bus error in AS%d at VA=0x%016llx, IPA=0x%016llx\n", + as->number, fault->addr, + fault->extra_addr); /* * We need to switch to UNMAPPED mode - but we do this in a @@ -276,7 +272,7 @@ static void kbase_mmu_interrupt_process(struct kbase_device *kbdev, } dev_dbg(kbdev->dev, - "Leaving %s kctx %p, as %p\n", + "Leaving %s kctx %pK, as %pK\n", __func__, (void *)kctx, (void *)as); } @@ -375,14 +371,11 @@ void kbase_mmu_interrupt(struct kbase_device *kbdev, u32 irq_stat) /* record the fault status */ fault->status = kbase_reg_read(kbdev, MMU_AS_REG(as_no, AS_FAULTSTATUS)); - - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_AARCH64_MMU)) { - fault->extra_addr = kbase_reg_read(kbdev, - MMU_AS_REG(as_no, AS_FAULTEXTRA_HI)); - fault->extra_addr <<= 32; - fault->extra_addr |= kbase_reg_read(kbdev, - MMU_AS_REG(as_no, AS_FAULTEXTRA_LO)); - } + fault->extra_addr = kbase_reg_read(kbdev, + MMU_AS_REG(as_no, AS_FAULTEXTRA_HI)); + fault->extra_addr <<= 32; + fault->extra_addr |= kbase_reg_read(kbdev, + MMU_AS_REG(as_no, AS_FAULTEXTRA_LO)); if (kbase_as_has_bus_fault(as, fault)) { /* Mark bus fault as handled. 
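[Editor's note on the reset-flag plumbing above: the patch replaces the open-coded calls to kbase_hwcnt_backend_csf_on_unrecoverable_error() at each fault site with a single RESET_FLAGS_HWC_UNRECOVERABLE_ERROR bit passed to kbase_prepare_to_reset_gpu(). The code that consumes the flag is not part of this commit excerpt, so the sketch below is only an illustration of the assumed handling, based on the flag's documented meaning; the example_* name is hypothetical.]

/* Illustration only: how the reset path could consume the new flag.
 * Not taken from this patch; the example_* name is hypothetical.
 */
static void example_handle_reset_flags(struct kbase_device *kbdev,
				       unsigned int flags)
{
#if MALI_USE_CSF
	if (flags & RESET_FLAGS_HWC_UNRECOVERABLE_ERROR) {
		/* Fail current and future counter operations immediately
		 * rather than risking a hang across the reset.
		 */
		kbase_hwcnt_backend_csf_on_unrecoverable_error(
			&kbdev->hwcnt_gpu_iface);
	}
#endif
	/* ...then continue with the normal reset preparation. */
}

[With this split, fault handlers such as kbase_mmu_report_mcu_as_fault_and_reset() above only pass the flag instead of calling the counter backend directly.]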
@@ -423,7 +416,7 @@ int kbase_mmu_switch_to_ir(struct kbase_context *const kctx, struct kbase_va_region *const reg) { dev_dbg(kctx->kbdev->dev, - "Switching to incremental rendering for region %p\n", + "Switching to incremental rendering for region %pK\n", (void *)reg); return kbase_job_slot_softstop_start_rp(kctx, reg); } diff --git a/mali_kbase/mmu/mali_kbase_mmu.c b/mali_kbase/mmu/mali_kbase_mmu.c index 51bee43..0761f68 100644 --- a/mali_kbase/mmu/mali_kbase_mmu.c +++ b/mali_kbase/mmu/mali_kbase_mmu.c @@ -561,7 +561,7 @@ void kbase_mmu_page_fault_worker(struct work_struct *data) kbdev = container_of(faulting_as, struct kbase_device, as[as_no]); dev_dbg(kbdev->dev, - "Entering %s %p, fault_pfn %lld, as_no %d\n", + "Entering %s %pK, fault_pfn %lld, as_no %d\n", __func__, (void *)data, fault_pfn, as_no); /* Grab the context that was already refcounted in kbase_mmu_interrupt() @@ -634,21 +634,13 @@ void kbase_mmu_page_fault_worker(struct work_struct *data) goto fault_done; case AS_FAULTSTATUS_EXCEPTION_CODE_ADDRESS_SIZE_FAULT: - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_AARCH64_MMU)) - kbase_mmu_report_fault_and_kill(kctx, faulting_as, - "Address size fault", fault); - else - kbase_mmu_report_fault_and_kill(kctx, faulting_as, - "Unknown fault code", fault); + kbase_mmu_report_fault_and_kill(kctx, faulting_as, + "Address size fault", fault); goto fault_done; case AS_FAULTSTATUS_EXCEPTION_CODE_MEMORY_ATTRIBUTES_FAULT: - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_AARCH64_MMU)) - kbase_mmu_report_fault_and_kill(kctx, faulting_as, - "Memory attributes fault", fault); - else - kbase_mmu_report_fault_and_kill(kctx, faulting_as, - "Unknown fault code", fault); + kbase_mmu_report_fault_and_kill(kctx, faulting_as, + "Memory attributes fault", fault); goto fault_done; default: @@ -852,7 +844,7 @@ page_fault_retry: if (kbase_mmu_switch_to_ir(kctx, region) >= 0) { dev_dbg(kctx->kbdev->dev, - "Get region %p for IR\n", + "Get region %pK for IR\n", (void *)region); kbase_va_region_alloc_get(kctx, region); } @@ -980,7 +972,7 @@ fault_done: release_ctx(kbdev, kctx); atomic_dec(&kbdev->faults_pending); - dev_dbg(kbdev->dev, "Leaving page_fault_worker %p\n", (void *)data); + dev_dbg(kbdev->dev, "Leaving page_fault_worker %pK\n", (void *)data); } static phys_addr_t kbase_mmu_alloc_pgd(struct kbase_device *kbdev, @@ -1557,7 +1549,7 @@ static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx, */ dev_err(kbdev->dev, "Flush for GPU page table update did not complete. Issuing GPU soft-reset to recover\n"); - if (kbase_prepare_to_reset_gpu_locked(kbdev)) + if (kbase_prepare_to_reset_gpu_locked(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu_locked(kbdev); } } @@ -1613,17 +1605,8 @@ static void kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, */ dev_err(kbdev->dev, "Flush for GPU page table update did not complete. Issuing GPU soft-reset to recover\n"); -#if MALI_USE_CSF - /* A GPU hang could mean hardware counters will stop working. - * Put the backend into the unrecoverable error state to cause - * current and subsequent counter operations to immediately - * fail, avoiding the risk of a hang. 
- */ - kbase_hwcnt_backend_csf_on_unrecoverable_error( - &kbdev->hwcnt_gpu_iface); -#endif /* MALI_USE_CSF */ - - if (kbase_prepare_to_reset_gpu(kbdev)) + if (kbase_prepare_to_reset_gpu( + kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); } @@ -1659,7 +1642,7 @@ static void kbase_mmu_flush_invalidate(struct kbase_context *kctx, ctx_is_in_runpool = kbase_ctx_sched_inc_refcount(kctx); mutex_unlock(&kbdev->js_data.queue_mutex); #else - ctx_is_in_runpool = kbase_ctx_sched_refcount_mmu_flush(kctx, sync); + ctx_is_in_runpool = kbase_ctx_sched_inc_refcount_if_as_valid(kctx); #endif /* !MALI_USE_CSF */ if (ctx_is_in_runpool) { @@ -1681,11 +1664,6 @@ void kbase_mmu_update(struct kbase_device *kbdev, KBASE_DEBUG_ASSERT(as_nr != KBASEP_AS_NR_INVALID); kbdev->mmu_mode->update(kbdev, mmut, as_nr); - -#if MALI_USE_CSF - if (mmut->kctx) - mmut->kctx->mmu_flush_pend_state = KCTX_MMU_FLUSH_NOT_PEND; -#endif } KBASE_EXPORT_TEST_API(kbase_mmu_update); @@ -1719,10 +1697,6 @@ void kbase_mmu_disable(struct kbase_context *kctx) kbase_mmu_flush_invalidate_noretain(kctx, 0, ~0, true); kctx->kbdev->mmu_mode->disable_as(kctx->kbdev, kctx->as_nr); - -#if MALI_USE_CSF - kctx->mmu_flush_pend_state = KCTX_MMU_FLUSH_NOT_PEND; -#endif } KBASE_EXPORT_TEST_API(kbase_mmu_disable); @@ -2312,30 +2286,3 @@ void kbase_flush_mmu_wqs(struct kbase_device *kbdev) flush_workqueue(as->pf_wq); } } - -#if MALI_USE_CSF -void kbase_mmu_deferred_flush_invalidate(struct kbase_context *kctx) -{ - struct kbase_device *kbdev = kctx->kbdev; - - lockdep_assert_held(&kbdev->mmu_hw_mutex); - - if (kctx->as_nr == KBASEP_AS_NR_INVALID) - return; - - if (kctx->mmu_flush_pend_state == KCTX_MMU_FLUSH_NOT_PEND) - return; - - WARN_ON(!atomic_read(&kctx->refcount)); - - /* Specify the entire address space as the locked region. - * The flush of entire L2 cache and complete TLB invalidation will - * anyways happen for the exisiting CSF GPUs, regardless of the locked - * range. This may have to be revised later on. - */ - kbase_mmu_flush_invalidate_noretain(kctx, 0, ~0, - kctx->mmu_flush_pend_state == KCTX_MMU_FLUSH_PEND_SYNC); - - kctx->mmu_flush_pend_state = KCTX_MMU_FLUSH_NOT_PEND; -} -#endif diff --git a/mali_kbase/mmu/mali_kbase_mmu.h b/mali_kbase/mmu/mali_kbase_mmu.h index 1d877ac..bf4fd91 100644 --- a/mali_kbase/mmu/mali_kbase_mmu.h +++ b/mali_kbase/mmu/mali_kbase_mmu.h @@ -152,21 +152,4 @@ int kbase_mmu_bus_fault_interrupt(struct kbase_device *kbdev, u32 status, void kbase_mmu_gpu_fault_interrupt(struct kbase_device *kbdev, u32 status, u32 as_nr, u64 address, bool as_valid); -#if MALI_USE_CSF -/** - * kbase_mmu_deferred_flush_invalidate() - Perform deferred MMU flush - * operations for a Kbase context. - * @kctx: Pointer to the Kbase context for which MMU flush operations - * are pending. - * - * This function performs the MMU flush operations that are pending for a Kbase - * context. The flush operations will be deferred if the context is inactive, - * i.e. kctx->refcount is zero which happens when all the queue groups of a - * context have gone off CSG slots. - * This needs to be called when first queue group of the context is put back - * on the CSG slot. 
- */ -void kbase_mmu_deferred_flush_invalidate(struct kbase_context *kctx); -#endif - #endif /* _KBASE_MMU_H_ */ diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c index b0596af..88fd9cf 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c +++ b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c @@ -124,38 +124,33 @@ void kbase_mmu_hw_configure(struct kbase_device *kbdev, struct kbase_as *as) struct kbase_mmu_setup *current_setup = &as->current_setup; u64 transcfg = 0; - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_AARCH64_MMU)) { - transcfg = current_setup->transcfg; + transcfg = current_setup->transcfg; - /* Set flag AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK - * Clear PTW_MEMATTR bits - */ - transcfg &= ~AS_TRANSCFG_PTW_MEMATTR_MASK; - /* Enable correct PTW_MEMATTR bits */ - transcfg |= AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK; - /* Ensure page-tables reads use read-allocate cache-policy in - * the L2 - */ - transcfg |= AS_TRANSCFG_R_ALLOCATE; - - if (kbdev->system_coherency != COHERENCY_NONE) { - /* Set flag AS_TRANSCFG_PTW_SH_OS (outer shareable) - * Clear PTW_SH bits - */ - transcfg = (transcfg & ~AS_TRANSCFG_PTW_SH_MASK); - /* Enable correct PTW_SH bits */ - transcfg = (transcfg | AS_TRANSCFG_PTW_SH_OS); - } + /* Set flag AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK + * Clear PTW_MEMATTR bits + */ + transcfg &= ~AS_TRANSCFG_PTW_MEMATTR_MASK; + /* Enable correct PTW_MEMATTR bits */ + transcfg |= AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK; + /* Ensure page-tables reads use read-allocate cache-policy in + * the L2 + */ + transcfg |= AS_TRANSCFG_R_ALLOCATE; - kbase_reg_write(kbdev, MMU_AS_REG(as->number, AS_TRANSCFG_LO), - transcfg); - kbase_reg_write(kbdev, MMU_AS_REG(as->number, AS_TRANSCFG_HI), - (transcfg >> 32) & 0xFFFFFFFFUL); - } else { - if (kbdev->system_coherency != COHERENCY_NONE) - current_setup->transtab |= AS_TRANSTAB_LPAE_SHARE_OUTER; + if (kbdev->system_coherency != COHERENCY_NONE) { + /* Set flag AS_TRANSCFG_PTW_SH_OS (outer shareable) + * Clear PTW_SH bits + */ + transcfg = (transcfg & ~AS_TRANSCFG_PTW_SH_MASK); + /* Enable correct PTW_SH bits */ + transcfg = (transcfg | AS_TRANSCFG_PTW_SH_OS); } + kbase_reg_write(kbdev, MMU_AS_REG(as->number, AS_TRANSCFG_LO), + transcfg); + kbase_reg_write(kbdev, MMU_AS_REG(as->number, AS_TRANSCFG_HI), + (transcfg >> 32) & 0xFFFFFFFFUL); + kbase_reg_write(kbdev, MMU_AS_REG(as->number, AS_TRANSTAB_LO), current_setup->transtab & 0xFFFFFFFFUL); kbase_reg_write(kbdev, MMU_AS_REG(as->number, AS_TRANSTAB_HI), diff --git a/mali_kbase/mmu/mali_kbase_mmu_mode_lpae.c b/mali_kbase/mmu/mali_kbase_mmu_mode_lpae.c deleted file mode 100644 index 09793e1..0000000 --- a/mali_kbase/mmu/mali_kbase_mmu_mode_lpae.c +++ /dev/null @@ -1,195 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * - * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU license. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. 
- * - */ - -#include "mali_kbase.h" -#include <gpu/mali_kbase_gpu_regmap.h> -#include "mali_kbase_defs.h" - -#define ENTRY_TYPE_MASK 3ULL -#define ENTRY_IS_ATE 1ULL -#define ENTRY_IS_INVAL 2ULL -#define ENTRY_IS_PTE 3ULL - -#define ENTRY_ATTR_BITS (7ULL << 2) /* bits 4:2 */ -#define ENTRY_RD_BIT (1ULL << 6) -#define ENTRY_WR_BIT (1ULL << 7) -#define ENTRY_SHARE_BITS (3ULL << 8) /* bits 9:8 */ -#define ENTRY_ACCESS_BIT (1ULL << 10) -#define ENTRY_NX_BIT (1ULL << 54) - -#define ENTRY_FLAGS_MASK (ENTRY_ATTR_BITS | ENTRY_RD_BIT | ENTRY_WR_BIT | \ - ENTRY_SHARE_BITS | ENTRY_ACCESS_BIT | ENTRY_NX_BIT) - -/* Helper Function to perform assignment of page table entries, to - * ensure the use of strd, which is required on LPAE systems. - */ -static inline void page_table_entry_set(u64 *pte, u64 phy) -{ - WRITE_ONCE(*pte, phy); -} - -static void mmu_get_as_setup(struct kbase_mmu_table *mmut, - struct kbase_mmu_setup * const setup) -{ - /* Set up the required caching policies at the correct indices - * in the memattr register. - */ - setup->memattr = - (AS_MEMATTR_LPAE_IMPL_DEF_CACHE_POLICY << - (AS_MEMATTR_INDEX_IMPL_DEF_CACHE_POLICY * 8)) | - (AS_MEMATTR_LPAE_FORCE_TO_CACHE_ALL << - (AS_MEMATTR_INDEX_FORCE_TO_CACHE_ALL * 8)) | - (AS_MEMATTR_LPAE_WRITE_ALLOC << - (AS_MEMATTR_INDEX_WRITE_ALLOC * 8)) | - (AS_MEMATTR_LPAE_OUTER_IMPL_DEF << - (AS_MEMATTR_INDEX_OUTER_IMPL_DEF * 8)) | - (AS_MEMATTR_LPAE_OUTER_WA << - (AS_MEMATTR_INDEX_OUTER_WA * 8)) | - 0; /* The other indices are unused for now */ - - setup->transtab = ((u64)mmut->pgd & - ((0xFFFFFFFFULL << 32) | AS_TRANSTAB_LPAE_ADDR_SPACE_MASK)) | - AS_TRANSTAB_LPAE_ADRMODE_TABLE | - AS_TRANSTAB_LPAE_READ_INNER; - - setup->transcfg = AS_TRANSCFG_ADRMODE_LEGACY; -} - -static void mmu_update(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, - int as_nr) -{ - struct kbase_as *as; - struct kbase_mmu_setup *current_setup; - - if (WARN_ON(as_nr == KBASEP_AS_NR_INVALID)) - return; - - as = &kbdev->as[as_nr]; - current_setup = &as->current_setup; - - mmu_get_as_setup(mmut, current_setup); - - /* Apply the address space setting */ - kbase_mmu_hw_configure(kbdev, as); -} - -static void mmu_disable_as(struct kbase_device *kbdev, int as_nr) -{ - struct kbase_as * const as = &kbdev->as[as_nr]; - struct kbase_mmu_setup * const current_setup = &as->current_setup; - - current_setup->transtab = AS_TRANSTAB_LPAE_ADRMODE_UNMAPPED; - - /* Apply the address space setting */ - kbase_mmu_hw_configure(kbdev, as); -} - -static phys_addr_t pte_to_phy_addr(u64 entry) -{ - if (!(entry & 1)) - return 0; - - return entry & ~0xFFF; -} - -static int ate_is_valid(u64 ate, int const level) -{ - return ((ate & ENTRY_TYPE_MASK) == ENTRY_IS_ATE); -} - -static int pte_is_valid(u64 pte, int const level) -{ - return ((pte & ENTRY_TYPE_MASK) == ENTRY_IS_PTE); -} - -/* - * Map KBASE_REG flags to MMU flags - */ -static u64 get_mmu_flags(unsigned long flags) -{ - u64 mmu_flags; - unsigned long memattr_idx; - - memattr_idx = KBASE_REG_MEMATTR_VALUE(flags); - if (WARN(memattr_idx == AS_MEMATTR_INDEX_NON_CACHEABLE, - "Legacy Mode MMU cannot honor GPU non-cachable memory, will use default instead\n")) - memattr_idx = AS_MEMATTR_INDEX_DEFAULT; - /* store mem_attr index as 4:2, noting that: - * - macro called above ensures 3 bits already - * - all AS_MEMATTR_INDEX_<...> macros only use 3 bits - */ - mmu_flags = memattr_idx << 2; - - /* write perm if requested */ - mmu_flags |= (flags & KBASE_REG_GPU_WR) ? 
ENTRY_WR_BIT : 0; - /* read perm if requested */ - mmu_flags |= (flags & KBASE_REG_GPU_RD) ? ENTRY_RD_BIT : 0; - /* nx if requested */ - mmu_flags |= (flags & KBASE_REG_GPU_NX) ? ENTRY_NX_BIT : 0; - - if (flags & KBASE_REG_SHARE_BOTH) { - /* inner and outer shareable */ - mmu_flags |= SHARE_BOTH_BITS; - } else if (flags & KBASE_REG_SHARE_IN) { - /* inner shareable coherency */ - mmu_flags |= SHARE_INNER_BITS; - } - - return mmu_flags; -} - -static void entry_set_ate(u64 *entry, - struct tagged_addr phy, - unsigned long flags, - int const level) -{ - page_table_entry_set(entry, as_phys_addr_t(phy) | get_mmu_flags(flags) | - ENTRY_IS_ATE); -} - -static void entry_set_pte(u64 *entry, phys_addr_t phy) -{ - page_table_entry_set(entry, (phy & ~0xFFF) | ENTRY_IS_PTE); -} - -static void entry_invalidate(u64 *entry) -{ - page_table_entry_set(entry, ENTRY_IS_INVAL); -} - -static struct kbase_mmu_mode const lpae_mode = { - .update = mmu_update, - .get_as_setup = mmu_get_as_setup, - .disable_as = mmu_disable_as, - .pte_to_phy_addr = pte_to_phy_addr, - .ate_is_valid = ate_is_valid, - .pte_is_valid = pte_is_valid, - .entry_set_ate = entry_set_ate, - .entry_set_pte = entry_set_pte, - .entry_invalidate = entry_invalidate, - .flags = 0 -}; - -struct kbase_mmu_mode const *kbase_mmu_mode_get_lpae(void) -{ - return &lpae_mode; -} diff --git a/mali_kbase/tests/Mconfig b/mali_kbase/tests/Mconfig index 2630736..a21810b 100644 --- a/mali_kbase/tests/Mconfig +++ b/mali_kbase/tests/Mconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # -# (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -40,6 +40,10 @@ config BUILD_CSF_TESTS config BUILD_ARBIF_TESTS bool - default y if UNIT_TEST_KERNEL_MODULES && MALI_ARBITER_SUPPORT + default y if UNIT_TEST_CODE && MALI_ARBITER_SUPPORT default n +config BUILD_ARBIF_KERNEL_TESTS + bool + default y if BUILD_KERNEL_MODULES && BUILD_ARBIF_TESTS + default n diff --git a/mali_kbase/tl/backend/mali_kbase_timeline_csf.c b/mali_kbase/tl/backend/mali_kbase_timeline_csf.c index 42f1e2d..7455ce2 100644 --- a/mali_kbase/tl/backend/mali_kbase_timeline_csf.c +++ b/mali_kbase/tl/backend/mali_kbase_timeline_csf.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -159,7 +159,7 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev) * this iteration of the loop, so will start to correctly update * the object model state. */ - }; + } mutex_unlock(&timeline->tl_kctx_list_lock); diff --git a/mali_kbase/tl/backend/mali_kbase_timeline_jm.c b/mali_kbase/tl/backend/mali_kbase_timeline_jm.c index f016e8b..6659d2d 100644 --- a/mali_kbase/tl/backend/mali_kbase_timeline_jm.c +++ b/mali_kbase/tl/backend/mali_kbase_timeline_jm.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -74,7 +74,7 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev) kctx, kctx->id, (u32)(kctx->tgid)); - }; + } /* Reset body stream buffers while holding the kctx lock. * This ensures we can't fire both summary and normal tracepoints for diff --git a/mali_kbase/tl/mali_kbase_timeline.c b/mali_kbase/tl/mali_kbase_timeline.c index 4f955a1..20d7b16 100644 --- a/mali_kbase/tl/mali_kbase_timeline.c +++ b/mali_kbase/tl/mali_kbase_timeline.c @@ -186,7 +186,7 @@ static void kbase_tlstream_current_devfreq_target(struct kbase_device *kbdev) int kbase_timeline_io_acquire(struct kbase_device *kbdev, u32 flags) { - int ret; + int ret = 0; u32 timeline_flags = TLSTREAM_ENABLED | flags; struct kbase_timeline *timeline = kbdev->timeline; @@ -262,6 +262,9 @@ int kbase_timeline_io_acquire(struct kbase_device *kbdev, u32 flags) ret = -EBUSY; } + if (ret >= 0) + timeline->last_acquire_time = ktime_get(); + return ret; } diff --git a/mali_kbase/tl/mali_kbase_timeline.h b/mali_kbase/tl/mali_kbase_timeline.h index 9315fcc..0465352 100644 --- a/mali_kbase/tl/mali_kbase_timeline.h +++ b/mali_kbase/tl/mali_kbase_timeline.h @@ -107,32 +107,6 @@ void kbase_timeline_pre_kbase_context_destroy(struct kbase_context *kctx); void kbase_timeline_post_kbase_context_destroy(struct kbase_context *kctx); #if MALI_UNIT_TEST -/** - * kbase_timeline_test - start timeline stream data generator - * @kbdev: Kernel common context - * @tpw_count: Number of trace point writers in each context - * @msg_delay: Time delay in milliseconds between trace points written by one - * writer - * @msg_count: Number of trace points written by one writer - * @aux_msg: If non-zero aux messages will be included - * - * This test starts a requested number of asynchronous writers in both IRQ and - * thread context. Each writer will generate required number of test - * tracepoints (tracepoints with embedded information about writer that - * should be verified by user space reader). Tracepoints will be emitted in - * all timeline body streams. If aux_msg is non-zero writer will also - * generate not testable tracepoints (tracepoints without information about - * writer). These tracepoints are used to check correctness of remaining - * timeline message generating functions. Writer will wait requested time - * between generating another set of messages. This call blocks until all - * writers finish. - */ -void kbase_timeline_test( - struct kbase_device *kbdev, - unsigned int tpw_count, - unsigned int msg_delay, - unsigned int msg_count, - int aux_msg); /** * kbase_timeline_stats - read timeline stream statistics diff --git a/mali_kbase/tl/mali_kbase_timeline_io.c b/mali_kbase/tl/mali_kbase_timeline_io.c index 8587ba0..e3b6fbc 100644 --- a/mali_kbase/tl/mali_kbase_timeline_io.c +++ b/mali_kbase/tl/mali_kbase_timeline_io.c @@ -24,6 +24,7 @@ #include "mali_kbase_tracepoints.h" #include "mali_kbase_timeline.h" +#include <linux/delay.h> #include <linux/poll.h> /* The timeline stream file operations functions. 
*/ @@ -46,7 +47,8 @@ const struct file_operations kbasep_tlstream_fops = { /** * kbasep_timeline_io_packet_pending - check timeline streams for pending - *packets + * packets + * * @timeline: Timeline instance * @ready_stream: Pointer to variable where stream will be placed * @rb_idx_raw: Pointer to variable where read buffer index will be placed @@ -86,8 +88,8 @@ kbasep_timeline_io_packet_pending(struct kbase_timeline *timeline, } /** - * kbasep_timeline_has_header_data() - - * check timeline headers for pending packets + * kbasep_timeline_has_header_data() - check timeline headers for pending + * packets * * @timeline: Timeline instance * @@ -139,6 +141,7 @@ static inline int copy_stream_header(char __user *buffer, size_t size, /** * kbasep_timeline_copy_header - copy timeline headers to the user + * * @timeline: Timeline instance * @buffer: Pointer to the buffer provided by user * @size: Maximum amount of data that can be stored in the buffer @@ -174,6 +177,7 @@ static inline int kbasep_timeline_copy_headers(struct kbase_timeline *timeline, /** * kbasep_timeline_io_read - copy data from streams to buffer provided by user + * * @filp: Pointer to file structure * @buffer: Pointer to the buffer provided by user * @size: Maximum amount of data that can be stored in the buffer @@ -198,7 +202,7 @@ static ssize_t kbasep_timeline_io_read(struct file *filp, char __user *buffer, if (!buffer) return -EINVAL; - if ((*f_pos < 0) || (size < PACKET_SIZE)) + if (*f_pos < 0) return -EINVAL; mutex_lock(&timeline->reader_lock); @@ -217,10 +221,10 @@ static ssize_t kbasep_timeline_io_read(struct file *filp, char __user *buffer, } /* If we already read some packets and there is no - * packet pending then return back to user. - * If we don't have any data yet, wait for packet to be - * submitted. - */ + * packet pending then return back to user. + * If we don't have any data yet, wait for packet to be + * submitted. + */ if (copy_len > 0) { if (!kbasep_timeline_io_packet_pending( timeline, &stream, &rb_idx_raw)) @@ -241,8 +245,8 @@ static ssize_t kbasep_timeline_io_read(struct file *filp, char __user *buffer, } /* Check if this packet fits into the user buffer. - * If so copy its content. - */ + * If so copy its content. + */ rb_idx = rb_idx_raw % PACKET_COUNT; rb_size = atomic_read(&stream->buffer[rb_idx].size); if (rb_size > size - copy_len) @@ -254,10 +258,10 @@ static ssize_t kbasep_timeline_io_read(struct file *filp, char __user *buffer, } /* If the distance between read buffer index and write - * buffer index became more than PACKET_COUNT, then overflow - * happened and we need to ignore the last portion of bytes - * that we have just sent to user. - */ + * buffer index became more than PACKET_COUNT, then overflow + * happened and we need to ignore the last portion of bytes + * that we have just sent to user. 
+ */ smp_rmb(); wb_idx_raw = atomic_read(&stream->wbi); @@ -321,6 +325,8 @@ static unsigned int kbasep_timeline_io_poll(struct file *filp, poll_table *wait) static int kbasep_timeline_io_release(struct inode *inode, struct file *filp) { struct kbase_timeline *timeline; + ktime_t elapsed_time; + s64 elapsed_time_ms, time_to_sleep; KBASE_DEBUG_ASSERT(inode); KBASE_DEBUG_ASSERT(filp); @@ -330,6 +336,18 @@ static int kbasep_timeline_io_release(struct inode *inode, struct file *filp) timeline = (struct kbase_timeline *)filp->private_data; + /* Get the amount of time passed since the timeline was acquired and ensure + * we sleep for long enough such that it has been at least + * TIMELINE_HYSTERESIS_TIMEOUT_MS amount of time between acquire and release. + * This prevents userspace from spamming acquire and release too quickly. + */ + elapsed_time = ktime_sub(ktime_get(), timeline->last_acquire_time); + elapsed_time_ms = ktime_to_ms(elapsed_time); + time_to_sleep = MIN(TIMELINE_HYSTERESIS_TIMEOUT_MS, + TIMELINE_HYSTERESIS_TIMEOUT_MS - elapsed_time_ms); + if (time_to_sleep > 0) + msleep(time_to_sleep); + #if MALI_USE_CSF kbase_csf_tl_reader_stop(&timeline->csf_tl_reader); #endif diff --git a/mali_kbase/tl/mali_kbase_timeline_priv.h b/mali_kbase/tl/mali_kbase_timeline_priv.h index 2825f77..8a58a13 100644 --- a/mali_kbase/tl/mali_kbase_timeline_priv.h +++ b/mali_kbase/tl/mali_kbase_timeline_priv.h @@ -34,6 +34,11 @@ #include <linux/atomic.h> #include <linux/mutex.h> +/* The minimum amount of time timeline must be acquired for before release is + * allowed, to prevent DoS attacks. + */ +#define TIMELINE_HYSTERESIS_TIMEOUT_MS ((s64)500) + /** * struct kbase_timeline - timeline state structure * @streams: The timeline streams generated by kernel @@ -49,6 +54,7 @@ * otherwise. See kbase_timeline_io_acquire(). * @obj_header_btc: Remaining bytes to copy for the object stream header * @aux_header_btc: Remaining bytes to copy for the aux stream header + * @last_acquire_time: The time at which timeline was last acquired. * @csf_tl_reader: CSFFW timeline reader */ struct kbase_timeline { @@ -65,6 +71,7 @@ struct kbase_timeline { atomic_t *timeline_flags; size_t obj_header_btc; size_t aux_header_btc; + ktime_t last_acquire_time; #if MALI_USE_CSF struct kbase_csf_tl_reader csf_tl_reader; #endif diff --git a/mali_kbase/tl/mali_kbase_tlstream.c b/mali_kbase/tl/mali_kbase_tlstream.c index c6eb3c8..202c12f 100644 --- a/mali_kbase/tl/mali_kbase_tlstream.c +++ b/mali_kbase/tl/mali_kbase_tlstream.c @@ -56,20 +56,19 @@ static void kbasep_packet_header_setup( * @numbered: non-zero if the stream is numbered * * Function updates mutable part of packet header in the given buffer. - * Note that value of data_size must not including size of the header. + * Note that value of data_size must not include size of the header. 
*/ static void kbasep_packet_header_update( char *buffer, size_t data_size, int numbered) { - u32 word0; u32 word1 = MIPE_PACKET_HEADER_W1((u32)data_size, !!numbered); KBASE_DEBUG_ASSERT(buffer); - CSTD_UNUSED(word0); - memcpy(&buffer[sizeof(word0)], &word1, sizeof(word1)); + /* we copy the contents of word1 to its respective position in the buffer */ + memcpy(&buffer[sizeof(u32)], &word1, sizeof(word1)); } /** diff --git a/mali_kbase/tl/mali_kbase_tracepoints.c b/mali_kbase/tl/mali_kbase_tracepoints.c index 479f0f4..ece23b3 100644 --- a/mali_kbase/tl/mali_kbase_tracepoints.c +++ b/mali_kbase/tl/mali_kbase_tracepoints.c @@ -69,6 +69,7 @@ enum tl_msg_id_obj { KBASE_TL_ARBITER_STARTED, KBASE_TL_ARBITER_STOP_REQUESTED, KBASE_TL_ARBITER_STOPPED, + KBASE_TL_ARBITER_REQUESTED, KBASE_JD_GPU_SOFT_RESET, KBASE_TL_KBASE_NEW_DEVICE, KBASE_TL_KBASE_DEVICE_PROGRAM_CSG, @@ -288,6 +289,10 @@ enum tl_msg_id_aux { "Driver has stopped using gpu", \ "@p", \ "gpu") \ + TRACEPOINT_DESC(KBASE_TL_ARBITER_REQUESTED, \ + "Driver has requested the arbiter for gpu access", \ + "@p", \ + "gpu") \ TRACEPOINT_DESC(KBASE_JD_GPU_SOFT_RESET, \ "gpu soft reset", \ "@p", \ @@ -1565,6 +1570,28 @@ void __kbase_tlstream_tl_arbiter_stopped( kbase_tlstream_msgbuf_release(stream, acq_flags); } +void __kbase_tlstream_tl_arbiter_requested( + struct kbase_tlstream *stream, + const void *gpu) +{ + const u32 msg_id = KBASE_TL_ARBITER_REQUESTED; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(gpu) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &gpu, sizeof(gpu)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + void __kbase_tlstream_jd_gpu_soft_reset( struct kbase_tlstream *stream, const void *gpu) diff --git a/mali_kbase/tl/mali_kbase_tracepoints.h b/mali_kbase/tl/mali_kbase_tracepoints.h index a3fd7c1..f3f554a 100644 --- a/mali_kbase/tl/mali_kbase_tracepoints.h +++ b/mali_kbase/tl/mali_kbase_tracepoints.h @@ -237,6 +237,9 @@ void __kbase_tlstream_tl_arbiter_stop_requested( void __kbase_tlstream_tl_arbiter_stopped( struct kbase_tlstream *stream, const void *gpu); +void __kbase_tlstream_tl_arbiter_requested( + struct kbase_tlstream *stream, + const void *gpu); void __kbase_tlstream_jd_gpu_soft_reset( struct kbase_tlstream *stream, const void *gpu); @@ -1301,6 +1304,25 @@ struct kbase_tlstream; } while (0) /** + * KBASE_TLSTREAM_TL_ARBITER_REQUESTED - + * Driver has requested the arbiter for gpu access + * + * @kbdev: Kbase device + * @gpu: Name of the GPU object + */ +#define KBASE_TLSTREAM_TL_ARBITER_REQUESTED( \ + kbdev, \ + gpu \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_arbiter_requested( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + gpu); \ + } while (0) + +/** * KBASE_TLSTREAM_JD_GPU_SOFT_RESET - * gpu soft reset * diff --git a/mali_pixel/memory_group_manager.c b/mali_pixel/memory_group_manager.c deleted file mode 100644 index 6e10722..0000000 --- a/mali_pixel/memory_group_manager.c +++ /dev/null @@ -1,492 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * - * (C) COPYRIGHT 2019-2020 ARM Limited. All rights reserved. 
- * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU license. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - */ - -#include <linux/fs.h> -#include <linux/of.h> -#include <linux/slab.h> -#include <linux/platform_device.h> -#include <linux/version.h> -#include <linux/module.h> -#ifdef CONFIG_DEBUG_FS -#include <linux/debugfs.h> -#endif -#include <linux/mm.h> -#include <linux/memory_group_manager.h> - -#if (KERNEL_VERSION(4, 20, 0) > LINUX_VERSION_CODE) -static inline vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, pgprot_t pgprot) -{ - int err; - -#if ((KERNEL_VERSION(4, 4, 147) >= LINUX_VERSION_CODE) || \ - ((KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE) && \ - (KERNEL_VERSION(4, 5, 0) <= LINUX_VERSION_CODE))) - if (pgprot_val(pgprot) != pgprot_val(vma->vm_page_prot)) - return VM_FAULT_SIGBUS; - - err = vm_insert_pfn(vma, addr, pfn); -#else - err = vm_insert_pfn_prot(vma, addr, pfn, pgprot); -#endif - - if (unlikely(err == -ENOMEM)) - return VM_FAULT_OOM; - if (unlikely(err < 0 && err != -EBUSY)) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} -#endif - -#define IMPORTED_MEMORY_ID (MEMORY_GROUP_MANAGER_NR_GROUPS - 1) - -/** - * struct mgm_group - Structure to keep track of the number of allocated - * pages per group - * - * @size: The number of allocated small(4KB) pages - * @lp_size: The number of allocated large(2MB) pages - * @insert_pfn: The number of calls to map pages for CPU access. - * @update_gpu_pte: The number of calls to update GPU page table entries. - * - * This structure allows page allocation information to be displayed via - * debugfs. Display is organized per group with small and large sized pages. - */ -struct mgm_group { - size_t size; - size_t lp_size; - size_t insert_pfn; - size_t update_gpu_pte; -}; - -/** - * struct mgm_groups - Structure for groups of memory group manager - * - * @groups: To keep track of the number of allocated pages of all groups - * @dev: device attached - * @mgm_debugfs_root: debugfs root directory of memory group manager - * - * This structure allows page allocation information to be displayed via - * debugfs. Display is organized per group with small and large sized pages. 
- */ -struct mgm_groups { - struct mgm_group groups[MEMORY_GROUP_MANAGER_NR_GROUPS]; - struct device *dev; -#ifdef CONFIG_DEBUG_FS - struct dentry *mgm_debugfs_root; -#endif -}; - -#ifdef CONFIG_DEBUG_FS - -static int mgm_size_get(void *data, u64 *val) -{ - struct mgm_group *group = data; - - *val = group->size; - - return 0; -} - -static int mgm_lp_size_get(void *data, u64 *val) -{ - struct mgm_group *group = data; - - *val = group->lp_size; - - return 0; -} - -static int mgm_insert_pfn_get(void *data, u64 *val) -{ - struct mgm_group *group = data; - - *val = group->insert_pfn; - - return 0; -} - -static int mgm_update_gpu_pte_get(void *data, u64 *val) -{ - struct mgm_group *group = data; - - *val = group->update_gpu_pte; - - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(fops_mgm_size, mgm_size_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_mgm_lp_size, mgm_lp_size_get, NULL, "%llu\n"); - -DEFINE_SIMPLE_ATTRIBUTE(fops_mgm_insert_pfn, mgm_insert_pfn_get, NULL, - "%llu\n"); - -DEFINE_SIMPLE_ATTRIBUTE(fops_mgm_update_gpu_pte, mgm_update_gpu_pte_get, NULL, - "%llu\n"); - -static void mgm_term_debugfs(struct mgm_groups *data) -{ - debugfs_remove_recursive(data->mgm_debugfs_root); -} - -#define MGM_DEBUGFS_GROUP_NAME_MAX 10 -static int mgm_initialize_debugfs(struct mgm_groups *mgm_data) -{ - int i; - struct dentry *e, *g; - char debugfs_group_name[MGM_DEBUGFS_GROUP_NAME_MAX]; - - /* - * Create root directory of memory-group-manager - */ - mgm_data->mgm_debugfs_root = - debugfs_create_dir("physical-memory-group-manager", NULL); - if (IS_ERR(mgm_data->mgm_debugfs_root)) { - dev_err(mgm_data->dev, "fail to create debugfs root directory\n"); - return -ENODEV; - } - - /* - * Create debugfs files per group - */ - for (i = 0; i < MEMORY_GROUP_MANAGER_NR_GROUPS; i++) { - scnprintf(debugfs_group_name, MGM_DEBUGFS_GROUP_NAME_MAX, - "group_%d", i); - g = debugfs_create_dir(debugfs_group_name, - mgm_data->mgm_debugfs_root); - if (IS_ERR(g)) { - dev_err(mgm_data->dev, "fail to create group[%d]\n", i); - goto remove_debugfs; - } - - e = debugfs_create_file("size", 0444, g, &mgm_data->groups[i], - &fops_mgm_size); - if (IS_ERR(e)) { - dev_err(mgm_data->dev, "fail to create size[%d]\n", i); - goto remove_debugfs; - } - - e = debugfs_create_file("lp_size", 0444, g, - &mgm_data->groups[i], &fops_mgm_lp_size); - if (IS_ERR(e)) { - dev_err(mgm_data->dev, - "fail to create lp_size[%d]\n", i); - goto remove_debugfs; - } - - e = debugfs_create_file("insert_pfn", 0444, g, - &mgm_data->groups[i], &fops_mgm_insert_pfn); - if (IS_ERR(e)) { - dev_err(mgm_data->dev, - "fail to create insert_pfn[%d]\n", i); - goto remove_debugfs; - } - - e = debugfs_create_file("update_gpu_pte", 0444, g, - &mgm_data->groups[i], &fops_mgm_update_gpu_pte); - if (IS_ERR(e)) { - dev_err(mgm_data->dev, - "fail to create update_gpu_pte[%d]\n", i); - goto remove_debugfs; - } - } - - return 0; - -remove_debugfs: - mgm_term_debugfs(mgm_data); - return -ENODEV; -} - -#else - -static void mgm_term_debugfs(struct mgm_groups *data) -{ -} - -static int mgm_initialize_debugfs(struct mgm_groups *mgm_data) -{ - return 0; -} - -#endif /* CONFIG_DEBUG_FS */ - -#define ORDER_SMALL_PAGE 0 -#define ORDER_LARGE_PAGE 9 -static void update_size(struct memory_group_manager_device *mgm_dev, int - group_id, int order, bool alloc) -{ - struct mgm_groups *data = mgm_dev->data; - - switch (order) { - case ORDER_SMALL_PAGE: - if (alloc) - data->groups[group_id].size++; - else { - WARN_ON(data->groups[group_id].size == 0); - data->groups[group_id].size--; - } - break; - - 
case ORDER_LARGE_PAGE: - if (alloc) - data->groups[group_id].lp_size++; - else { - WARN_ON(data->groups[group_id].lp_size == 0); - data->groups[group_id].lp_size--; - } - break; - - default: - dev_err(data->dev, "Unknown order(%d)\n", order); - break; - } -} - -static struct page *example_mgm_alloc_page( - struct memory_group_manager_device *mgm_dev, int group_id, - gfp_t gfp_mask, unsigned int order) -{ - struct mgm_groups *const data = mgm_dev->data; - struct page *p; - - dev_dbg(data->dev, "%s(mgm_dev=%p, group_id=%d gfp_mask=0x%x order=%u\n", - __func__, (void *)mgm_dev, group_id, gfp_mask, order); - - if (WARN_ON(group_id < 0) || - WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) - return NULL; - - p = alloc_pages(gfp_mask, order); - - if (p) { - update_size(mgm_dev, group_id, order, true); - } else { - struct mgm_groups *data = mgm_dev->data; - - dev_err(data->dev, "alloc_pages failed\n"); - } - - return p; -} - -static void example_mgm_free_page( - struct memory_group_manager_device *mgm_dev, int group_id, - struct page *page, unsigned int order) -{ - struct mgm_groups *const data = mgm_dev->data; - - dev_dbg(data->dev, "%s(mgm_dev=%p, group_id=%d page=%p order=%u\n", - __func__, (void *)mgm_dev, group_id, (void *)page, order); - - if (WARN_ON(group_id < 0) || - WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) - return; - - __free_pages(page, order); - - update_size(mgm_dev, group_id, order, false); -} - -static int example_mgm_get_import_memory_id( - struct memory_group_manager_device *mgm_dev, - struct memory_group_manager_import_data *import_data) -{ - struct mgm_groups *const data = mgm_dev->data; - - dev_dbg(data->dev, "%s(mgm_dev=%p, import_data=%p (type=%d)\n", - __func__, (void *)mgm_dev, (void *)import_data, - (int)import_data->type); - - if (!WARN_ON(!import_data)) { - WARN_ON(!import_data->u.dma_buf); - - WARN_ON(import_data->type != - MEMORY_GROUP_MANAGER_IMPORT_TYPE_DMA_BUF); - } - - return IMPORTED_MEMORY_ID; -} - -static u64 example_mgm_update_gpu_pte( - struct memory_group_manager_device *const mgm_dev, int const group_id, - int const mmu_level, u64 pte) -{ - struct mgm_groups *const data = mgm_dev->data; - const u32 pbha_bit_pos = 59; /* bits 62:59 */ - const u32 pbha_bit_mask = 0xf; /* 4-bit */ - - dev_dbg(data->dev, - "%s(mgm_dev=%p, group_id=%d, mmu_level=%d, pte=0x%llx)\n", - __func__, (void *)mgm_dev, group_id, mmu_level, pte); - - if (WARN_ON(group_id < 0) || - WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) - return pte; - - pte |= ((u64)group_id & pbha_bit_mask) << pbha_bit_pos; - - data->groups[group_id].update_gpu_pte++; - - return pte; -} - -static vm_fault_t example_mgm_vmf_insert_pfn_prot( - struct memory_group_manager_device *const mgm_dev, int const group_id, - struct vm_area_struct *const vma, unsigned long const addr, - unsigned long const pfn, pgprot_t const prot) -{ - struct mgm_groups *const data = mgm_dev->data; - vm_fault_t fault; - - dev_dbg(data->dev, - "%s(mgm_dev=%p, group_id=%d, vma=%p, addr=0x%lx, pfn=0x%lx, prot=0x%llx)\n", - __func__, (void *)mgm_dev, group_id, (void *)vma, addr, pfn, - (unsigned long long int) pgprot_val(prot)); - - if (WARN_ON(group_id < 0) || - WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) - return VM_FAULT_SIGBUS; - - fault = vmf_insert_pfn_prot(vma, addr, pfn, prot); - - if (fault == VM_FAULT_NOPAGE) - data->groups[group_id].insert_pfn++; - else - dev_err(data->dev, "vmf_insert_pfn_prot failed\n"); - - return fault; -} - -static int mgm_initialize_data(struct mgm_groups *mgm_data) -{ - int i; - 
- for (i = 0; i < MEMORY_GROUP_MANAGER_NR_GROUPS; i++) { - mgm_data->groups[i].size = 0; - mgm_data->groups[i].lp_size = 0; - mgm_data->groups[i].insert_pfn = 0; - mgm_data->groups[i].update_gpu_pte = 0; - } - - return mgm_initialize_debugfs(mgm_data); -} - -static void mgm_term_data(struct mgm_groups *data) -{ - int i; - - for (i = 0; i < MEMORY_GROUP_MANAGER_NR_GROUPS; i++) { - if (data->groups[i].size != 0) - dev_warn(data->dev, - "%zu 0-order pages in group(%d) leaked\n", - data->groups[i].size, i); - if (data->groups[i].lp_size != 0) - dev_warn(data->dev, - "%zu 9 order pages in group(%d) leaked\n", - data->groups[i].lp_size, i); - } - - mgm_term_debugfs(data); -} - -static int memory_group_manager_probe(struct platform_device *pdev) -{ - struct memory_group_manager_device *mgm_dev; - struct mgm_groups *mgm_data; - - mgm_dev = kzalloc(sizeof(*mgm_dev), GFP_KERNEL); - if (!mgm_dev) - return -ENOMEM; - - mgm_dev->owner = THIS_MODULE; - mgm_dev->ops.mgm_alloc_page = example_mgm_alloc_page; - mgm_dev->ops.mgm_free_page = example_mgm_free_page; - mgm_dev->ops.mgm_get_import_memory_id = - example_mgm_get_import_memory_id; - mgm_dev->ops.mgm_vmf_insert_pfn_prot = example_mgm_vmf_insert_pfn_prot; - mgm_dev->ops.mgm_update_gpu_pte = example_mgm_update_gpu_pte; - - mgm_data = kzalloc(sizeof(*mgm_data), GFP_KERNEL); - if (!mgm_data) { - kfree(mgm_dev); - return -ENOMEM; - } - - mgm_dev->data = mgm_data; - mgm_data->dev = &pdev->dev; - - if (mgm_initialize_data(mgm_data)) { - kfree(mgm_data); - kfree(mgm_dev); - return -ENOENT; - } - - platform_set_drvdata(pdev, mgm_dev); - dev_info(&pdev->dev, "Memory group manager probed successfully\n"); - - return 0; -} - -static int memory_group_manager_remove(struct platform_device *pdev) -{ - struct memory_group_manager_device *mgm_dev = - platform_get_drvdata(pdev); - struct mgm_groups *mgm_data = mgm_dev->data; - - mgm_term_data(mgm_data); - kfree(mgm_data); - - kfree(mgm_dev); - - dev_info(&pdev->dev, "Memory group manager removed successfully\n"); - - return 0; -} - -static const struct of_device_id memory_group_manager_dt_ids[] = { - { .compatible = "arm,physical-memory-group-manager" }, - { /* sentinel */ } -}; -MODULE_DEVICE_TABLE(of, memory_group_manager_dt_ids); - -static struct platform_driver memory_group_manager_driver = { - .probe = memory_group_manager_probe, - .remove = memory_group_manager_remove, - .driver = { - .name = "physical-memory-group-manager", - .owner = THIS_MODULE, - .of_match_table = of_match_ptr(memory_group_manager_dt_ids), - /* - * Prevent the mgm_dev from being unbound and freed, as other's - * may have pointers to it and would get confused, or crash, if - * it suddenly disappear. - */ - .suppress_bind_attrs = true, - } -}; - -module_platform_driver(memory_group_manager_driver); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("ARM Ltd."); -MODULE_VERSION("1.0"); |
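[Editor's note on the KBASE_TL_ARBITER_REQUESTED tracepoint introduced earlier in this patch (tl/mali_kbase_tracepoints.*): only the serialisation side is defined here, no call site is included in this excerpt. The snippet below is a hypothetical illustration of how arbiter code would emit it, following the pattern of the other KBASE_TLSTREAM_TL_ARBITER_* wrapper macros, which are no-ops unless the timeline stream has been acquired (TLSTREAM_ENABLED set in kbdev->timeline_flags).]

	/* Hypothetical call site, e.g. where the driver requests GPU access
	 * from the arbiter. The second argument is the GPU object pointer
	 * serialised into the object stream; passing kbdev itself here is an
	 * assumption mirroring the other arbiter tracepoints.
	 */
	KBASE_TLSTREAM_TL_ARBITER_REQUESTED(kbdev, kbdev);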