-rw-r--r--  common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h  57
-rw-r--r--  common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h  13
-rw-r--r--  common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h  56
-rw-r--r--  common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h  11
-rw-r--r--  common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h  6
-rw-r--r--  common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h  4
-rw-r--r--  common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h  154
-rw-r--r--  common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h  28
-rw-r--r--  mali_kbase/Kbuild  7
-rw-r--r--  mali_kbase/Makefile  17
-rw-r--r--  mali_kbase/arbiter/mali_kbase_arbiter_pm.c  4
-rw-r--r--  mali_kbase/arbitration/Kconfig  18
-rw-r--r--  mali_kbase/backend/gpu/Kbuild  5
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h  3
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_devfreq.c  14
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_devfreq.h  1
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c  8
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_instr_backend.c  117
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_instr_defs.h  8
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_jm_defs.h  25
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_jm_hw.c  50
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_jm_rb.c  81
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_l2_mmu_config.c  4
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_model_dummy.c  2008
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_model_dummy.h  177
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_model_error_generator.c  174
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_model_linux.c  254
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_model_linux.h  32
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_pm_backend.c  51
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_pm_ca.c  7
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_pm_ca.h  4
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_pm_ca_devfreq.h  6
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_pm_coarse_demand.h  4
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_pm_defs.h  27
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_pm_driver.c  168
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_pm_internal.h  128
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h  75
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_pm_policy.c  7
-rw-r--r--  mali_kbase/backend/gpu/mali_kbase_time.c  4
-rw-r--r--  mali_kbase/context/backend/mali_kbase_context_csf.c  1
-rw-r--r--  mali_kbase/context/mali_kbase_context.c  2
-rw-r--r--  mali_kbase/csf/Kbuild  4
-rw-r--r--  mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c  24
-rw-r--r--  mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.h  2
-rw-r--r--  mali_kbase/csf/mali_kbase_csf.c  432
-rw-r--r--  mali_kbase/csf/mali_kbase_csf.h  204
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_defs.h  86
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_event.c  253
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_event.h  171
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_firmware.c  78
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_firmware.h  116
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_firmware_cfg.c  8
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_firmware_cfg.h  14
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c  26
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_heap_context_alloc.h  12
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_kcpu.c  19
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_kcpu.h  8
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_reset_gpu.c  1
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_scheduler.c  226
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_scheduler.h  11
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_tiler_heap.c  86
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_tiler_heap.h  24
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_tiler_heap_debugfs.c  58
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_tiler_heap_debugfs.h  7
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_tl_reader.c  20
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_tl_reader.h  15
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_trace_buffer.c  33
-rw-r--r--  mali_kbase/csf/mali_kbase_csf_trace_buffer.h  53
-rw-r--r--  mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_csf.h  7
-rw-r--r--  mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_csf.h  3
-rw-r--r--  mali_kbase/debug/mali_kbase_debug_ktrace.h  1
-rw-r--r--  mali_kbase/debug/mali_kbase_debug_ktrace_internal.h  2
-rw-r--r--  mali_kbase/device/backend/mali_kbase_device_csf.c  39
-rw-r--r--  mali_kbase/device/backend/mali_kbase_device_jm.c  10
-rw-r--r--  mali_kbase/device/mali_kbase_device.c  2
-rw-r--r--  mali_kbase/device/mali_kbase_device.h  32
-rw-r--r--  mali_kbase/device/mali_kbase_device_hw.c  120
-rw-r--r--  mali_kbase/ipa/backend/mali_kbase_ipa_counter_jm.c  9
-rw-r--r--  mali_kbase/ipa/mali_kbase_ipa.c  16
-rw-r--r--  mali_kbase/jm/mali_kbase_jm_defs.h  4
-rw-r--r--  mali_kbase/mali_base_hwconfig_features.h  38
-rw-r--r--  mali_kbase/mali_base_hwconfig_issues.h  98
-rw-r--r--  mali_kbase/mali_kbase_core_linux.c  165
-rw-r--r--  mali_kbase/mali_kbase_ctx_sched.c  7
-rw-r--r--  mali_kbase/mali_kbase_defs.h  38
-rw-r--r--  mali_kbase/mali_kbase_gpuprops.c  1
-rw-r--r--  mali_kbase/mali_kbase_hwaccess_instr.h  23
-rw-r--r--  mali_kbase/mali_kbase_hwaccess_jm.h  17
-rw-r--r--  mali_kbase/mali_kbase_hwaccess_pm.h  4
-rw-r--r--  mali_kbase/mali_kbase_hwcnt.c  1
-rw-r--r--  mali_kbase/mali_kbase_hwcnt_backend_csf.c  238
-rw-r--r--  mali_kbase/mali_kbase_hwcnt_backend_csf.h  10
-rw-r--r--  mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c  29
-rw-r--r--  mali_kbase/mali_kbase_hwcnt_backend_jm.c  45
-rw-r--r--  mali_kbase/mali_kbase_hwcnt_legacy.c  179
-rw-r--r--  mali_kbase/mali_kbase_hwcnt_legacy.h  93
-rw-r--r--  mali_kbase/mali_kbase_hwcnt_watchdog_if.h  90
-rw-r--r--  mali_kbase/mali_kbase_hwcnt_watchdog_if_timer.c  159
-rw-r--r--  mali_kbase/mali_kbase_hwcnt_watchdog_if_timer.h  50
-rw-r--r--  mali_kbase/mali_kbase_jd.c  4
-rw-r--r--  mali_kbase/mali_kbase_kinstr_prfcnt.c  1254
-rw-r--r--  mali_kbase/mali_kbase_kinstr_prfcnt.h  45
-rw-r--r--  mali_kbase/mali_kbase_mem.c  4
-rw-r--r--  mali_kbase/mali_kbase_mem.h  4
-rw-r--r--  mali_kbase/mali_kbase_mem_linux.c  10
-rw-r--r--  mali_kbase/mali_kbase_pm.c  29
-rw-r--r--  mali_kbase/mali_kbase_pm.h  8
-rw-r--r--  mali_kbase/mali_kbase_regs_history_debugfs.h  2
-rw-r--r--  mali_kbase/mali_kbase_softjobs.c  17
-rw-r--r--  mali_kbase/mali_kbase_sync.h  15
-rw-r--r--  mali_kbase/mali_malisw.h  11
-rw-r--r--  mali_kbase/mmu/mali_kbase_mmu.c  148
-rw-r--r--  mali_kbase/mmu/mali_kbase_mmu.h  5
-rw-r--r--  mali_kbase/mmu/mali_kbase_mmu_hw.h  2
-rw-r--r--  mali_kbase/mmu/mali_kbase_mmu_hw_direct.c  21
-rw-r--r--  mali_kbase/tests/include/kutf/kutf_utils.h  3
-rw-r--r--  mali_kbase/tl/backend/mali_kbase_timeline_csf.c  18
-rw-r--r--  mali_kbase/tl/mali_kbase_tracepoints.c  206
-rw-r--r--  mali_kbase/tl/mali_kbase_tracepoints.h  238
119 files changed, 7469 insertions, 1916 deletions
diff --git a/common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h b/common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h
new file mode 100644
index 0000000..9d677ca
--- /dev/null
+++ b/common/include/uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+/*
+ * Dummy Model interface
+ */
+
+#ifndef _UAPI_KBASE_MODEL_DUMMY_H_
+#define _UAPI_KBASE_MODEL_DUMMY_H_
+
+#include <linux/types.h>
+
+#define KBASE_DUMMY_MODEL_COUNTER_HEADER_DWORDS (4)
+#define KBASE_DUMMY_MODEL_COUNTER_PER_CORE (60)
+#define KBASE_DUMMY_MODEL_COUNTERS_PER_BIT (4)
+#define KBASE_DUMMY_MODEL_COUNTER_ENABLED(enable_mask, ctr_idx) \
+ (enable_mask & (1 << (ctr_idx / KBASE_DUMMY_MODEL_COUNTERS_PER_BIT)))
+
+#define KBASE_DUMMY_MODEL_HEADERS_PER_BLOCK 4
+#define KBASE_DUMMY_MODEL_COUNTERS_PER_BLOCK 60
+#define KBASE_DUMMY_MODEL_VALUES_PER_BLOCK \
+ (KBASE_DUMMY_MODEL_COUNTERS_PER_BLOCK + \
+ KBASE_DUMMY_MODEL_HEADERS_PER_BLOCK)
+#define KBASE_DUMMY_MODEL_BLOCK_SIZE \
+ (KBASE_DUMMY_MODEL_VALUES_PER_BLOCK * sizeof(__u32))
+#define KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS 8
+#define KBASE_DUMMY_MODEL_MAX_SHADER_CORES 32
+#define KBASE_DUMMY_MODEL_MAX_NUM_PERF_BLOCKS \
+ (1 + 1 + KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS + KBASE_DUMMY_MODEL_MAX_SHADER_CORES)
+#define KBASE_DUMMY_MODEL_COUNTER_TOTAL \
+ (KBASE_DUMMY_MODEL_MAX_NUM_PERF_BLOCKS * \
+ KBASE_DUMMY_MODEL_COUNTER_PER_CORE)
+
+#define DUMMY_IMPLEMENTATION_SHADER_PRESENT (0xFull)
+#define DUMMY_IMPLEMENTATION_TILER_PRESENT (0x1ull)
+#define DUMMY_IMPLEMENTATION_L2_PRESENT (0x1ull)
+#define DUMMY_IMPLEMENTATION_STACK_PRESENT (0xFull)
+
+#endif /* _UAPI_KBASE_MODEL_DUMMY_H_ */
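[Editor's note] A minimal user-space sketch (not part of the patch) of how the dummy-model sizing and enable-mask macros above combine; the enable mask and counter index are hypothetical, and the constants are copied by hand rather than included from the header:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* 60 counters + 4 header words per block, each value is a u32 */
        const size_t block_size = (60 + 4) * sizeof(uint32_t);
        /* At most 1 FE + 1 tiler + 8 memsys + 32 shader-core blocks */
        const unsigned max_blocks = 1 + 1 + 8 + 32;

        /* One enable bit covers 4 consecutive counters in a block */
        const uint32_t enable_mask = 0x3;  /* hypothetical: counters 0..7 */
        const unsigned ctr_idx = 5;
        const int enabled = !!(enable_mask & (1u << (ctr_idx / 4)));

        printf("max dump size: %zu bytes, counter %u enabled: %d\n",
               max_blocks * block_size, ctr_idx, enabled);
        return 0;
    }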
diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h b/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h
index a5dc745..1d15f57 100644
--- a/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h
+++ b/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h
@@ -251,8 +251,20 @@
/* CS_KERNEL_INPUT_BLOCK register set definitions */
/* GLB_VERSION register */
#define GLB_VERSION_PATCH_SHIFT (0)
+#define GLB_VERSION_PATCH_MASK ((0xFFFF) << GLB_VERSION_PATCH_SHIFT)
+#define GLB_VERSION_PATCH_GET(reg_val) (((reg_val)&GLB_VERSION_PATCH_MASK) >> GLB_VERSION_PATCH_SHIFT)
+#define GLB_VERSION_PATCH_SET(reg_val, value) \
+ (((reg_val) & ~GLB_VERSION_PATCH_MASK) | (((value) << GLB_VERSION_PATCH_SHIFT) & GLB_VERSION_PATCH_MASK))
#define GLB_VERSION_MINOR_SHIFT (16)
+#define GLB_VERSION_MINOR_MASK ((0xFF) << GLB_VERSION_MINOR_SHIFT)
+#define GLB_VERSION_MINOR_GET(reg_val) (((reg_val)&GLB_VERSION_MINOR_MASK) >> GLB_VERSION_MINOR_SHIFT)
+#define GLB_VERSION_MINOR_SET(reg_val, value) \
+ (((reg_val) & ~GLB_VERSION_MINOR_MASK) | (((value) << GLB_VERSION_MINOR_SHIFT) & GLB_VERSION_MINOR_MASK))
#define GLB_VERSION_MAJOR_SHIFT (24)
+#define GLB_VERSION_MAJOR_MASK ((0xFF) << GLB_VERSION_MAJOR_SHIFT)
+#define GLB_VERSION_MAJOR_GET(reg_val) (((reg_val)&GLB_VERSION_MAJOR_MASK) >> GLB_VERSION_MAJOR_SHIFT)
+#define GLB_VERSION_MAJOR_SET(reg_val, value) \
+ (((reg_val) & ~GLB_VERSION_MAJOR_MASK) | (((value) << GLB_VERSION_MAJOR_SHIFT) & GLB_VERSION_MAJOR_MASK))
/* CS_REQ register */
#define CS_REQ_STATE_SHIFT 0
@@ -935,6 +947,7 @@
(((reg_val) & ~CSG_PROTM_SUSPEND_BUF_POINTER_MASK) | \
(((value) << CSG_PROTM_SUSPEND_BUF_POINTER_SHIFT) & CSG_PROTM_SUSPEND_BUF_POINTER_MASK))
+
/* End of CSG_INPUT_BLOCK register set definitions */
/* CSG_OUTPUT_BLOCK register set definitions */
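[Editor's note] As a hedged aside (not part of the patch), the new GLB_VERSION accessor macros added above decode a firmware interface version as follows; the register value here is invented for illustration and the header is assumed to be included:

    /* Hypothetical GLB_VERSION content: major 3, minor 1, patch 7 */
    u32 glb_version = (3u << GLB_VERSION_MAJOR_SHIFT) |
                      (1u << GLB_VERSION_MINOR_SHIFT) |
                      (7u << GLB_VERSION_PATCH_SHIFT);

    u32 major = GLB_VERSION_MAJOR_GET(glb_version); /* == 3 */
    u32 minor = GLB_VERSION_MINOR_GET(glb_version); /* == 1 */
    u32 patch = GLB_VERSION_PATCH_GET(glb_version); /* == 7 */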
diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h b/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h
index ec4870c..3df8a01 100644
--- a/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h
+++ b/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h
@@ -46,10 +46,14 @@
* trace configurations with CSF trace_command.
* 1.6:
* - Added new HW performance counters interface to all GPUs.
+ * 1.7:
+ * - Added reserved field to QUEUE_GROUP_CREATE ioctl for future use
+ * 1.8:
+ * - Removed Kernel legacy HWC interface
*/
#define BASE_UK_VERSION_MAJOR 1
-#define BASE_UK_VERSION_MINOR 5
+#define BASE_UK_VERSION_MINOR 8
/**
* struct kbase_ioctl_version_check - Check version compatibility between
@@ -179,6 +183,50 @@ struct kbase_ioctl_cs_queue_terminate {
_IOW(KBASE_IOCTL_TYPE, 41, struct kbase_ioctl_cs_queue_terminate)
/**
+ * union kbase_ioctl_cs_queue_group_create_1_6 - Create a GPU command queue
+ * group
+ * @in: Input parameters
+ * @in.tiler_mask: Mask of tiler endpoints the group is allowed to use.
+ * @in.fragment_mask: Mask of fragment endpoints the group is allowed to use.
+ * @in.compute_mask: Mask of compute endpoints the group is allowed to use.
+ * @in.cs_min: Minimum number of CSs required.
+ * @in.priority: Queue group's priority within a process.
+ * @in.tiler_max: Maximum number of tiler endpoints the group is allowed
+ * to use.
+ * @in.fragment_max: Maximum number of fragment endpoints the group is
+ * allowed to use.
+ * @in.compute_max: Maximum number of compute endpoints the group is allowed
+ * to use.
+ * @in.padding: Currently unused, must be zero
+ * @out: Output parameters
+ * @out.group_handle: Handle of a newly created queue group.
+ * @out.padding: Currently unused, must be zero
+ * @out.group_uid: UID of the queue group available to base.
+ */
+union kbase_ioctl_cs_queue_group_create_1_6 {
+ struct {
+ __u64 tiler_mask;
+ __u64 fragment_mask;
+ __u64 compute_mask;
+ __u8 cs_min;
+ __u8 priority;
+ __u8 tiler_max;
+ __u8 fragment_max;
+ __u8 compute_max;
+ __u8 padding[3];
+
+ } in;
+ struct {
+ __u8 group_handle;
+ __u8 padding[3];
+ __u32 group_uid;
+ } out;
+};
+
+#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6 \
+ _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create_1_6)
+
+/**
* union kbase_ioctl_cs_queue_group_create - Create a GPU command queue group
* @in: Input parameters
* @in.tiler_mask: Mask of tiler endpoints the group is allowed to use.
@@ -209,7 +257,7 @@ union kbase_ioctl_cs_queue_group_create {
__u8 fragment_max;
__u8 compute_max;
__u8 padding[3];
-
+ __u64 reserved;
} in;
struct {
__u8 group_handle;
@@ -218,8 +266,8 @@ union kbase_ioctl_cs_queue_group_create {
} out;
};
-#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \
- _IOWR(KBASE_IOCTL_TYPE, 42, union kbase_ioctl_cs_queue_group_create)
+#define KBASE_IOCTL_CS_QUEUE_GROUP_CREATE \
+ _IOWR(KBASE_IOCTL_TYPE, 58, union kbase_ioctl_cs_queue_group_create)
/**
* struct kbase_ioctl_cs_queue_group_term - Terminate a GPU command queue group
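[Editor's note] A hedged user-space sketch (not part of the patch) of calling the renumbered KBASE_IOCTL_CS_QUEUE_GROUP_CREATE defined above, with the reserved field introduced in interface version 1.7; the device file descriptor and endpoint masks are hypothetical and error handling is reduced to a single check:

    #include <string.h>
    #include <sys/ioctl.h>

    /* 'fd' is assumed to be an already set-up kbase device fd */
    static int create_queue_group(int fd, __u8 *handle)
    {
        union kbase_ioctl_cs_queue_group_create create;

        memset(&create, 0, sizeof(create));
        create.in.tiler_mask    = 1;   /* hypothetical endpoint masks */
        create.in.fragment_mask = 1;
        create.in.compute_mask  = 1;
        create.in.cs_min        = 1;
        create.in.priority      = 0;
        create.in.tiler_max     = 1;
        create.in.fragment_max  = 1;
        create.in.compute_max   = 1;
        create.in.reserved      = 0;   /* new in 1.7, must be zero */

        if (ioctl(fd, KBASE_IOCTL_CS_QUEUE_GROUP_CREATE, &create) < 0)
            return -1;

        *handle = create.out.group_handle;
        return 0;
    }

Older clients built against interface 1.6 keep using the _1_6 variant of the union at ioctl number 42, which is why the new layout moves to ioctl number 58.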
diff --git a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h
index 4001a4c..b1720ed 100644
--- a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h
+++ b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h
@@ -250,6 +250,17 @@
GPU_COMMAND_FLUSH_PAYLOAD_LSC_CLEAN_INVALIDATE | \
GPU_COMMAND_FLUSH_PAYLOAD_OTHER_NONE))
+/* Clean and invalidate L2, LSC, and Other caches */
+#define GPU_COMMAND_CACHE_CLN_INV_FULL \
+ GPU_COMMAND_CODE_PAYLOAD( \
+ GPU_COMMAND_CODE_FLUSH_CACHES, \
+ (GPU_COMMAND_FLUSH_PAYLOAD_L2_CLEAN_INVALIDATE | \
+ GPU_COMMAND_FLUSH_PAYLOAD_LSC_CLEAN_INVALIDATE | \
+ GPU_COMMAND_FLUSH_PAYLOAD_OTHER_INVALIDATE))
+
+/* Merge cache flush commands */
+#define GPU_COMMAND_FLUSH_CACHE_MERGE(cmd1, cmd2) ((cmd1) | (cmd2))
+
/* Places the GPU in protected mode */
#define GPU_COMMAND_SET_PROTECTED_MODE \
GPU_COMMAND_CODE_PAYLOAD(GPU_COMMAND_CODE_SET_PROTECTED_MODE, 0)
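[Editor's note] A hedged fragment (not part of the patch) illustrating the CSF merge semantics: each cache's flush mode sits in its own payload field of a FLUSH_CACHES command, so a bitwise OR produces a command at least as strong as either input, assuming (as the OR-based merge implies) that the *_NONE payload encodings are zero:

    u32 flush_cmd = GPU_COMMAND_FLUSH_CACHE_MERGE(
            GPU_COMMAND_CACHE_CLN_INV_L2_LSC,  /* clean+inv L2 and LSC */
            GPU_COMMAND_CACHE_CLN_INV_FULL);   /* also invalidates Other */

    /* flush_cmd requests the union of both operations, i.e. the same
     * effect as GPU_COMMAND_CACHE_CLN_INV_FULL in this case.
     */

The JM counterpart later in this patch instead merges by picking the numerically larger command, since JM flush commands are single opcodes rather than payload bitmasks.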
diff --git a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h
index dcadcc7..ecf812c 100644
--- a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h
+++ b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h
@@ -175,6 +175,7 @@
/* Possible values of JS_CONFIG and JS_CONFIG_NEXT registers */
#define JS_CONFIG_START_FLUSH_NO_ACTION (0u << 0)
#define JS_CONFIG_START_FLUSH_CLEAN (1u << 8)
+#define JS_CONFIG_START_FLUSH_INV_SHADER_OTHER (2u << 8)
#define JS_CONFIG_START_FLUSH_CLEAN_INVALIDATE (3u << 8)
#define JS_CONFIG_START_MMU (1u << 10)
#define JS_CONFIG_JOB_CHAIN_FLAG (1u << 11)
@@ -264,6 +265,11 @@
/* GPU_COMMAND cache flush alias to CSF command payload */
#define GPU_COMMAND_CACHE_CLN_INV_L2 GPU_COMMAND_CLEAN_INV_CACHES
#define GPU_COMMAND_CACHE_CLN_INV_L2_LSC GPU_COMMAND_CLEAN_INV_CACHES
+#define GPU_COMMAND_CACHE_CLN_INV_FULL GPU_COMMAND_CLEAN_INV_CACHES
+
+/* Merge cache flush commands */
+#define GPU_COMMAND_FLUSH_CACHE_MERGE(cmd1, cmd2) \
+ ((cmd1) > (cmd2) ? (cmd1) : (cmd2))
/* IRQ flags */
#define GPU_FAULT (1 << 0) /* A GPU Fault has occurred */
diff --git a/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h b/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h
index 2598e20..d957dea 100644
--- a/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h
+++ b/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h
@@ -121,9 +121,11 @@
* - Added ioctl 55: set_limited_core_count.
* 11.32:
* - Added new HW performance counters interface to all GPUs.
+ * 11.33:
+ * - Removed Kernel legacy HWC interface
*/
#define BASE_UK_VERSION_MAJOR 11
-#define BASE_UK_VERSION_MINOR 31
+#define BASE_UK_VERSION_MINOR 33
/**
* struct kbase_ioctl_version_check - Check version compatibility between
diff --git a/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h b/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h
index 15843ee..2cdd29c 100644
--- a/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h
+++ b/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h
@@ -126,6 +126,7 @@ enum prfcnt_list_type {
#define PRFCNT_REQUEST_TYPE_MODE FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_REQUEST, 0)
#define PRFCNT_REQUEST_TYPE_ENABLE FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_REQUEST, 1)
+#define PRFCNT_REQUEST_TYPE_SCOPE FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_REQUEST, 2)
#define PRFCNT_SAMPLE_META_TYPE_SAMPLE \
FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_SAMPLE_META, 0)
@@ -150,6 +151,7 @@ struct prfcnt_item_header {
* @PRFCNT_BLOCK_TYPE_TILER: Tiler.
* @PRFCNT_BLOCK_TYPE_MEMORY: Memory System.
* @PRFCNT_BLOCK_TYPE_SHADER_CORE: Shader Core.
+ * @PRFCNT_BLOCK_TYPE_RESERVED: Reserved.
*/
enum prfcnt_block_type {
PRFCNT_BLOCK_TYPE_FE,
@@ -160,10 +162,11 @@ enum prfcnt_block_type {
};
/**
- * enum prfcnt_block_set - Type of performance counter block set.
+ * enum prfcnt_set - Type of performance counter block set.
* @PRFCNT_SET_PRIMARY: Primary.
* @PRFCNT_SET_SECONDARY: Secondary.
* @PRFCNT_SET_TERTIARY: Tertiary.
+ * @PRFCNT_SET_RESERVED: Reserved.
*/
enum prfcnt_set {
PRFCNT_SET_PRIMARY,
@@ -176,19 +179,19 @@ enum prfcnt_set {
* struct prfcnt_enum_block_counter - Performance counter block descriptor.
* @block_type: Type of performance counter block.
* @set: Which SET this represents: primary, secondary or tertiary.
+ * @pad: Padding bytes.
* @num_instances: How many instances of this block type exist in the hardware.
* @num_values: How many entries in the values array there are for samples
* from this block.
- * @pad: Padding bytes.
* @counter_mask: Bitmask that indicates the availability of counters in this
* block.
*/
struct prfcnt_enum_block_counter {
__u8 block_type;
__u8 set;
- __u8 num_instances;
- __u8 num_values;
- __u8 pad[4];
+ __u8 pad[2];
+ __u16 num_instances;
+ __u16 num_values;
__u64 counter_mask[2];
};
@@ -206,12 +209,14 @@ struct prfcnt_enum_request {
/**
* struct prfcnt_enum_item - Performance counter enumeration item.
- * @hdr: Header describing the type of item in the list.
- * @block_counter: Performance counter block descriptor.
- * @request: Request descriptor.
+ * @hdr: Header describing the type of item in the list.
+ * @u: Structure containing descriptor for enumeration item type.
+ * @u.block_counter: Performance counter block descriptor.
+ * @u.request: Request descriptor.
*/
struct prfcnt_enum_item {
struct prfcnt_item_header hdr;
+ /** union u - union of block_counter and request */
union {
struct prfcnt_enum_block_counter block_counter;
struct prfcnt_enum_request request;
@@ -222,6 +227,7 @@ struct prfcnt_enum_item {
* enum prfcnt_mode - Capture mode for counter sampling.
* @PRFCNT_MODE_MANUAL: Manual sampling mode.
* @PRFCNT_MODE_PERIODIC: Periodic sampling mode.
+ * @PRFCNT_MODE_RESERVED: Reserved.
*/
enum prfcnt_mode {
PRFCNT_MODE_MANUAL,
@@ -231,16 +237,19 @@ enum prfcnt_mode {
/**
* struct prfcnt_request_mode - Mode request descriptor.
- * @mode: Capture mode for the session, either manual or periodic.
- * @pad: Padding bytes.
- * @period_us: Period in microseconds, for periodic mode.
+ * @mode: Capture mode for the session, either manual or periodic.
+ * @pad: Padding bytes.
+ * @mode_config: Structure containing configuration for periodic mode.
+ * @mode_config.period: Periodic config.
+ * @mode_config.period.period_ns: Period in nanoseconds, for periodic mode.
*/
struct prfcnt_request_mode {
__u8 mode;
__u8 pad[7];
+ /** union mode_config - request mode configuration*/
union {
struct {
- __u64 period_us;
+ __u64 period_ns;
} periodic;
} mode_config;
};
@@ -261,16 +270,40 @@ struct prfcnt_request_enable {
};
/**
+ * enum prfcnt_scope - Scope of performance counters.
+ * @PRFCNT_SCOPE_GLOBAL: Global scope.
+ * @PRFCNT_SCOPE_RESERVED: Reserved.
+ */
+enum prfcnt_scope {
+ PRFCNT_SCOPE_GLOBAL,
+ PRFCNT_SCOPE_RESERVED = 255,
+};
+
+/**
+ * struct prfcnt_request_scope - Scope request descriptor.
+ * @scope: Scope of the performance counters to capture.
+ * @pad: Padding bytes.
+ */
+struct prfcnt_request_scope {
+ __u8 scope;
+ __u8 pad[7];
+};
+
+/**
* struct prfcnt_request_item - Performance counter request item.
- * @hdr: Header describing the type of item in the list.
- * @req_mode: Mode request descriptor.
- * @req_enable: Enable request descriptor.
+ * @hdr: Header describing the type of item in the list.
+ * @u: Structure containing descriptor for request type.
+ * @u.req_mode: Mode request descriptor.
+ * @u.req_enable: Enable request descriptor.
+ * @u.req_scope: Scope request descriptor.
*/
struct prfcnt_request_item {
struct prfcnt_item_header hdr;
+ /** union u - union of req_mode, req_enable and req_scope */
union {
struct prfcnt_request_mode req_mode;
struct prfcnt_request_enable req_enable;
+ struct prfcnt_request_scope req_scope;
} u;
};
@@ -278,12 +311,19 @@ struct prfcnt_request_item {
* enum prfcnt_request_type - Type of request descriptor.
* @PRFCNT_REQUEST_MODE: Specify the capture mode to be used for the session.
* @PRFCNT_REQUEST_ENABLE: Specify which performance counters to capture.
+ * @PRFCNT_REQUEST_SCOPE: Specify the scope of the performance counters.
*/
enum prfcnt_request_type {
PRFCNT_REQUEST_MODE,
PRFCNT_REQUEST_ENABLE,
+ PRFCNT_REQUEST_SCOPE,
};
+/* This sample contains overflows from dump duration stretch because the sample buffer was full */
+#define SAMPLE_FLAG_OVERFLOW (1u << 0)
+/* This sample has had an error condition for sample duration */
+#define SAMPLE_FLAG_ERROR (1u << 30)
+
/**
* struct prfcnt_sample_metadata - Metadata for counter sample data.
* @timestamp_start: Earliest timestamp that values in this sample represent.
@@ -292,6 +332,7 @@ enum prfcnt_request_type {
* GET_SAMPLE.
* @user_data: User data provided to HWC_CMD_START or HWC_CMD_SAMPLE_*
* @flags: Property flags.
+ * @pad: Padding bytes.
*/
struct prfcnt_sample_metadata {
__u64 timestamp_start;
@@ -302,18 +343,25 @@ struct prfcnt_sample_metadata {
__u32 pad;
};
+/* Maximum number of domains a metadata for clock cycles can refer to */
+#define MAX_REPORTED_DOMAINS (4)
+
/**
* struct prfcnt_clock_metadata - Metadata for clock cycles.
* @num_domains: Number of domains this metadata refers to.
+ * @pad: Padding bytes.
* @cycles: Number of cycles elapsed in each counter domain between
- * timestamp_start and timestamp_end.
+ * timestamp_start and timestamp_end. Valid only for the
+ * first @p num_domains.
*/
struct prfcnt_clock_metadata {
__u32 num_domains;
__u32 pad;
- __u64 *cycles;
+ __u64 cycles[MAX_REPORTED_DOMAINS];
};
+/* This block state is unknown */
+#define BLOCK_STATE_UNKNOWN (0)
/* This block was powered on for at least some portion of the sample */
#define BLOCK_STATE_ON (1 << 0)
/* This block was powered off for at least some portion of the sample */
@@ -336,10 +384,12 @@ struct prfcnt_clock_metadata {
* @block_type: Type of performance counter block.
* @block_idx: Index of performance counter block.
* @set: Set of performance counter block.
+ * @pad_u8: Padding bytes.
* @block_state: Bits set indicate the states which the block is known
* to have operated in during this sample.
* @values_offset: Offset from the start of the mmapped region, to the values
* for this block. The values themselves are an array of __u64.
+ * @pad_u32: Padding bytes.
*/
struct prfcnt_block_metadata {
__u8 block_type;
@@ -351,6 +401,14 @@ struct prfcnt_block_metadata {
__u32 pad_u32;
};
+/**
+ * struct prfcnt_metadata - Performance counter metadata item.
+ * @hdr: Header describing the type of item in the list.
+ * @u: Structure containing descriptor for metadata type.
+ * @u.sample_md: Counter sample data metadata descriptor.
+ * @u.clock_md: Clock cycles metadata descriptor.
+ * @u.block_md: Counter block metadata descriptor.
+ */
struct prfcnt_metadata {
struct prfcnt_item_header hdr;
union {
@@ -360,5 +418,67 @@ struct prfcnt_metadata {
} u;
};
+/**
+ * enum prfcnt_control_cmd_code - Control command code for client session.
+ * @PRFCNT_CONTROL_CMD_START: Start the counter data dump run for
+ * the calling client session.
+ * @PRFCNT_CONTROL_CMD_STOP: Stop the counter data dump run for the
+ * calling client session.
+ * @PRFCNT_CONTROL_CMD_SAMPLE_SYNC: Trigger a synchronous manual sample.
+ * @PRFCNT_CONTROL_CMD_SAMPLE_ASYNC: Trigger an asynchronous manual sample.
+ * @PRFCNT_CONTROL_CMD_DISCARD: Discard all samples which have not yet
+ * been consumed by userspace. Note that
+ * this can race with new samples if
+ * HWC_CMD_STOP is not called first.
+ */
+enum prfcnt_control_cmd_code {
+ PRFCNT_CONTROL_CMD_START = 1,
+ PRFCNT_CONTROL_CMD_STOP,
+ PRFCNT_CONTROL_CMD_SAMPLE_SYNC,
+ PRFCNT_CONTROL_CMD_SAMPLE_ASYNC,
+ PRFCNT_CONTROL_CMD_DISCARD,
+};
+
+/** struct prfcnt_control_cmd - Control command
+ * @cmd: Control command for the session.
+ * @pad: Padding bytes.
+ * @user_data: Pointer to user data, which will be returned as part of
+ * sample metadata. It only affects a single sample if used
+ * with CMD_SAMPLE_SYNC or CMD_SAMPLE_ASYNC. It affects all
+ * samples between CMD_START and CMD_STOP if used with the
+ * periodic sampling.
+ */
+struct prfcnt_control_cmd {
+ __u16 cmd;
+ __u16 pad[3];
+ __u64 user_data;
+};
+
+/** struct prfcnt_sample_access - Metadata to access a sample.
+ * @sequence: Sequence number for the sample.
+ * For GET_SAMPLE, it will be set by the kernel.
+ * For PUT_SAMPLE, it shall be equal to the same value
+ * provided by the kernel for GET_SAMPLE.
+ * @sample_offset_bytes: Offset from the start of the mapped area to the first
+ * entry in the metadata list (sample_metadata) for this
+ * sample.
+ */
+struct prfcnt_sample_access {
+ __u64 sequence;
+ __u64 sample_offset_bytes;
+};
+
+/* The ids of ioctl commands, on a reader file descriptor, magic number */
+#define KBASE_KINSTR_PRFCNT_READER 0xBF
+/* Ioctl ID for issuing a session operational command */
+#define KBASE_IOCTL_KINSTR_PRFCNT_CMD \
+ _IOW(KBASE_KINSTR_PRFCNT_READER, 0x00, struct prfcnt_control_cmd)
+/* Ioctl ID for fetching a dumped sample */
+#define KBASE_IOCTL_KINSTR_PRFCNT_GET_SAMPLE \
+ _IOR(KBASE_KINSTR_PRFCNT_READER, 0x01, struct prfcnt_sample_access)
+/* Ioctl ID for releasing the internal buffer of the previously fetched sample */
+#define KBASE_IOCTL_KINSTR_PRFCNT_PUT_SAMPLE \
+ _IOW(KBASE_KINSTR_PRFCNT_READER, 0x10, struct prfcnt_sample_access)
+
#endif /* _UAPI_KBASE_HWCNT_READER_H_ */
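[Editor's note] A hedged user-space sketch (not part of the patch) of the new reader-fd command flow added above; 'fd' (the kinstr_prfcnt reader file descriptor) and 'mapping' (its mmapped sample area) are assumed to have been set up elsewhere, manual sampling mode is assumed, and error handling is minimal:

    #include <sys/ioctl.h>

    static int take_one_sample(int fd, void *mapping)
    {
        struct prfcnt_control_cmd cmd = { .cmd = PRFCNT_CONTROL_CMD_START };
        struct prfcnt_sample_access sample = { 0 };
        struct prfcnt_metadata *md;

        if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_CMD, &cmd) < 0)
            return -1;

        /* Trigger a manual sample and wait for it to complete */
        cmd.cmd = PRFCNT_CONTROL_CMD_SAMPLE_SYNC;
        if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_CMD, &cmd) < 0)
            return -1;

        if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_GET_SAMPLE, &sample) < 0)
            return -1;

        /* Metadata items for this sample start at the returned offset */
        md = (struct prfcnt_metadata *)((char *)mapping +
                                        sample.sample_offset_bytes);
        (void)md; /* ... walk the item list via md->hdr ... */

        /* Hand the sample buffer back once its values are consumed */
        if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_PUT_SAMPLE, &sample) < 0)
            return -1;

        cmd.cmd = PRFCNT_CONTROL_CMD_STOP;
        return ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_CMD, &cmd) < 0 ? -1 : 0;
    }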
diff --git a/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h b/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h
index 8e1ed55..63dd3c8 100644
--- a/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h
+++ b/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h
@@ -171,34 +171,6 @@ struct kbase_ioctl_hwcnt_reader_setup {
_IOW(KBASE_IOCTL_TYPE, 8, struct kbase_ioctl_hwcnt_reader_setup)
/**
- * struct kbase_ioctl_hwcnt_enable - Enable hardware counter collection
- * @dump_buffer: GPU address to write counters to
- * @fe_bm: counters selection bitmask (Front end)
- * @shader_bm: counters selection bitmask (Shader)
- * @tiler_bm: counters selection bitmask (Tiler)
- * @mmu_l2_bm: counters selection bitmask (MMU_L2)
- */
-struct kbase_ioctl_hwcnt_enable {
- __u64 dump_buffer;
- __u32 fe_bm;
- __u32 shader_bm;
- __u32 tiler_bm;
- __u32 mmu_l2_bm;
-};
-
-/* This IOCTL is deprecated as of R33, and will be removed in R35. */
-#define KBASE_IOCTL_HWCNT_ENABLE \
- _IOW(KBASE_IOCTL_TYPE, 9, struct kbase_ioctl_hwcnt_enable)
-
-/* This IOCTL is deprecated as of R33, and will be removed in R35. */
-#define KBASE_IOCTL_HWCNT_DUMP \
- _IO(KBASE_IOCTL_TYPE, 10)
-
-/* This IOCTL is deprecated as of R33, and will be removed in R35. */
-#define KBASE_IOCTL_HWCNT_CLEAR \
- _IO(KBASE_IOCTL_TYPE, 11)
-
-/**
* struct kbase_ioctl_hwcnt_values - Values to set dummy the dummy counters to.
* @data: Counter samples for the dummy model.
* @size: Size of the counter sample data.
diff --git a/mali_kbase/Kbuild b/mali_kbase/Kbuild
index e253f1c..afc0f83 100644
--- a/mali_kbase/Kbuild
+++ b/mali_kbase/Kbuild
@@ -71,7 +71,7 @@ endif
#
# Driver version string which is returned to userspace via an ioctl
-MALI_RELEASE_NAME ?= '"r34p0-00dev1"'
+MALI_RELEASE_NAME ?= '"r35p0-01eac0"'
# Set up defaults if not defined by build system
ifeq ($(CONFIG_MALI_DEBUG), y)
MALI_UNIT_TEST = 1
@@ -82,8 +82,6 @@ else
endif
MALI_COVERAGE ?= 0
-CONFIG_MALI_PLATFORM_NAME ?= "devicetree"
-
# Kconfig passes in the name with quotes for in-tree builds - remove them.
MALI_PLATFORM_DIR := $(shell echo $(CONFIG_MALI_PLATFORM_NAME))
@@ -122,7 +120,6 @@ ccflags-y = \
-DMALI_RELEASE_NAME=$(MALI_RELEASE_NAME) \
-DMALI_JIT_PRESSURE_LIMIT_BASE=$(MALI_JIT_PRESSURE_LIMIT_BASE) \
-DMALI_INCREMENTAL_RENDERING=$(MALI_INCREMENTAL_RENDERING) \
- -DMALI_KBASE_BUILD \
-DMALI_PLATFORM_DIR=$(MALI_PLATFORM_DIR)
@@ -166,7 +163,6 @@ mali_kbase-y := \
mali_kbase_hwcnt.o \
mali_kbase_hwcnt_gpu.o \
mali_kbase_hwcnt_gpu_narrow.o \
- mali_kbase_hwcnt_legacy.o \
mali_kbase_hwcnt_types.o \
mali_kbase_hwcnt_virtualizer.o \
mali_kbase_softjobs.o \
@@ -206,6 +202,7 @@ mali_kbase-$(CONFIG_SYNC_FILE) += \
ifeq ($(CONFIG_MALI_CSF_SUPPORT),y)
mali_kbase-y += \
mali_kbase_hwcnt_backend_csf.o \
+ mali_kbase_hwcnt_watchdog_if_timer.o \
mali_kbase_hwcnt_backend_csf_if_fw.o
else
mali_kbase-y += \
diff --git a/mali_kbase/Makefile b/mali_kbase/Makefile
index 099da33..850b257 100644
--- a/mali_kbase/Makefile
+++ b/mali_kbase/Makefile
@@ -34,10 +34,19 @@ endif
CONFIG_MALI_MIDGARD ?= m
ifeq ($(CONFIG_MALI_MIDGARD),m)
+ CONFIG_MALI_PLATFORM_NAME ?= "devicetree"
CONFIG_MALI_GATOR_SUPPORT ?= y
CONFIG_MALI_ARBITRATION ?= n
CONFIG_MALI_PARTITION_MANAGER ?= n
+ ifeq ($(origin CONFIG_MALI_ARBITER_MODULES), undefined)
+ CONFIG_MALI_ARBITER_MODULES := $(CONFIG_MALI_ARBITRATION)
+ endif
+
+ ifeq ($(origin CONFIG_MALI_GPU_POWER_MODULES), undefined)
+ CONFIG_MALI_GPU_POWER_MODULES := $(CONFIG_MALI_ARBITRATION)
+ endif
+
ifneq ($(CONFIG_MALI_NO_MALI),y)
# Prevent misuse when CONFIG_MALI_NO_MALI=y
CONFIG_MALI_REAL_HW ?= y
@@ -135,6 +144,8 @@ ifeq ($(CONFIG_MALI_MIDGARD),m)
else
# Prevent misuse when CONFIG_MALI_MIDGARD=n
CONFIG_MALI_ARBITRATION = n
+ CONFIG_MALI_ARBITER_MODULES = n
+ CONFIG_MALI_GPU_POWER_MODULES = n
CONFIG_MALI_KUTF = n
CONFIG_MALI_KUTF_IRQ_TEST = n
CONFIG_MALI_KUTF_CLK_RATE_TRACE = n
@@ -148,6 +159,8 @@ CONFIGS := \
CONFIG_MALI_DMA_FENCE \
CONFIG_MALI_ARBITER_SUPPORT \
CONFIG_MALI_ARBITRATION \
+ CONFIG_MALI_ARBITER_MODULES \
+ CONFIG_MALI_GPU_POWER_MODULES \
CONFIG_MALI_PARTITION_MANAGER \
CONFIG_MALI_REAL_HW \
CONFIG_MALI_GEM5_BUILD \
@@ -191,6 +204,8 @@ MAKE_ARGS := $(foreach config,$(CONFIGS), \
$(value config)=$(value $(value config)), \
$(value config)=n))
+MAKE_ARGS += CONFIG_MALI_PLATFORM_NAME=$(CONFIG_MALI_PLATFORM_NAME)
+
#
# EXTRA_CFLAGS to define the custom CONFIGs on out-of-tree build
#
@@ -201,6 +216,8 @@ EXTRA_CFLAGS := $(foreach config,$(CONFIGS), \
$(if $(filter y m,$(value $(value config))), \
-D$(value config)=1))
+EXTRA_CFLAGS += -DCONFIG_MALI_PLATFORM_NAME=$(CONFIG_MALI_PLATFORM_NAME)
+
#
# KBUILD_EXTRA_SYMBOLS to prevent warnings about unknown functions
#
diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_pm.c b/mali_kbase/arbiter/mali_kbase_arbiter_pm.c
index 62ff4fd..5425f2b 100644
--- a/mali_kbase/arbiter/mali_kbase_arbiter_pm.c
+++ b/mali_kbase/arbiter/mali_kbase_arbiter_pm.c
@@ -1053,8 +1053,8 @@ void kbase_arbiter_pm_update_gpu_freq(struct kbase_arbiter_freq *arb_freq,
mutex_lock(&arb_freq->arb_freq_lock);
if (arb_freq->arb_freq != freq) {
- ndata.new_rate = freq * KHZ_TO_HZ;
- ndata.old_rate = arb_freq->arb_freq * KHZ_TO_HZ;
+ ndata.new_rate = (unsigned long)freq * KHZ_TO_HZ;
+ ndata.old_rate = (unsigned long)arb_freq->arb_freq * KHZ_TO_HZ;
ndata.gpu_clk_handle = arb_freq;
arb_freq->arb_freq = freq;
arb_freq->freq_updated = true;
diff --git a/mali_kbase/arbitration/Kconfig b/mali_kbase/arbitration/Kconfig
index b4d6202..1935c81 100644
--- a/mali_kbase/arbitration/Kconfig
+++ b/mali_kbase/arbitration/Kconfig
@@ -27,5 +27,23 @@ config MALI_XEN
virtualization setup for Mali
If unsure, say N.
+config MALI_ARBITER_MODULES
+ tristate "Enable mali arbiter modules"
+ depends on MALI_ARBITRATION
+ default y
+ help
+ Enables the build of the arbiter modules used in the reference
+ virtualization setup for Mali
+ If unsure, say N
+
+config MALI_GPU_POWER_MODULES
+ tristate "Enable gpu power modules"
+ depends on MALI_ARBITRATION
+ default y
+ help
+ Enables the build of the gpu power modules used in the reference
+ virtualization setup for Mali
+ If unsure, say N
+
source "drivers/gpu/arm/midgard/arbitration/ptm/Kconfig"
diff --git a/mali_kbase/backend/gpu/Kbuild b/mali_kbase/backend/gpu/Kbuild
index 5dbcff3..90bf6cd 100644
--- a/mali_kbase/backend/gpu/Kbuild
+++ b/mali_kbase/backend/gpu/Kbuild
@@ -47,3 +47,8 @@ endif
mali_kbase-$(CONFIG_MALI_DEVFREQ) += \
backend/gpu/mali_kbase_devfreq.o
+# Dummy model
+mali_kbase-$(CONFIG_MALI_NO_MALI) += backend/gpu/mali_kbase_model_dummy.o
+mali_kbase-$(CONFIG_MALI_NO_MALI) += backend/gpu/mali_kbase_model_linux.o
+# HW error simulation
+mali_kbase-$(CONFIG_MALI_NO_MALI) += backend/gpu/mali_kbase_model_error_generator.o
diff --git a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h
index df30b63..a6ee959 100644
--- a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h
+++ b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h
@@ -64,13 +64,12 @@ int kbase_clk_rate_trace_manager_init(struct kbase_device *kbdev);
* kbase_init_lowest_gpu_freq() - Find the lowest frequency that the GPU can
* run as using the device tree, and save this
* within kbdev.
+ * @kbdev: Pointer to kbase device.
*
* This function could be called from kbase_clk_rate_trace_manager_init,
* but is left separate as it can be called as soon as
* dev_pm_opp_of_add_table() has been called to initialize the OPP table.
*
- * @kbdev: Pointer to kbase device.
- *
* Return: 0 in any case.
*/
int kbase_lowest_gpu_freq_init(struct kbase_device *kbdev);
diff --git a/mali_kbase/backend/gpu/mali_kbase_devfreq.c b/mali_kbase/backend/gpu/mali_kbase_devfreq.c
index b117e57..a7110b3 100644
--- a/mali_kbase/backend/gpu/mali_kbase_devfreq.c
+++ b/mali_kbase/backend/gpu/mali_kbase_devfreq.c
@@ -43,7 +43,7 @@
* This function will be called only when the opp table which is compatible with
* "operating-points-v2-mali", is not present in the devicetree for GPU device.
*
- * Return: Voltage value in milli volts, 0 in case of error.
+ * Return: Voltage value in micro volts, 0 in case of error.
*/
static unsigned long get_voltage(struct kbase_device *kbdev, unsigned long freq)
{
@@ -69,8 +69,8 @@ static unsigned long get_voltage(struct kbase_device *kbdev, unsigned long freq)
rcu_read_unlock();
#endif
- /* Return the voltage in milli volts */
- return voltage / 1000;
+ /* Return the voltage in micro volts */
+ return voltage;
}
void kbase_devfreq_opp_translate(struct kbase_device *kbdev, unsigned long freq,
@@ -116,6 +116,9 @@ kbase_devfreq_target(struct device *dev, unsigned long *target_freq, u32 flags)
struct dev_pm_opp *opp;
unsigned long nominal_freq;
unsigned long freqs[BASE_MAX_NR_CLOCKS_REGULATORS] = {0};
+#if IS_ENABLED(CONFIG_REGULATOR)
+ unsigned long original_freqs[BASE_MAX_NR_CLOCKS_REGULATORS] = {0};
+#endif
unsigned long volts[BASE_MAX_NR_CLOCKS_REGULATORS] = {0};
unsigned int i;
u64 core_mask;
@@ -187,6 +190,9 @@ kbase_devfreq_target(struct device *dev, unsigned long *target_freq, u32 flags)
err = clk_set_rate(kbdev->clocks[i], freqs[i]);
if (!err) {
+#if IS_ENABLED(CONFIG_REGULATOR)
+ original_freqs[i] = kbdev->current_freqs[i];
+#endif
kbdev->current_freqs[i] = freqs[i];
} else {
dev_err(dev, "Failed to set clock %lu (target %lu)\n",
@@ -200,7 +206,7 @@ kbase_devfreq_target(struct device *dev, unsigned long *target_freq, u32 flags)
for (i = 0; i < kbdev->nr_clocks; i++) {
if (kbdev->regulators[i] &&
kbdev->current_voltages[i] != volts[i] &&
- kbdev->current_freqs[i] > freqs[i]) {
+ original_freqs[i] > freqs[i]) {
int err;
err = regulator_set_voltage(kbdev->regulators[i],
diff --git a/mali_kbase/backend/gpu/mali_kbase_devfreq.h b/mali_kbase/backend/gpu/mali_kbase_devfreq.h
index 901827e..ac88b02 100644
--- a/mali_kbase/backend/gpu/mali_kbase_devfreq.h
+++ b/mali_kbase/backend/gpu/mali_kbase_devfreq.h
@@ -55,6 +55,7 @@ void kbase_devfreq_enqueue_work(struct kbase_device *kbdev,
* This function will only perform translation if an operating-points-v2-mali
* table is present in devicetree. If one is not present then it will return an
* untranslated frequency (and corresponding voltage) and all cores enabled.
+ * The voltages returned are in micro Volts (uV).
*/
void kbase_devfreq_opp_translate(struct kbase_device *kbdev, unsigned long freq,
u64 *core_mask, unsigned long *freqs, unsigned long *volts);
diff --git a/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c b/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c
index 7b04286..268a888 100644
--- a/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c
+++ b/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c
@@ -46,10 +46,10 @@ int kbase_backend_gpuprops_get(struct kbase_device *kbdev,
registers.core_features = kbase_reg_read(kbdev,
GPU_CONTROL_REG(CORE_FEATURES));
#else /* !MALI_USE_CSF */
- if (((registers.gpu_id & GPU_ID2_PRODUCT_MODEL) ==
- GPU_ID2_PRODUCT_TGRX) ||
- ((registers.gpu_id & GPU_ID2_PRODUCT_MODEL) ==
- GPU_ID2_PRODUCT_TVAX))
+ if (!(((registers.gpu_id & GPU_ID2_PRODUCT_MODEL) ==
+ GPU_ID2_PRODUCT_TDUX) ||
+ ((registers.gpu_id & GPU_ID2_PRODUCT_MODEL) ==
+ GPU_ID2_PRODUCT_TODX)))
registers.core_features =
kbase_reg_read(kbdev, GPU_CONTROL_REG(CORE_FEATURES));
#endif /* MALI_USE_CSF */
diff --git a/mali_kbase/backend/gpu/mali_kbase_instr_backend.c b/mali_kbase/backend/gpu/mali_kbase_instr_backend.c
index 90cc537..1691a87 100644
--- a/mali_kbase/backend/gpu/mali_kbase_instr_backend.c
+++ b/mali_kbase/backend/gpu/mali_kbase_instr_backend.c
@@ -119,29 +119,62 @@ int kbase_instr_hwcnt_enable_internal(struct kbase_device *kbdev,
return err;
}
+static void kbasep_instr_hwc_disable_hw_prfcnt(struct kbase_device *kbdev)
+{
+ u32 irq_mask;
+
+ lockdep_assert_held(&kbdev->hwaccess_lock);
+ lockdep_assert_held(&kbdev->hwcnt.lock);
+
+ if (kbase_is_gpu_removed(kbdev))
+ /* GPU has been removed by Arbiter */
+ return;
+
+ /* Disable interrupt */
+ irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK));
+
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), irq_mask & ~PRFCNT_SAMPLE_COMPLETED);
+
+ /* Disable the counters */
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(PRFCNT_CONFIG), 0);
+
+ kbdev->hwcnt.kctx = NULL;
+ kbdev->hwcnt.addr = 0ULL;
+ kbdev->hwcnt.addr_bytes = 0ULL;
+}
+
int kbase_instr_hwcnt_disable_internal(struct kbase_context *kctx)
{
unsigned long flags, pm_flags;
int err = -EINVAL;
- u32 irq_mask;
struct kbase_device *kbdev = kctx->kbdev;
while (1) {
spin_lock_irqsave(&kbdev->hwaccess_lock, pm_flags);
spin_lock_irqsave(&kbdev->hwcnt.lock, flags);
+ if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_UNRECOVERABLE_ERROR) {
+ /* Instrumentation is in unrecoverable error state,
+ * there is nothing for us to do.
+ */
+ spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags);
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags);
+ /* Already disabled, return no error. */
+ return 0;
+ }
+
if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_DISABLED) {
/* Instrumentation is not enabled */
spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags);
spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags);
- goto out;
+ return err;
}
if (kbdev->hwcnt.kctx != kctx) {
/* Instrumentation has been setup for another context */
spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags);
spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags);
- goto out;
+ return err;
}
if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_IDLE)
@@ -158,25 +191,7 @@ int kbase_instr_hwcnt_disable_internal(struct kbase_context *kctx)
kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_DISABLED;
kbdev->hwcnt.backend.triggered = 0;
- if (kbase_is_gpu_removed(kbdev)) {
- /* GPU has been removed by Arbiter */
- spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags);
- spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags);
- err = 0;
- goto out;
- }
-
- /* Disable interrupt */
- irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK));
- kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK),
- irq_mask & ~PRFCNT_SAMPLE_COMPLETED);
-
- /* Disable the counters */
- kbase_reg_write(kbdev, GPU_CONTROL_REG(PRFCNT_CONFIG), 0);
-
- kbdev->hwcnt.kctx = NULL;
- kbdev->hwcnt.addr = 0ULL;
- kbdev->hwcnt.addr_bytes = 0ULL;
+ kbasep_instr_hwc_disable_hw_prfcnt(kbdev);
spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags);
spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags);
@@ -184,9 +199,7 @@ int kbase_instr_hwcnt_disable_internal(struct kbase_context *kctx)
dev_dbg(kbdev->dev, "HW counters dumping disabled for context %pK",
kctx);
- err = 0;
- out:
- return err;
+ return 0;
}
int kbase_instr_hwcnt_request_dump(struct kbase_context *kctx)
@@ -204,7 +217,7 @@ int kbase_instr_hwcnt_request_dump(struct kbase_context *kctx)
if (kbdev->hwcnt.backend.state != KBASE_INSTR_STATE_IDLE) {
/* HW counters are disabled or another dump is ongoing, or we're
- * resetting
+ * resetting, or we are in unrecoverable error state.
*/
goto unlock;
}
@@ -274,6 +287,10 @@ void kbase_instr_hwcnt_sample_done(struct kbase_device *kbdev)
spin_lock_irqsave(&kbdev->hwcnt.lock, flags);
+ /* If the state is in unrecoverable error, we already wake_up the waiter
+ * and don't need to do any action when sample is done.
+ */
+
if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_FAULT) {
kbdev->hwcnt.backend.triggered = 1;
wake_up(&kbdev->hwcnt.backend.wait);
@@ -302,6 +319,8 @@ int kbase_instr_hwcnt_wait_for_dump(struct kbase_context *kctx)
if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_FAULT) {
err = -EINVAL;
kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_IDLE;
+ } else if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_UNRECOVERABLE_ERROR) {
+ err = -EIO;
} else {
/* Dump done */
KBASE_DEBUG_ASSERT(kbdev->hwcnt.backend.state ==
@@ -322,8 +341,8 @@ int kbase_instr_hwcnt_clear(struct kbase_context *kctx)
spin_lock_irqsave(&kbdev->hwcnt.lock, flags);
- /* Check it's the context previously set up and we're not already
- * dumping
+ /* Check it's the context previously set up and we're not in IDLE
+ * state.
*/
if (kbdev->hwcnt.kctx != kctx || kbdev->hwcnt.backend.state !=
KBASE_INSTR_STATE_IDLE)
@@ -347,6 +366,48 @@ out:
}
KBASE_EXPORT_SYMBOL(kbase_instr_hwcnt_clear);
+void kbase_instr_hwcnt_on_unrecoverable_error(struct kbase_device *kbdev)
+{
+ unsigned long flags;
+
+ lockdep_assert_held(&kbdev->hwaccess_lock);
+
+ spin_lock_irqsave(&kbdev->hwcnt.lock, flags);
+
+ /* If we are already in the unrecoverable error state, return early. */
+ if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_UNRECOVERABLE_ERROR) {
+ spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags);
+ return;
+ }
+
+ kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_UNRECOVERABLE_ERROR;
+
+ /* Need to disable HW if it's not disabled yet. */
+ if (kbdev->hwcnt.backend.state != KBASE_INSTR_STATE_DISABLED)
+ kbasep_instr_hwc_disable_hw_prfcnt(kbdev);
+
+ /* Wake up any waiters. */
+ kbdev->hwcnt.backend.triggered = 1;
+ wake_up(&kbdev->hwcnt.backend.wait);
+
+ spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags);
+}
+KBASE_EXPORT_SYMBOL(kbase_instr_hwcnt_on_unrecoverable_error);
+
+void kbase_instr_hwcnt_on_before_reset(struct kbase_device *kbdev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&kbdev->hwcnt.lock, flags);
+
+ /* A reset is the only way to exit the unrecoverable error state */
+ if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_UNRECOVERABLE_ERROR)
+ kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_DISABLED;
+
+ spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags);
+}
+KBASE_EXPORT_SYMBOL(kbase_instr_hwcnt_on_before_reset);
+
int kbase_instr_backend_init(struct kbase_device *kbdev)
{
spin_lock_init(&kbdev->hwcnt.lock);
diff --git a/mali_kbase/backend/gpu/mali_kbase_instr_defs.h b/mali_kbase/backend/gpu/mali_kbase_instr_defs.h
index e356348..7190f42 100644
--- a/mali_kbase/backend/gpu/mali_kbase_instr_defs.h
+++ b/mali_kbase/backend/gpu/mali_kbase_instr_defs.h
@@ -38,8 +38,12 @@ enum kbase_instr_state {
KBASE_INSTR_STATE_IDLE,
/* Hardware is currently dumping a frame. */
KBASE_INSTR_STATE_DUMPING,
- /* An error has occured during DUMPING (page fault). */
- KBASE_INSTR_STATE_FAULT
+ /* An error has occurred during DUMPING (page fault). */
+ KBASE_INSTR_STATE_FAULT,
+ /* An unrecoverable error has occurred, a reset is the only way to exit
+ * from unrecoverable error state.
+ */
+ KBASE_INSTR_STATE_UNRECOVERABLE_ERROR,
};
/* Structure used for instrumentation and HW counters dumping */
diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_defs.h b/mali_kbase/backend/gpu/mali_kbase_jm_defs.h
index e29ace7..3ce3903 100644
--- a/mali_kbase/backend/gpu/mali_kbase_jm_defs.h
+++ b/mali_kbase/backend/gpu/mali_kbase_jm_defs.h
@@ -38,10 +38,31 @@ struct rb_entry {
struct kbase_jd_atom *katom;
};
+/* SLOT_RB_TAG_PURGED assumes a value that is different from
+ * NULL (SLOT_RB_NULL_TAG_VAL) and will not be the result of
+ * any valid pointer via macro translation: SLOT_RB_TAG_KCTX(x).
+ */
+#define SLOT_RB_TAG_PURGED ((u64)(1 << 1))
+#define SLOT_RB_NULL_TAG_VAL ((u64)0)
+
+/**
+ * SLOT_RB_TAG_KCTX() - a function-like macro for converting a pointer to a
+ * u64 for serving as tagged value.
+ */
+#define SLOT_RB_TAG_KCTX(kctx) (u64)((uintptr_t)(kctx))
/**
* struct slot_rb - Slot ringbuffer
* @entries: Ringbuffer entries
- * @last_context: The last context to submit a job on this slot
+ * @last_kctx_tagged: The last context that submitted a job to the slot's
+ * HEAD_NEXT register. The value is a tagged variant so
+ * must not be dereferenced. It is used in operation to
+ * track when shader core L1 caches might contain a
+ * previous context's data, and so must only be set to
+ * SLOT_RB_NULL_TAG_VAL after reset/powerdown of the
+ * cores. In slot job submission, if there is a kctx
+ * change, and the relevant katom is configured with
+ * BASE_JD_REQ_SKIP_CACHE_START, an L1 read-only cache
+ * maintenance operation is enforced.
* @read_idx: Current read index of buffer
* @write_idx: Current write index of buffer
* @job_chain_flag: Flag used to implement jobchain disambiguation
@@ -49,7 +70,7 @@ struct rb_entry {
struct slot_rb {
struct rb_entry entries[SLOT_RB_SIZE];
- struct kbase_context *last_context;
+ u64 last_kctx_tagged;
u8 read_idx;
u8 write_idx;
diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_hw.c b/mali_kbase/backend/gpu/mali_kbase_jm_hw.c
index 001efd9..ec3b906 100644
--- a/mali_kbase/backend/gpu/mali_kbase_jm_hw.c
+++ b/mali_kbase/backend/gpu/mali_kbase_jm_hw.c
@@ -33,6 +33,7 @@
#include <mali_kbase_reset_gpu.h>
#include <mali_kbase_ctx_sched.h>
#include <mali_kbase_kinstr_jm.h>
+#include <mali_kbase_hwaccess_instr.h>
#include <mali_kbase_hwcnt_context.h>
#include <device/mali_kbase_device.h>
#include <backend/gpu/mali_kbase_irq_internal.h>
@@ -198,7 +199,9 @@ void kbase_job_hw_submit(struct kbase_device *kbdev,
u32 cfg;
u64 const jc_head = select_job_chain(katom);
u64 affinity;
+ struct slot_rb *ptr_slot_rb = &kbdev->hwaccess.backend.slot_rb[js];
+ lockdep_assert_held(&kbdev->hwaccess_lock);
KBASE_DEBUG_ASSERT(kbdev);
KBASE_DEBUG_ASSERT(katom);
@@ -227,9 +230,23 @@ void kbase_job_hw_submit(struct kbase_device *kbdev,
!(kbdev->serialize_jobs & KBASE_SERIALIZE_RESET))
cfg |= JS_CONFIG_ENABLE_FLUSH_REDUCTION;
- if (0 != (katom->core_req & BASE_JD_REQ_SKIP_CACHE_START))
- cfg |= JS_CONFIG_START_FLUSH_NO_ACTION;
- else
+ if (0 != (katom->core_req & BASE_JD_REQ_SKIP_CACHE_START)) {
+ /* Force a cache maintenance operation if the newly submitted
+ * katom to the slot is from a different kctx. For a JM GPU
+ * that has the feature BASE_HW_FEATURE_FLUSH_INV_SHADER_OTHER,
+ * applies a FLUSH_INV_SHADER_OTHER. Otherwise, do a
+ * FLUSH_CLEAN_INVALIDATE.
+ */
+ u64 tagged_kctx = ptr_slot_rb->last_kctx_tagged;
+
+ if (tagged_kctx != SLOT_RB_NULL_TAG_VAL && tagged_kctx != SLOT_RB_TAG_KCTX(kctx)) {
+ if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_FLUSH_INV_SHADER_OTHER))
+ cfg |= JS_CONFIG_START_FLUSH_INV_SHADER_OTHER;
+ else
+ cfg |= JS_CONFIG_START_FLUSH_CLEAN_INVALIDATE;
+ } else
+ cfg |= JS_CONFIG_START_FLUSH_NO_ACTION;
+ } else
cfg |= JS_CONFIG_START_FLUSH_CLEAN_INVALIDATE;
if (0 != (katom->core_req & BASE_JD_REQ_SKIP_CACHE_END) &&
@@ -246,13 +263,13 @@ void kbase_job_hw_submit(struct kbase_device *kbdev,
(katom->core_req & BASE_JD_REQ_END_RENDERPASS))
cfg |= JS_CONFIG_DISABLE_DESCRIPTOR_WR_BK;
- if (!kbdev->hwaccess.backend.slot_rb[js].job_chain_flag) {
+ if (!ptr_slot_rb->job_chain_flag) {
cfg |= JS_CONFIG_JOB_CHAIN_FLAG;
katom->atom_flags |= KBASE_KATOM_FLAGS_JOBCHAIN;
- kbdev->hwaccess.backend.slot_rb[js].job_chain_flag = true;
+ ptr_slot_rb->job_chain_flag = true;
} else {
katom->atom_flags &= ~KBASE_KATOM_FLAGS_JOBCHAIN;
- kbdev->hwaccess.backend.slot_rb[js].job_chain_flag = false;
+ ptr_slot_rb->job_chain_flag = false;
}
kbase_reg_write(kbdev, JOB_SLOT_REG(js, JS_CONFIG_NEXT), cfg);
@@ -290,6 +307,10 @@ void kbase_job_hw_submit(struct kbase_device *kbdev,
&kbdev->gpu_props.props.raw_props.js_features[js],
"ctx_nr,atom_nr");
kbase_kinstr_jm_atom_hw_submit(katom);
+
+ /* Update the slot's last katom submission kctx */
+ ptr_slot_rb->last_kctx_tagged = SLOT_RB_TAG_KCTX(kctx);
+
#if IS_ENABLED(CONFIG_GPU_TRACEPOINTS)
if (!kbase_backend_nr_atoms_submitted(kbdev, js)) {
/* If this is the only job on the slot, trace it as starting */
@@ -300,7 +321,6 @@ void kbase_job_hw_submit(struct kbase_device *kbdev,
sizeof(js_string)),
ktime_to_ns(katom->start_timestamp),
(u32)katom->kctx->id, 0, katom->work_id);
- kbdev->hwaccess.backend.slot_rb[js].last_context = katom->kctx;
}
#endif
@@ -823,7 +843,7 @@ void kbase_jm_wait_for_zero_jobs(struct kbase_context *kctx)
if (timeout != 0)
goto exit;
- if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) {
+ if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) {
dev_err(kbdev->dev,
"Issuing GPU soft-reset because jobs failed to be killed (within %d ms) as part of context termination (e.g. process exit)\n",
ZAP_TIMEOUT);
@@ -938,6 +958,7 @@ void kbase_job_slot_hardstop(struct kbase_context *kctx, int js,
stopped = kbase_backend_soft_hard_stop_slot(kbdev, kctx, js,
target_katom,
JS_COMMAND_HARD_STOP);
+ CSTD_UNUSED(stopped);
}
/**
@@ -1177,6 +1198,13 @@ static void kbasep_reset_timeout_worker(struct work_struct *data)
kbase_pm_metrics_update(kbdev, NULL);
spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ /* Tell hardware counters a reset is about to occur.
+ * If the instr backend is in an unrecoverable error state (e.g. due to
+ * HW being unresponsive), this will transition the backend out of
+ * it, on the assumption a reset will fix whatever problem there was.
+ */
+ kbase_instr_hwcnt_on_before_reset(kbdev);
+
/* Reset the GPU */
kbase_pm_init_hw(kbdev, 0);
@@ -1309,7 +1337,7 @@ static void kbasep_try_reset_gpu_early(struct kbase_device *kbdev)
* @kbdev: kbase device
* @flags: Bitfield indicating impact of reset (see flag defines)
*
- * This function just soft-stops all the slots to ensure that as many jobs as
+ * This function soft-stops all the slots to ensure that as many jobs as
* possible are saved.
*
* Return:
@@ -1323,7 +1351,6 @@ bool kbase_prepare_to_reset_gpu_locked(struct kbase_device *kbdev,
{
int i;
- CSTD_UNUSED(flags);
KBASE_DEBUG_ASSERT(kbdev);
#ifdef CONFIG_MALI_ARBITER_SUPPORT
@@ -1335,6 +1362,9 @@ bool kbase_prepare_to_reset_gpu_locked(struct kbase_device *kbdev,
}
#endif
+ if (flags & RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)
+ kbase_instr_hwcnt_on_unrecoverable_error(kbdev);
+
if (atomic_cmpxchg(&kbdev->hwaccess.backend.reset_gpu,
KBASE_RESET_GPU_NOT_PENDING,
KBASE_RESET_GPU_PREPARED) !=
diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c
index 1906286..0f2f296 100644
--- a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c
+++ b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c
@@ -760,6 +760,13 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev,
/* ***TRANSITION TO HIGHER STATE*** */
fallthrough;
case KBASE_ATOM_EXIT_PROTECTED_RESET:
+ /* L2 cache has been turned off (which is needed prior to the reset of GPU
+ * to exit the protected mode), so the override flag can be safely cleared.
+ * Even if L2 cache is powered up again before the actual reset, it should
+ * not be an issue (there are no jobs running on the GPU).
+ */
+ kbase_pm_protected_override_disable(kbdev);
+
/* Issue the reset to the GPU */
err = kbase_gpu_protected_mode_reset(kbdev);
@@ -768,7 +775,6 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev,
if (err) {
kbdev->protected_mode_transition = false;
- kbase_pm_protected_override_disable(kbdev);
/* Failed to exit protected mode, fail atom */
katom[idx]->event_code = BASE_JD_EVENT_JOB_INVALID;
@@ -1069,9 +1075,9 @@ kbase_rb_atom_might_depend(const struct kbase_jd_atom *katom_a,
/**
* kbase_gpu_irq_evict - evict a slot's JSn_HEAD_NEXT atom from the HW if it is
* related to a failed JSn_HEAD atom
- * @kbdev kbase device
- * @js job slot to check
- * @completion_code completion code of the failed atom
+ * @kbdev: kbase device
+ * @js: job slot to check
+ * @completion_code: completion code of the failed atom
*
* Note: 'STOPPED' atoms are considered 'failed', as they are in the HW, but
* unlike other failure codes we _can_ re-run them.
@@ -1129,6 +1135,14 @@ bool kbase_gpu_irq_evict(struct kbase_device *kbdev, int js,
if (next_katom->core_req & BASE_JD_REQ_PERMON)
kbase_pm_release_gpu_cycle_counter_nolock(kbdev);
+ /* On evicting the next_katom, the last submission kctx on the
+ * given job slot then reverts back to the one that owns katom.
+ * The aim is to enable the next submission that can determine
+ * if the read only shader core L1 cache should be invalidated.
+ */
+ kbdev->hwaccess.backend.slot_rb[js].last_kctx_tagged =
+ SLOT_RB_TAG_KCTX(katom->kctx);
+
return true;
}
@@ -1137,11 +1151,11 @@ bool kbase_gpu_irq_evict(struct kbase_device *kbdev, int js,
/**
* kbase_gpu_complete_hw - complete the atom in a slot's JSn_HEAD
- * @kbdev kbase device
- * @js job slot to check
- * @completion_code completion code of the completed atom
- * @job_tail value read from JSn_TAIL, for STOPPED atoms
- * @end_timestamp pointer to approximate ktime value when the katom completed
+ * @kbdev: kbase device
+ * @js: job slot to check
+ * @completion_code: completion code of the completed atom
+ * @job_tail: value read from JSn_TAIL, for STOPPED atoms
+ * @end_timestamp: pointer to approximate ktime value when the katom completed
*
* Among other operations, this also executes step 2 of a 2-step process of
* removing any related atoms from a slot's JSn_HEAD_NEXT (ringbuffer index 1),
@@ -1323,8 +1337,6 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js,
ktime_to_ns(*end_timestamp),
(u32)next_katom->kctx->id, 0,
next_katom->work_id);
- kbdev->hwaccess.backend.slot_rb[js].last_context =
- next_katom->kctx;
} else {
char js_string[16];
@@ -1333,7 +1345,6 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js,
sizeof(js_string)),
ktime_to_ns(ktime_get()), 0, 0,
0);
- kbdev->hwaccess.backend.slot_rb[js].last_context = 0;
}
}
#endif
@@ -1427,6 +1438,9 @@ void kbase_backend_reset(struct kbase_device *kbdev, ktime_t *end_timestamp)
katom->event_code = BASE_JD_EVENT_JOB_CANCELLED;
kbase_jm_complete(kbdev, katom, end_timestamp);
}
+
+ /* Clear the slot's last katom submission kctx on reset */
+ kbdev->hwaccess.backend.slot_rb[js].last_kctx_tagged = SLOT_RB_NULL_TAG_VAL;
}
/* Re-enable GPU hardware counters if we're resetting from protected
@@ -1649,6 +1663,11 @@ bool kbase_backend_soft_hard_stop_slot(struct kbase_device *kbdev,
kbase_gpu_remove_atom(kbdev,
katom_idx1,
action, true);
+ /* Revert the last_kctx_tagged. */
+ kbdev->hwaccess.backend.slot_rb[js]
+ .last_kctx_tagged =
+ SLOT_RB_TAG_KCTX(katom_idx0->kctx);
+
stop_x_dep_idx1 =
should_stop_x_dep_slot(katom_idx1);
@@ -1724,6 +1743,10 @@ bool kbase_backend_soft_hard_stop_slot(struct kbase_device *kbdev,
kbase_gpu_remove_atom(kbdev, katom_idx1,
action,
false);
+ /* Revert the last_kctx_tagged, or mark it as purged */
+ kbdev->hwaccess.backend.slot_rb[js].last_kctx_tagged =
+ kctx_idx0 ? SLOT_RB_TAG_KCTX(katom_idx0->kctx) :
+ SLOT_RB_TAG_PURGED;
} else {
/* idx0 has already completed - stop
* idx1
@@ -1753,7 +1776,8 @@ void kbase_backend_cache_clean(struct kbase_device *kbdev,
struct kbase_jd_atom *katom)
{
if (katom->need_cache_flush_cores_retained) {
- kbase_gpu_start_cache_clean(kbdev);
+ kbase_gpu_start_cache_clean(kbdev,
+ GPU_COMMAND_CACHE_CLN_INV_FULL);
kbase_gpu_wait_cache_clean(kbdev);
katom->need_cache_flush_cores_retained = false;
@@ -1811,3 +1835,34 @@ void kbase_gpu_dump_slots(struct kbase_device *kbdev)
spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
}
+
+void kbase_backend_slot_kctx_purge_locked(struct kbase_device *kbdev, struct kbase_context *kctx)
+{
+ int js;
+ bool tracked = false;
+
+ lockdep_assert_held(&kbdev->hwaccess_lock);
+
+ for (js = 0; js < kbdev->gpu_props.num_job_slots; js++) {
+ u64 tagged_kctx = kbdev->hwaccess.backend.slot_rb[js].last_kctx_tagged;
+
+ if (tagged_kctx == SLOT_RB_TAG_KCTX(kctx)) {
+ /* Mark the slot's kctx tracking field as purged */
+ kbdev->hwaccess.backend.slot_rb[js].last_kctx_tagged = SLOT_RB_TAG_PURGED;
+ tracked = true;
+ }
+ }
+
+ if (tracked) {
+ /* The context had run some jobs before the purge, so other
+ * slots still in the SLOT_RB_NULL_TAG_VAL condition need to
+ * be marked as purged as well.
+ */
+ for (js = 0; js < kbdev->gpu_props.num_job_slots; js++) {
+ if (kbdev->hwaccess.backend.slot_rb[js].last_kctx_tagged ==
+ SLOT_RB_NULL_TAG_VAL)
+ kbdev->hwaccess.backend.slot_rb[js].last_kctx_tagged =
+ SLOT_RB_TAG_PURGED;
+ }
+ }
+}
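+
+/* Illustrative usage sketch (not part of this patch): the _locked suffix
+ * means the hwaccess_lock must already be held by the caller, e.g.
+ *
+ *	unsigned long flags;
+ *
+ *	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ *	kbase_backend_slot_kctx_purge_locked(kbdev, kctx);
+ *	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ */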
diff --git a/mali_kbase/backend/gpu/mali_kbase_l2_mmu_config.c b/mali_kbase/backend/gpu/mali_kbase_l2_mmu_config.c
index 7131546..c2d7a26 100644
--- a/mali_kbase/backend/gpu/mali_kbase_l2_mmu_config.c
+++ b/mali_kbase/backend/gpu/mali_kbase_l2_mmu_config.c
@@ -26,7 +26,7 @@
#include "mali_kbase_l2_mmu_config.h"
/**
- * struct l2_mmu_config_limit_region
+ * struct l2_mmu_config_limit_region - L2 MMU limit field
*
* @value: The default value to load into the L2_MMU_CONFIG register
* @mask: The shifted mask of the field in the L2_MMU_CONFIG register
@@ -39,7 +39,7 @@ struct l2_mmu_config_limit_region {
};
/**
- * struct l2_mmu_config_limit
+ * struct l2_mmu_config_limit - L2 MMU read and write limit
*
* @product_model: The GPU for which this entry applies
* @read: Values for the read limit field
diff --git a/mali_kbase/backend/gpu/mali_kbase_model_dummy.c b/mali_kbase/backend/gpu/mali_kbase_model_dummy.c
new file mode 100644
index 0000000..ccf0e7c
--- /dev/null
+++ b/mali_kbase/backend/gpu/mali_kbase_model_dummy.c
@@ -0,0 +1,2008 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/*
+ *
+ * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+/* NOTES:
+ * - A default GPU can be compiled in during the build, by defining
+ * CONFIG_MALI_NO_MALI_DEFAULT_GPU. SCons sets this, which means that
+ * insmod'ing mali_kbase.ko with no arguments after a build with "scons
+ * gpu=tXYZ" will yield the expected GPU ID for tXYZ. This can always be
+ * overridden by passing the 'no_mali_gpu' argument to insmod.
+ *
+ * - if CONFIG_MALI_ERROR_INJECT is defined the error injection system is
+ * activated.
+ */
+
+/* Implementation of failure injection system:
+ *
+ * Error conditions are generated by gpu_generate_error().
+ * Depending on the configuration, gpu_generate_error() either generates a HW
+ * error condition at random (CONFIG_MALI_ERROR_INJECT_RANDOM defined) or
+ * checks error_track_list for an error configuration to be applied to the
+ * current job chain (CONFIG_MALI_ERROR_INJECT_RANDOM not defined).
+ * Each error condition will trigger a specific "state" for a certain set of
+ * registers as per Midgard Architecture Specifications doc.
+ *
+ * According to Midgard Architecture Specifications doc the following registers
+ * are always affected by error conditions:
+ *
+ * JOB Exception:
+ * JOB_IRQ_RAWSTAT
+ * JOB<n> STATUS AREA
+ *
+ * MMU Exception:
+ * MMU_IRQ_RAWSTAT
+ * AS<n>_FAULTSTATUS
+ * AS<n>_FAULTADDRESS
+ *
+ * GPU Exception:
+ * GPU_IRQ_RAWSTAT
+ * GPU_FAULTSTATUS
+ * GPU_FAULTADDRESS
+ *
+ * For further clarification on the model behaviour upon specific error
+ * conditions the user may refer to the Midgard Architecture Specification
+ * document
+ */
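+
+/* For example (see update_register_statuses() further below): an injected
+ * KBASE_JOB_CONFIG_FAULT is reported as JS_STATUS_CONFIG_FAULT in the
+ * affected JS<n>_STATUS register, with the corresponding JOB_FAIL bit
+ * (bit 16 + n) also raised in JOB_IRQ_RAWSTAT.
+ */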
+#include <mali_kbase.h>
+#include <gpu/mali_kbase_gpu_regmap.h>
+#include <backend/gpu/mali_kbase_model_dummy.h>
+#include <mali_kbase_mem_linux.h>
+
+#if MALI_USE_CSF
+#include <csf/mali_kbase_csf_firmware.h>
+
+/* Index of the last value register for each type of core, with the 1st value
+ * register being at index 0.
+ */
+#define IPA_CTL_MAX_VAL_CNT_IDX (KBASE_IPA_CONTROL_NUM_BLOCK_COUNTERS - 1)
+
+/* Array for storing the value of SELECT register for each type of core */
+static u64 ipa_ctl_select_config[KBASE_IPA_CORE_TYPE_NUM];
+static bool ipa_control_timer_enabled;
+#endif
+
+#define LO_MASK(M) ((M) & 0xFFFFFFFF)
+
+static u32 get_implementation_register(u32 reg)
+{
+ switch (reg) {
+ case GPU_CONTROL_REG(SHADER_PRESENT_LO):
+ return LO_MASK(DUMMY_IMPLEMENTATION_SHADER_PRESENT);
+ case GPU_CONTROL_REG(TILER_PRESENT_LO):
+ return LO_MASK(DUMMY_IMPLEMENTATION_TILER_PRESENT);
+ case GPU_CONTROL_REG(L2_PRESENT_LO):
+ return LO_MASK(DUMMY_IMPLEMENTATION_L2_PRESENT);
+ case GPU_CONTROL_REG(STACK_PRESENT_LO):
+ return LO_MASK(DUMMY_IMPLEMENTATION_STACK_PRESENT);
+
+ case GPU_CONTROL_REG(SHADER_PRESENT_HI):
+ case GPU_CONTROL_REG(TILER_PRESENT_HI):
+ case GPU_CONTROL_REG(L2_PRESENT_HI):
+ case GPU_CONTROL_REG(STACK_PRESENT_HI):
+ /* *** FALLTHROUGH *** */
+ default:
+ return 0;
+ }
+}
+
+struct {
+ unsigned long prfcnt_base;
+ u32 *prfcnt_base_cpu;
+ struct kbase_device *kbdev;
+ struct tagged_addr *pages;
+ size_t page_count;
+
+ u32 time;
+
+ struct {
+ u32 jm;
+ u32 tiler;
+ u32 l2;
+ u32 shader;
+ } prfcnt_en;
+
+ u64 l2_present;
+ u64 shader_present;
+
+#if !MALI_USE_CSF
+ u64 jm_counters[KBASE_DUMMY_MODEL_COUNTER_PER_CORE];
+#else
+ u64 cshw_counters[KBASE_DUMMY_MODEL_COUNTER_PER_CORE];
+#endif /* !MALI_USE_CSF */
+ u64 tiler_counters[KBASE_DUMMY_MODEL_COUNTER_PER_CORE];
+ u64 l2_counters[KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS *
+ KBASE_DUMMY_MODEL_COUNTER_PER_CORE];
+ u64 shader_counters[KBASE_DUMMY_MODEL_MAX_SHADER_CORES *
+ KBASE_DUMMY_MODEL_COUNTER_PER_CORE];
+
+} performance_counters = {
+ .l2_present = DUMMY_IMPLEMENTATION_L2_PRESENT,
+ .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT,
+};
+
+struct job_slot {
+ int job_active;
+ int job_queued;
+ int job_complete_irq_asserted;
+ int job_irq_mask;
+ int job_disabled;
+};
+
+/**
+ * struct control_reg_values_t - control register values specific to the GPU being 'emulated'
+ * @name: GPU name
+ * @gpu_id: GPU ID to report
+ * @as_present: Bitmap of address spaces present
+ * @thread_max_threads: Maximum number of threads per core
+ * @thread_max_workgroup_size: Maximum number of threads per workgroup
+ * @thread_max_barrier_size: Maximum number of threads per barrier
+ * @thread_features: Thread features, NOT INCLUDING the 2
+ * most-significant bits, which are always set to
+ * IMPLEMENTATION_MODEL.
+ * @core_features: Core features
+ * @tiler_features: Tiler features
+ * @mmu_features: MMU features
+ * @gpu_features_lo: GPU features (low)
+ * @gpu_features_hi: GPU features (high)
+ */
+struct control_reg_values_t {
+ const char *name;
+ u32 gpu_id;
+ u32 as_present;
+ u32 thread_max_threads;
+ u32 thread_max_workgroup_size;
+ u32 thread_max_barrier_size;
+ u32 thread_features;
+ u32 core_features;
+ u32 tiler_features;
+ u32 mmu_features;
+ u32 gpu_features_lo;
+ u32 gpu_features_hi;
+};
+
+struct dummy_model_t {
+ int reset_completed;
+ int reset_completed_mask;
+ int prfcnt_sample_completed;
+ int power_changed_mask; /* 2bits: _ALL,_SINGLE */
+ int power_changed; /* 1bit */
+ bool clean_caches_completed;
+ bool clean_caches_completed_irq_enabled;
+ int power_on; /* 6bits: SHADER[4],TILER,L2 */
+ u32 stack_power_on_lo;
+ u32 coherency_enable;
+ unsigned int job_irq_js_state;
+ struct job_slot slots[NUM_SLOTS];
+ const struct control_reg_values_t *control_reg_values;
+ u32 l2_config;
+ void *data;
+};
+
+void gpu_device_set_data(void *model, void *data)
+{
+ struct dummy_model_t *dummy = (struct dummy_model_t *)model;
+
+ dummy->data = data;
+}
+
+void *gpu_device_get_data(void *model)
+{
+ struct dummy_model_t *dummy = (struct dummy_model_t *)model;
+
+ return dummy->data;
+}
+
+#define signal_int(m, s) m->slots[(s)].job_complete_irq_asserted = 1
+
+/* SCons should pass in a default GPU, but other ways of building (e.g.
+ * in-tree) won't, so define one here in case.
+ */
+#ifndef CONFIG_MALI_NO_MALI_DEFAULT_GPU
+#define CONFIG_MALI_NO_MALI_DEFAULT_GPU "tMIx"
+#endif
+
+static char *no_mali_gpu = CONFIG_MALI_NO_MALI_DEFAULT_GPU;
+module_param(no_mali_gpu, charp, 0000);
+MODULE_PARM_DESC(no_mali_gpu, "GPU to identify as");
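+/* For example (illustrative only), the default can be overridden at module
+ * load time with any GPU name listed in all_control_reg_values[] below:
+ *
+ *	insmod mali_kbase.ko no_mali_gpu="tGOx_r1p0"
+ */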
+
+/* Construct a value for the THREAD_FEATURES register, *except* the two most
+ * significant bits, which are set to IMPLEMENTATION_MODEL in
+ * midgard_model_read_reg().
+ */
+#if MALI_USE_CSF
+#define THREAD_FEATURES_PARTIAL(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT) \
+ ((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 24))
+#else
+#define THREAD_FEATURES_PARTIAL(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT) \
+ ((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 16) | ((MAX_TG_SPLIT) << 24))
+#endif
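+
+/* Worked example (JM build, using the "tMIx" values from the table below):
+ *   THREAD_FEATURES_PARTIAL(0x6000, 4, 10)
+ *     = 0x6000 | (4 << 16) | (10 << 24) = 0x0A046000
+ * midgard_model_read_reg() then ORs in (IMPLEMENTATION_MODEL << 30) to set
+ * the two most-significant bits.
+ */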
+
+/* Array associating GPU names with control register values. The first
+ * one is used in the case of no match.
+ */
+static const struct control_reg_values_t all_control_reg_values[] = {
+ {
+ .name = "tMIx",
+ .gpu_id = GPU_ID2_MAKE(6, 0, 10, 0, 0, 1, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tHEx",
+ .gpu_id = GPU_ID2_MAKE(6, 2, 0, 1, 0, 3, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tSIx",
+ .gpu_id = GPU_ID2_MAKE(7, 0, 0, 0, 1, 1, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x300,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+ .tiler_features = 0x209,
+ .mmu_features = 0x2821,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tDVx",
+ .gpu_id = GPU_ID2_MAKE(7, 0, 0, 3, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x300,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+ .tiler_features = 0x209,
+ .mmu_features = 0x2821,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tNOx",
+ .gpu_id = GPU_ID2_MAKE(7, 2, 1, 1, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tGOx_r0p0",
+ .gpu_id = GPU_ID2_MAKE(7, 2, 2, 2, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tGOx_r1p0",
+ .gpu_id = GPU_ID2_MAKE(7, 4, 0, 2, 1, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10),
+ .core_features = 0x2,
+ .tiler_features = 0x209,
+ .mmu_features = 0x2823,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tTRx",
+ .gpu_id = GPU_ID2_MAKE(9, 0, 8, 0, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tNAx",
+ .gpu_id = GPU_ID2_MAKE(9, 0, 8, 1, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tBEx",
+ .gpu_id = GPU_ID2_MAKE(9, 2, 0, 2, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tBAx",
+ .gpu_id = GPU_ID2_MAKE(9, 14, 4, 5, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tDUx",
+ .gpu_id = GPU_ID2_MAKE(10, 2, 0, 1, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tODx",
+ .gpu_id = GPU_ID2_MAKE(10, 8, 0, 2, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tGRx",
+ .gpu_id = GPU_ID2_MAKE(10, 10, 0, 3, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+ .core_features = 0x0, /* core_1e16fma2tex */
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tVAx",
+ .gpu_id = GPU_ID2_MAKE(10, 12, 0, 4, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x180,
+ .thread_max_workgroup_size = 0x180,
+ .thread_max_barrier_size = 0x180,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0),
+ .core_features = 0x0, /* core_1e16fma2tex */
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0,
+ .gpu_features_hi = 0,
+ },
+ {
+ .name = "tTUx",
+ .gpu_id = GPU_ID2_MAKE(11, 8, 5, 2, 0, 0, 0),
+ .as_present = 0xFF,
+ .thread_max_threads = 0x800,
+ .thread_max_workgroup_size = 0x400,
+ .thread_max_barrier_size = 0x400,
+ .thread_features = THREAD_FEATURES_PARTIAL(0x10000, 4, 0),
+ .core_features = 0x0, /* core_1e32fma2tex */
+ .tiler_features = 0x809,
+ .mmu_features = 0x2830,
+ .gpu_features_lo = 0xf,
+ .gpu_features_hi = 0,
+ },
+};
+
+struct error_status_t hw_error_status;
+
+#if MALI_USE_CSF
+static u32 gpu_model_get_prfcnt_value(enum kbase_ipa_core_type core_type,
+ u32 cnt_idx, bool is_low_word)
+{
+ u64 *counters_data;
+ u32 core_count = 0;
+ u32 event_index;
+ u64 value = 0;
+ u32 core;
+
+ if (WARN_ON(core_type >= KBASE_IPA_CORE_TYPE_NUM))
+ return 0;
+
+ if (WARN_ON(cnt_idx >= KBASE_IPA_CONTROL_NUM_BLOCK_COUNTERS))
+ return 0;
+
+ event_index =
+ (ipa_ctl_select_config[core_type] >> (cnt_idx * 8)) & 0xFF;
+
+ /* Currently only primary counter blocks are supported */
+ if (WARN_ON(event_index >= 64))
+ return 0;
+
+ /* The actual events start from index 4 onwards. Spec also says PRFCNT_EN,
+ * TIMESTAMP_LO or TIMESTAMP_HI pseudo-counters do not make sense for
+ * IPA counters. If selected, the value returned for them will be zero.
+ */
+ if (WARN_ON(event_index <= 3))
+ return 0;
+
+ event_index -= 4;
+
+ switch (core_type) {
+ case KBASE_IPA_CORE_TYPE_CSHW:
+ core_count = 1;
+ counters_data = performance_counters.cshw_counters;
+ break;
+ case KBASE_IPA_CORE_TYPE_MEMSYS:
+ core_count = hweight64(performance_counters.l2_present);
+ counters_data = performance_counters.l2_counters;
+ break;
+ case KBASE_IPA_CORE_TYPE_TILER:
+ core_count = 1;
+ counters_data = performance_counters.tiler_counters;
+ break;
+ case KBASE_IPA_CORE_TYPE_SHADER:
+ core_count = hweight64(performance_counters.shader_present);
+ counters_data = performance_counters.shader_counters;
+ break;
+ default:
+ WARN(1, "Invalid core_type %d\n", core_type);
+ break;
+ }
+
+ for (core = 0; core < core_count; core++) {
+ value += counters_data[event_index];
+ event_index += KBASE_DUMMY_MODEL_COUNTER_PER_CORE;
+ }
+
+ if (is_low_word)
+ return (value & U32_MAX);
+ else
+ return (value >> 32);
+}
+
+void gpu_model_clear_prfcnt_values(void)
+{
+ memset(performance_counters.cshw_counters, 0,
+ sizeof(performance_counters.cshw_counters));
+
+ memset(performance_counters.tiler_counters, 0,
+ sizeof(performance_counters.tiler_counters));
+
+ memset(performance_counters.l2_counters, 0,
+ sizeof(performance_counters.l2_counters));
+
+ memset(performance_counters.shader_counters, 0,
+ sizeof(performance_counters.shader_counters));
+}
+KBASE_EXPORT_TEST_API(gpu_model_clear_prfcnt_values);
+#endif
+
+/**
+ * gpu_model_dump_prfcnt_blocks() - Dump performance counter values to buffer
+ *
+ * @values: Array of values to be written out
+ * @out_index: Index into performance counter buffer
+ * @block_count: Number of blocks to dump
+ * @prfcnt_enable_mask: Counter enable mask
+ * @blocks_present: Available blocks bit mask
+ */
+static void gpu_model_dump_prfcnt_blocks(u64 *values, u32 *out_index,
+ u32 block_count,
+ u32 prfcnt_enable_mask,
+ u64 blocks_present)
+{
+ u32 block_idx, counter;
+ u32 counter_value = 0;
+ u32 *prfcnt_base;
+ u32 index = 0;
+
+ prfcnt_base = performance_counters.prfcnt_base_cpu;
+
+ for (block_idx = 0; block_idx < block_count; block_idx++) {
+ /* only dump values if core is present */
+ if (!(blocks_present & (1 << block_idx))) {
+#if MALI_USE_CSF
+ /* on CSF, dump a zeroed-out block */
+ memset(&prfcnt_base[*out_index], 0,
+ KBASE_DUMMY_MODEL_BLOCK_SIZE);
+ *out_index += KBASE_DUMMY_MODEL_VALUES_PER_BLOCK;
+#endif /* MALI_USE_CSF */
+ continue;
+ }
+
+ /* write the header */
+ prfcnt_base[*out_index] = performance_counters.time++;
+ prfcnt_base[*out_index+2] = prfcnt_enable_mask;
+ *out_index += KBASE_DUMMY_MODEL_COUNTER_HEADER_DWORDS;
+
+ /* write the counters */
+ for (counter = 0;
+ counter < KBASE_DUMMY_MODEL_COUNTER_PER_CORE;
+ counter++) {
+ /* HW counter values retrieved through
+ * PRFCNT_SAMPLE request are of 32 bits only.
+ */
+ counter_value = (u32)values[index++];
+ if (KBASE_DUMMY_MODEL_COUNTER_ENABLED(
+ prfcnt_enable_mask, (counter +
+ KBASE_DUMMY_MODEL_COUNTER_HEADER_DWORDS))) {
+ prfcnt_base[*out_index + counter] =
+ counter_value;
+ }
+ }
+ *out_index += KBASE_DUMMY_MODEL_COUNTER_PER_CORE;
+ }
+}
+
+/**
+ * gpu_model_sync_dummy_prfcnt() - Synchronize dumped performance counter values
+ *
+ * Used to ensure counter values are not lost if cache invalidation is performed
+ * prior to reading.
+ */
+static void gpu_model_sync_dummy_prfcnt(void)
+{
+ int i;
+ struct page *pg;
+
+ for (i = 0; i < performance_counters.page_count; i++) {
+ pg = as_page(performance_counters.pages[i]);
+ kbase_sync_single_for_device(performance_counters.kbdev,
+ kbase_dma_addr(pg), PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+ }
+}
+
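+/* The dummy dump buffer written by midgard_model_dump_prfcnt() below is a
+ * sequence of blocks in a fixed order: one JM (or CSHW on CSF) block, one
+ * tiler block, KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS memsys blocks and
+ * KBASE_DUMMY_MODEL_MAX_SHADER_CORES shader blocks. Each block that is
+ * written out starts with KBASE_DUMMY_MODEL_COUNTER_HEADER_DWORDS header
+ * words followed by KBASE_DUMMY_MODEL_COUNTER_PER_CORE counter values (see
+ * gpu_model_dump_prfcnt_blocks() above).
+ */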
+static void midgard_model_dump_prfcnt(void)
+{
+ u32 index = 0;
+
+#if !MALI_USE_CSF
+ gpu_model_dump_prfcnt_blocks(performance_counters.jm_counters, &index,
+ 1, 0xffffffff, 0x1);
+#else
+ gpu_model_dump_prfcnt_blocks(performance_counters.cshw_counters, &index,
+ 1, 0xffffffff, 0x1);
+#endif /* !MALI_USE_CSF */
+ gpu_model_dump_prfcnt_blocks(performance_counters.tiler_counters,
+ &index, 1,
+ performance_counters.prfcnt_en.tiler,
+ DUMMY_IMPLEMENTATION_TILER_PRESENT);
+ gpu_model_dump_prfcnt_blocks(performance_counters.l2_counters, &index,
+ KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS,
+ performance_counters.prfcnt_en.l2,
+ performance_counters.l2_present);
+ gpu_model_dump_prfcnt_blocks(performance_counters.shader_counters,
+ &index, KBASE_DUMMY_MODEL_MAX_SHADER_CORES,
+ performance_counters.prfcnt_en.shader,
+ performance_counters.shader_present);
+
+ gpu_model_sync_dummy_prfcnt();
+
+ /* simulate a 'long' time between samples */
+ performance_counters.time += 10;
+}
+
+static void init_register_statuses(struct dummy_model_t *dummy)
+{
+ int i;
+
+ hw_error_status.errors_mask = 0;
+ hw_error_status.gpu_error_irq = 0;
+ hw_error_status.gpu_fault_status = 0;
+ hw_error_status.job_irq_rawstat = 0;
+ hw_error_status.job_irq_status = 0;
+ hw_error_status.mmu_irq_rawstat = 0;
+ hw_error_status.mmu_irq_mask = 0;
+
+ for (i = 0; i < NUM_SLOTS; i++) {
+ hw_error_status.js_status[i] = 0;
+ hw_error_status.job_irq_rawstat |=
+ (dummy->slots[i].job_complete_irq_asserted) << i;
+ hw_error_status.job_irq_status |=
+ (dummy->slots[i].job_complete_irq_asserted) << i;
+ }
+ for (i = 0; i < NUM_MMU_AS; i++) {
+ hw_error_status.as_command[i] = 0;
+ hw_error_status.as_faultstatus[i] = 0;
+ hw_error_status.mmu_irq_mask |= 1 << i;
+ }
+
+ performance_counters.time = 0;
+}
+
+static void update_register_statuses(struct dummy_model_t *dummy, int job_slot)
+{
+ if (hw_error_status.errors_mask & IS_A_JOB_ERROR) {
+ if (job_slot == hw_error_status.current_job_slot) {
+#if !MALI_USE_CSF
+ if (hw_error_status.js_status[job_slot] == 0) {
+ /* status reg is clean; it can be written */
+
+ switch (hw_error_status.errors_mask &
+ IS_A_JOB_ERROR) {
+ case KBASE_JOB_INTERRUPTED:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_INTERRUPTED;
+ break;
+
+ case KBASE_JOB_STOPPED:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_STOPPED;
+ break;
+
+ case KBASE_JOB_TERMINATED:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_TERMINATED;
+ break;
+
+ case KBASE_JOB_CONFIG_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_CONFIG_FAULT;
+ break;
+
+ case KBASE_JOB_POWER_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_POWER_FAULT;
+ break;
+
+ case KBASE_JOB_READ_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_READ_FAULT;
+ break;
+
+ case KBASE_JOB_WRITE_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_WRITE_FAULT;
+ break;
+
+ case KBASE_JOB_AFFINITY_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_AFFINITY_FAULT;
+ break;
+
+ case KBASE_JOB_BUS_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_BUS_FAULT;
+ break;
+
+ case KBASE_INSTR_INVALID_PC:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_INSTR_INVALID_PC;
+ break;
+
+ case KBASE_INSTR_INVALID_ENC:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_INSTR_INVALID_ENC;
+ break;
+
+ case KBASE_INSTR_TYPE_MISMATCH:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_INSTR_TYPE_MISMATCH;
+ break;
+
+ case KBASE_INSTR_OPERAND_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_INSTR_OPERAND_FAULT;
+ break;
+
+ case KBASE_INSTR_TLS_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_INSTR_TLS_FAULT;
+ break;
+
+ case KBASE_INSTR_BARRIER_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_INSTR_BARRIER_FAULT;
+ break;
+
+ case KBASE_INSTR_ALIGN_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_INSTR_ALIGN_FAULT;
+ break;
+
+ case KBASE_DATA_INVALID_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_DATA_INVALID_FAULT;
+ break;
+
+ case KBASE_TILE_RANGE_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_TILE_RANGE_FAULT;
+ break;
+
+ case KBASE_ADDR_RANGE_FAULT:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_ADDRESS_RANGE_FAULT;
+ break;
+
+ case KBASE_OUT_OF_MEMORY:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_OUT_OF_MEMORY;
+ break;
+
+ case KBASE_UNKNOWN:
+ hw_error_status.js_status[job_slot] =
+ JS_STATUS_UNKNOWN;
+ break;
+
+ default:
+ model_error_log(KBASE_CORE,
+ "\nAtom Chain 0x%llx: Invalid Error Mask!",
+ hw_error_status.current_jc);
+ break;
+ }
+ }
+#endif /* !MALI_USE_CSF */
+
+ /* we set JOB_FAIL_<n> */
+ hw_error_status.job_irq_rawstat |=
+ (dummy->slots[job_slot].job_complete_irq_asserted) <<
+ (job_slot + 16);
+ hw_error_status.job_irq_status |=
+ (((dummy->slots[job_slot].job_complete_irq_asserted) <<
+ (job_slot)) &
+ (dummy->slots[job_slot].job_irq_mask <<
+ job_slot)) << 16;
+ } else {
+ hw_error_status.job_irq_rawstat |=
+ (dummy->slots[job_slot].job_complete_irq_asserted) <<
+ job_slot;
+ hw_error_status.job_irq_status |=
+ ((dummy->slots[job_slot].job_complete_irq_asserted) <<
+ (job_slot)) &
+ (dummy->slots[job_slot].job_irq_mask <<
+ job_slot);
+ }
+ } else {
+ hw_error_status.job_irq_rawstat |=
+ (dummy->slots[job_slot].job_complete_irq_asserted) <<
+ job_slot;
+ hw_error_status.job_irq_status |=
+ ((dummy->slots[job_slot].job_complete_irq_asserted) <<
+ (job_slot)) &
+ (dummy->slots[job_slot].job_irq_mask << job_slot);
+ } /* end of job register statuses */
+
+ if (hw_error_status.errors_mask & IS_A_MMU_ERROR) {
+ int i;
+
+ for (i = 0; i < NUM_MMU_AS; i++) {
+ if (i == hw_error_status.faulty_mmu_as) {
+ if (hw_error_status.as_faultstatus[i] == 0) {
+ u32 status =
+ hw_error_status.as_faultstatus[i];
+ /* status reg is clean; it can be
+ * written
+ */
+ switch (hw_error_status.errors_mask &
+ IS_A_MMU_ERROR) {
+ case KBASE_TRANSLATION_FAULT:
+ /* 0xCm means TRANSLATION FAULT
+ * (m is mmu_table_level)
+ */
+ status =
+ ((1 << 7) | (1 << 6) |
+ hw_error_status.mmu_table_level
+ );
+ break;
+
+ case KBASE_PERMISSION_FAULT:
+ /*0xC8 means PERMISSION FAULT */
+ status = ((1 << 7) | (1 << 6) |
+ (1 << 3));
+ break;
+
+ case KBASE_TRANSTAB_BUS_FAULT:
+ /* 0xDm means TRANSLATION TABLE
+ * BUS FAULT (m is
+ * mmu_table_level)
+ */
+ status = ((1 << 7) | (1 << 6) |
+ (1 << 4) |
+ hw_error_status.mmu_table_level
+ );
+ break;
+
+ case KBASE_ACCESS_FLAG:
+ /* 0xD8 means ACCESS FLAG */
+ status = ((1 << 7) | (1 << 6) |
+ (1 << 4) | (1 << 3));
+ break;
+
+ default:
+ model_error_log(KBASE_CORE,
+ "\nAtom Chain 0x%llx: Invalid Error Mask!",
+ hw_error_status.current_jc);
+ break;
+ }
+ hw_error_status.as_faultstatus[i] =
+ status;
+ }
+
+ if (hw_error_status.errors_mask &
+ KBASE_TRANSTAB_BUS_FAULT)
+ hw_error_status.mmu_irq_rawstat |=
+ 1 << (16 + i); /* bus error */
+ else
+ hw_error_status.mmu_irq_rawstat |=
+ 1 << i; /* page fault */
+ }
+ }
+ } /*end of mmu register statuses */
+ if (hw_error_status.errors_mask & IS_A_GPU_ERROR) {
+ if (hw_error_status.gpu_fault_status) {
+ /* not the first GPU error reported */
+ hw_error_status.gpu_error_irq |= (1 << 7);
+ } else {
+ hw_error_status.gpu_error_irq |= 1;
+ switch (hw_error_status.errors_mask & IS_A_GPU_ERROR) {
+ case KBASE_DELAYED_BUS_FAULT:
+ hw_error_status.gpu_fault_status = (1 << 7);
+ break;
+
+ case KBASE_SHAREABILITY_FAULT:
+ hw_error_status.gpu_fault_status = (1 << 7) |
+ (1 << 3);
+ break;
+
+ default:
+ model_error_log(KBASE_CORE,
+ "\nAtom Chain 0x%llx: Invalid Error Mask!",
+ hw_error_status.current_jc);
+ break;
+ }
+ }
+ }
+ hw_error_status.errors_mask = 0; /*clear error mask */
+}
+
+#if !MALI_USE_CSF
+static void update_job_irq_js_state(struct dummy_model_t *dummy, int mask)
+{
+ int i;
+
+ pr_debug("%s", "Updating the JS_ACTIVE register");
+
+ for (i = 0; i < NUM_SLOTS; i++) {
+ int slot_active = dummy->slots[i].job_active;
+ int next_busy = dummy->slots[i].job_queued;
+
+ if ((mask & (1 << i)) || (mask & (1 << (i + 16)))) {
+ /* clear the bits we're updating */
+ dummy->job_irq_js_state &= ~((1 << (16 + i)) |
+ (1 << i));
+ if (hw_error_status.js_status[i]) {
+ dummy->job_irq_js_state |= next_busy <<
+ (i + 16);
+ if (mask & (1 << (i + 16))) {
+ /* clear job slot status */
+ hw_error_status.js_status[i] = 0;
+ /* continue execution of jobchain */
+ dummy->slots[i].job_active =
+ dummy->slots[i].job_queued;
+ }
+ } else {
+ /* set bits if needed */
+ dummy->job_irq_js_state |= ((slot_active << i) |
+ (next_busy << (i + 16)));
+ }
+ }
+ }
+ pr_debug("The new snapshot is 0x%08X\n", dummy->job_irq_js_state);
+}
+#endif /* !MALI_USE_CSF */
+
+/**
+ * find_control_reg_values() - Look up constant control register values.
+ * @gpu: GPU name
+ *
+ * Look up the GPU name to find the correct set of control register values for
+ * that GPU. If not found, warn and use the first values in the array.
+ *
+ * Return: Pointer to control register values for that GPU.
+ */
+static const struct control_reg_values_t *find_control_reg_values(const char *gpu)
+{
+ size_t i;
+ const struct control_reg_values_t *ret = NULL;
+
+ for (i = 0; i < ARRAY_SIZE(all_control_reg_values); ++i) {
+ const struct control_reg_values_t * const fcrv = &all_control_reg_values[i];
+
+ if (!strcmp(fcrv->name, gpu)) {
+ ret = fcrv;
+ pr_debug("Found control register values for %s\n", gpu);
+ break;
+ }
+ }
+
+ if (!ret) {
+ ret = &all_control_reg_values[0];
+ pr_warn("Couldn't find control register values for GPU %s; using default %s\n",
+ gpu, ret->name);
+ }
+
+ return ret;
+}
+
+void *midgard_model_create(const void *config)
+{
+ struct dummy_model_t *dummy = NULL;
+
+ dummy = kzalloc(sizeof(*dummy), GFP_KERNEL);
+
+ if (dummy) {
+ dummy->job_irq_js_state = 0;
+ init_register_statuses(dummy);
+ dummy->control_reg_values = find_control_reg_values(no_mali_gpu);
+ }
+ return dummy;
+}
+
+void midgard_model_destroy(void *h)
+{
+ kfree((void *)h);
+}
+
+static void midgard_model_get_outputs(void *h)
+{
+ struct dummy_model_t *dummy = (struct dummy_model_t *)h;
+
+ if (hw_error_status.job_irq_status)
+ gpu_device_raise_irq(dummy, GPU_DUMMY_JOB_IRQ);
+
+ if ((dummy->power_changed && dummy->power_changed_mask) ||
+ (dummy->reset_completed & dummy->reset_completed_mask) ||
+ hw_error_status.gpu_error_irq ||
+ (dummy->clean_caches_completed && dummy->clean_caches_completed_irq_enabled) ||
+ dummy->prfcnt_sample_completed)
+ gpu_device_raise_irq(dummy, GPU_DUMMY_GPU_IRQ);
+
+ if (hw_error_status.mmu_irq_rawstat & hw_error_status.mmu_irq_mask)
+ gpu_device_raise_irq(dummy, GPU_DUMMY_MMU_IRQ);
+}
+
+static void midgard_model_update(void *h)
+{
+ struct dummy_model_t *dummy = (struct dummy_model_t *)h;
+ int i;
+
+ for (i = 0; i < NUM_SLOTS; i++) {
+ if (!dummy->slots[i].job_active)
+ continue;
+
+ if (dummy->slots[i].job_disabled) {
+ update_register_statuses(dummy, i);
+ continue;
+ }
+
+ /* If there are any pending interrupts that have not
+ * been cleared we cannot run the job in the next register
+ * as we will overwrite the register status of the job in
+ * the head registers - which has not yet been read
+ */
+ if ((hw_error_status.job_irq_rawstat & (1 << (i + 16))) ||
+ (hw_error_status.job_irq_rawstat & (1 << i))) {
+ continue;
+ }
+
+ /* this job is done, assert IRQ lines */
+ signal_int(dummy, i);
+#ifdef CONFIG_MALI_ERROR_INJECT
+ midgard_set_error(i);
+#endif /* CONFIG_MALI_ERROR_INJECT */
+ update_register_statuses(dummy, i);
+ /*if this job slot returned failures we cannot use it */
+ if (hw_error_status.job_irq_rawstat & (1 << (i + 16))) {
+ dummy->slots[i].job_active = 0;
+ continue;
+ }
+ /*process next job */
+ dummy->slots[i].job_active = dummy->slots[i].job_queued;
+ dummy->slots[i].job_queued = 0;
+ if (dummy->slots[i].job_active) {
+ if (hw_error_status.job_irq_rawstat & (1 << (i + 16)))
+ model_error_log(KBASE_CORE,
+ "\natom %lld running a job on a dirty slot",
+ hw_error_status.current_jc);
+ }
+ }
+}
+
+static void invalidate_active_jobs(struct dummy_model_t *dummy)
+{
+ int i;
+
+ for (i = 0; i < NUM_SLOTS; i++) {
+ if (dummy->slots[i].job_active) {
+ hw_error_status.job_irq_rawstat |= (1 << (16 + i));
+
+ hw_error_status.js_status[i] = 0x7f; /*UNKNOWN*/
+ }
+ }
+}
+
+u8 midgard_model_write_reg(void *h, u32 addr, u32 value)
+{
+ struct dummy_model_t *dummy = (struct dummy_model_t *)h;
+#if !MALI_USE_CSF
+ if ((addr >= JOB_CONTROL_REG(JOB_SLOT0)) &&
+ (addr < (JOB_CONTROL_REG(JOB_SLOT15) + 0x80))) {
+ int slot_idx = (addr >> 7) & 0xf;
+
+ KBASE_DEBUG_ASSERT(slot_idx < NUM_SLOTS);
+ if (addr == JOB_SLOT_REG(slot_idx, JS_HEAD_NEXT_LO)) {
+ hw_error_status.current_jc &=
+ ~((u64) (0xFFFFFFFF));
+ hw_error_status.current_jc |= (u64) value;
+ }
+ if (addr == JOB_SLOT_REG(slot_idx, JS_HEAD_NEXT_HI)) {
+ hw_error_status.current_jc &= (u64) 0xFFFFFFFF;
+ hw_error_status.current_jc |=
+ ((u64) value) << 32;
+ }
+ if (addr == JOB_SLOT_REG(slot_idx, JS_COMMAND_NEXT) &&
+ value == 1) {
+ pr_debug("%s", "start detected");
+ KBASE_DEBUG_ASSERT(!dummy->slots[slot_idx].job_active ||
+ !dummy->slots[slot_idx].job_queued);
+ if ((dummy->slots[slot_idx].job_active) ||
+ (hw_error_status.job_irq_rawstat &
+ (1 << (slot_idx + 16)))) {
+ pr_debug("~~~~~~~~~~~ Start: job slot is already active or there are IRQ pending ~~~~~~~~~"
+ );
+ dummy->slots[slot_idx].job_queued = 1;
+ } else {
+ dummy->slots[slot_idx].job_active = 1;
+ }
+ }
+
+ if (addr == JOB_SLOT_REG(slot_idx, JS_COMMAND_NEXT) && value ==
+ 0)
+ dummy->slots[slot_idx].job_queued = 0;
+
+ if ((addr == JOB_SLOT_REG(slot_idx, JS_COMMAND)) &&
+ (value == JS_COMMAND_SOFT_STOP ||
+ value == JS_COMMAND_HARD_STOP)) {
+ /*dummy->slots[slot_idx].job_active = 0; */
+ hw_error_status.current_job_slot = slot_idx;
+ if (value == JS_COMMAND_SOFT_STOP) {
+ hw_error_status.errors_mask = KBASE_JOB_STOPPED;
+ } else { /*value == 3 */
+
+ if (dummy->slots[slot_idx].job_disabled != 0) {
+ pr_debug("enabling slot after HARD_STOP"
+ );
+ dummy->slots[slot_idx].job_disabled = 0;
+ }
+ hw_error_status.errors_mask =
+ KBASE_JOB_TERMINATED;
+ }
+ }
+ } else if (addr == JOB_CONTROL_REG(JOB_IRQ_CLEAR)) {
+ int i;
+
+ for (i = 0; i < NUM_SLOTS; i++) {
+ if (value & ((1 << i) | (1 << (i + 16))))
+ dummy->slots[i].job_complete_irq_asserted = 0;
+ /* hw_error_status.js_status[i] is cleared in
+ * update_job_irq_js_state
+ */
+ }
+ pr_debug("%s", "job irq cleared");
+ update_job_irq_js_state(dummy, value);
+ /*remove error condition for JOB */
+ hw_error_status.job_irq_rawstat &= ~(value);
+ hw_error_status.job_irq_status &= ~(value);
+ } else if (addr == JOB_CONTROL_REG(JOB_IRQ_MASK)) {
+ int i;
+
+ for (i = 0; i < NUM_SLOTS; i++)
+ dummy->slots[i].job_irq_mask = (value >> i) & 0x01;
+ pr_debug("job irq mask to value %x", value);
+ } else if (addr == GPU_CONTROL_REG(GPU_IRQ_MASK)) {
+#else /* !MALI_USE_CSF */
+ if (addr == JOB_CONTROL_REG(JOB_IRQ_CLEAR)) {
+ pr_debug("%s", "job irq cleared");
+
+ hw_error_status.job_irq_rawstat &= ~(value);
+ hw_error_status.job_irq_status &= ~(value);
+ } else if (addr == JOB_CONTROL_REG(JOB_IRQ_MASK)) {
+ /* ignore JOB_IRQ_MASK as it is handled by CSFFW */
+ } else if (addr == GPU_CONTROL_REG(GPU_IRQ_MASK)) {
+#endif /* !MALI_USE_CSF */
+ pr_debug("GPU_IRQ_MASK set to 0x%x", value);
+ dummy->reset_completed_mask = (value >> 8) & 0x01;
+ dummy->power_changed_mask = (value >> 9) & 0x03;
+ dummy->clean_caches_completed_irq_enabled = (value & (1u << 17)) != 0u;
+ } else if (addr == GPU_CONTROL_REG(COHERENCY_ENABLE)) {
+ dummy->coherency_enable = value;
+ } else if (addr == GPU_CONTROL_REG(GPU_IRQ_CLEAR)) {
+ if (value & (1 << 8)) {
+ pr_debug("%s", "gpu RESET_COMPLETED irq cleared");
+ dummy->reset_completed = 0;
+ }
+ if (value & (3 << 9))
+ dummy->power_changed = 0;
+
+ if (value & (1 << 17))
+ dummy->clean_caches_completed = false;
+ if (value & (1 << 16))
+ dummy->prfcnt_sample_completed = 0;
+
+ /*update error status */
+ hw_error_status.gpu_error_irq &= ~(value);
+ } else if (addr == GPU_CONTROL_REG(GPU_COMMAND)) {
+ switch (value) {
+ case GPU_COMMAND_SOFT_RESET:
+ case GPU_COMMAND_HARD_RESET:
+ pr_debug("gpu reset (%d) requested", value);
+ /* no more fault status */
+ hw_error_status.gpu_fault_status = 0;
+ /* completed reset instantly */
+ dummy->reset_completed = 1;
+ break;
+#if MALI_USE_CSF
+ case GPU_COMMAND_CACHE_CLN_INV_L2:
+ case GPU_COMMAND_CACHE_CLN_INV_L2_LSC:
+ case GPU_COMMAND_CACHE_CLN_INV_FULL:
+#else
+ case GPU_COMMAND_CLEAN_CACHES:
+ case GPU_COMMAND_CLEAN_INV_CACHES:
+#endif
+ pr_debug("clean caches requested");
+ dummy->clean_caches_completed = true;
+ break;
+ case GPU_COMMAND_PRFCNT_SAMPLE:
+ midgard_model_dump_prfcnt();
+ dummy->prfcnt_sample_completed = 1;
+ break;
+ default:
+ break;
+ }
+ } else if (addr == GPU_CONTROL_REG(L2_CONFIG)) {
+ dummy->l2_config = value;
+ }
+#if MALI_USE_CSF
+ else if (addr >= GPU_CONTROL_REG(CSF_HW_DOORBELL_PAGE_OFFSET) &&
+ addr < GPU_CONTROL_REG(CSF_HW_DOORBELL_PAGE_OFFSET +
+ (CSF_NUM_DOORBELL * CSF_HW_DOORBELL_PAGE_SIZE))) {
+ if (addr == GPU_CONTROL_REG(CSF_HW_DOORBELL_PAGE_OFFSET))
+ hw_error_status.job_irq_status = JOB_IRQ_GLOBAL_IF;
+ } else if (addr == IPA_CONTROL_REG(COMMAND)) {
+ pr_debug("Received IPA_CONTROL command");
+ } else if (addr == IPA_CONTROL_REG(TIMER)) {
+ ipa_control_timer_enabled = value ? true : false;
+ } else if ((addr >= IPA_CONTROL_REG(SELECT_CSHW_LO)) &&
+ (addr <= IPA_CONTROL_REG(SELECT_SHADER_HI))) {
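+ /* Each core type has a single 64-bit SELECT register exposed as a
+ * LO/HI pair 8 bytes apart, so the byte offset from SELECT_CSHW_LO
+ * divided by 8 yields the core type and the low 3 bits of the offset
+ * tell a LO access from a HI access.
+ */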
+ enum kbase_ipa_core_type core_type = (enum kbase_ipa_core_type)(
+ (addr - IPA_CONTROL_REG(SELECT_CSHW_LO)) >> 3);
+ bool is_low_word =
+ !((addr - IPA_CONTROL_REG(SELECT_CSHW_LO)) & 7);
+
+ if (is_low_word) {
+ ipa_ctl_select_config[core_type] &= ~(u64)U32_MAX;
+ ipa_ctl_select_config[core_type] |= value;
+ } else {
+ ipa_ctl_select_config[core_type] &= U32_MAX;
+ ipa_ctl_select_config[core_type] |= ((u64)value << 32);
+ }
+ }
+#endif
+ else if (addr == MMU_REG(MMU_IRQ_MASK)) {
+ hw_error_status.mmu_irq_mask = value;
+ } else if (addr == MMU_REG(MMU_IRQ_CLEAR)) {
+ hw_error_status.mmu_irq_rawstat &= (~value);
+ } else if ((addr >= MMU_AS_REG(0, AS_TRANSTAB_LO)) &&
+ (addr <= MMU_AS_REG(15, AS_STATUS))) {
+ int mem_addr_space = (addr - MMU_AS_REG(0, AS_TRANSTAB_LO))
+ >> 6;
+
+ switch (addr & 0x3F) {
+ case AS_COMMAND:
+ switch (value) {
+ case AS_COMMAND_NOP:
+ hw_error_status.as_command[mem_addr_space] =
+ value;
+ break;
+
+ case AS_COMMAND_UPDATE:
+ hw_error_status.as_command[mem_addr_space] =
+ value;
+ if ((hw_error_status.as_faultstatus[
+ mem_addr_space])
+ && ((hw_error_status.as_transtab[
+ mem_addr_space] & 0x3) != 0)) {
+ model_error_log(KBASE_CORE,
+ "\n ERROR: AS_COMMAND issued UPDATE on error condition before AS_TRANSTAB been set to unmapped\n"
+ );
+ } else if ((hw_error_status.as_faultstatus[
+ mem_addr_space])
+ && ((hw_error_status.as_transtab[
+ mem_addr_space] & 0x3) == 0)) {
+
+ /*invalidate all active jobs */
+ invalidate_active_jobs(dummy);
+ /* error handled */
+ hw_error_status.as_faultstatus[
+ mem_addr_space] = 0;
+ }
+ break;
+
+ case AS_COMMAND_LOCK:
+ case AS_COMMAND_UNLOCK:
+ hw_error_status.as_command[mem_addr_space] =
+ value;
+ break;
+
+ case AS_COMMAND_FLUSH_PT:
+ case AS_COMMAND_FLUSH_MEM:
+ if (hw_error_status.as_command[mem_addr_space]
+ != AS_COMMAND_LOCK)
+ model_error_log(KBASE_CORE,
+ "\n ERROR: AS_COMMAND issued FLUSH without LOCKING before\n"
+ );
+ else /* error handled if any */
+ hw_error_status.as_faultstatus[
+ mem_addr_space] = 0;
+ hw_error_status.as_command[mem_addr_space] =
+ value;
+ break;
+
+ default:
+ model_error_log(KBASE_CORE,
+ "\n WARNING: UNRECOGNIZED AS_COMMAND 0x%x\n",
+ value);
+ break;
+ }
+ break;
+
+ case AS_TRANSTAB_LO:
+ hw_error_status.as_transtab[mem_addr_space] &=
+ ~((u64) (0xffffffff));
+ hw_error_status.as_transtab[mem_addr_space] |=
+ (u64) value;
+ break;
+
+ case AS_TRANSTAB_HI:
+ hw_error_status.as_transtab[mem_addr_space] &=
+ (u64) 0xffffffff;
+ hw_error_status.as_transtab[mem_addr_space] |=
+ ((u64) value) << 32;
+ break;
+
+ case AS_LOCKADDR_LO:
+ case AS_LOCKADDR_HI:
+ case AS_MEMATTR_LO:
+ case AS_MEMATTR_HI:
+ case AS_TRANSCFG_LO:
+ case AS_TRANSCFG_HI:
+ /* Writes ignored */
+ break;
+
+ default:
+ model_error_log(KBASE_CORE,
+ "Dummy model register access: Writing unsupported MMU #%d register 0x%x value 0x%x\n",
+ mem_addr_space, addr, value);
+ break;
+ }
+ } else if (addr >= GPU_CONTROL_REG(PRFCNT_BASE_LO) &&
+ addr <= GPU_CONTROL_REG(PRFCNT_MMU_L2_EN)) {
+ switch (addr) {
+ case PRFCNT_BASE_LO:
+ performance_counters.prfcnt_base |= value;
+ break;
+ case PRFCNT_BASE_HI:
+ performance_counters.prfcnt_base |= ((u64) value) << 32;
+ break;
+#if !MALI_USE_CSF
+ case PRFCNT_JM_EN:
+ performance_counters.prfcnt_en.jm = value;
+ break;
+#endif /* !MALI_USE_CSF */
+ case PRFCNT_SHADER_EN:
+ performance_counters.prfcnt_en.shader = value;
+ break;
+ case PRFCNT_TILER_EN:
+ performance_counters.prfcnt_en.tiler = value;
+ break;
+ case PRFCNT_MMU_L2_EN:
+ performance_counters.prfcnt_en.l2 = value;
+ break;
+ }
+ } else {
+ switch (addr) {
+ case TILER_PWRON_LO:
+ dummy->power_on |= (value & 1) << 1;
+ /* Also ensure L2 is powered on */
+ dummy->power_on |= value & 1;
+ dummy->power_changed = 1;
+ break;
+ case SHADER_PWRON_LO:
+ dummy->power_on |= (value & 0xF) << 2;
+ dummy->power_changed = 1;
+ break;
+ case L2_PWRON_LO:
+ dummy->power_on |= value & 1;
+ dummy->power_changed = 1;
+ break;
+ case STACK_PWRON_LO:
+ dummy->stack_power_on_lo |= value;
+ dummy->power_changed = 1;
+ break;
+ case TILER_PWROFF_LO:
+ dummy->power_on &= ~((value & 1) << 1);
+ dummy->power_changed = 1;
+ break;
+ case SHADER_PWROFF_LO:
+ dummy->power_on &= ~((value & 0xF) << 2);
+ dummy->power_changed = 1;
+ break;
+ case L2_PWROFF_LO:
+ dummy->power_on &= ~(value & 1);
+ /* Also ensure tiler is powered off */
+ dummy->power_on &= ~((value & 1) << 1);
+ dummy->power_changed = 1;
+ break;
+ case STACK_PWROFF_LO:
+ dummy->stack_power_on_lo &= ~value;
+ dummy->power_changed = 1;
+ break;
+
+ case TILER_PWROFF_HI:
+ case SHADER_PWROFF_HI:
+ case L2_PWROFF_HI:
+ case PWR_KEY:
+ case PWR_OVERRIDE0:
+#if !MALI_USE_CSF
+ case JM_CONFIG:
+#else /* !MALI_USE_CSF */
+ case CSF_CONFIG:
+#endif /* !MALI_USE_CSF */
+ case SHADER_CONFIG:
+ case TILER_CONFIG:
+ case L2_MMU_CONFIG:
+ /* Writes ignored */
+ break;
+ default:
+ model_error_log(KBASE_CORE,
+ "Dummy model register access: Writing unsupported register 0x%x value 0x%x\n",
+ addr, value);
+ break;
+ }
+ }
+
+ midgard_model_update(dummy);
+ midgard_model_get_outputs(dummy);
+
+ return 1;
+}
+
+u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value)
+{
+ struct dummy_model_t *dummy = (struct dummy_model_t *)h;
+ *value = 0; /* 0 by default */
+#if !MALI_USE_CSF
+ if (addr == JOB_CONTROL_REG(JOB_IRQ_JS_STATE)) {
+ pr_debug("%s", "JS_ACTIVE being read");
+
+ *value = dummy->job_irq_js_state;
+ } else if (addr == GPU_CONTROL_REG(GPU_ID)) {
+#else /* !MALI_USE_CSF */
+ if (addr == GPU_CONTROL_REG(GPU_ID)) {
+#endif /* !MALI_USE_CSF */
+
+ *value = dummy->control_reg_values->gpu_id;
+ } else if (addr == JOB_CONTROL_REG(JOB_IRQ_RAWSTAT)) {
+ *value = hw_error_status.job_irq_rawstat;
+ pr_debug("%s", "JS_IRQ_RAWSTAT being read");
+ } else if (addr == JOB_CONTROL_REG(JOB_IRQ_STATUS)) {
+ *value = hw_error_status.job_irq_status;
+ pr_debug("JS_IRQ_STATUS being read %x", *value);
+ }
+#if !MALI_USE_CSF
+ else if (addr == JOB_CONTROL_REG(JOB_IRQ_MASK)) {
+ int i;
+
+ *value = 0;
+ for (i = 0; i < NUM_SLOTS; i++)
+ *value |= dummy->slots[i].job_irq_mask << i;
+ pr_debug("JS_IRQ_MASK being read %x", *value);
+ }
+#else /* !MALI_USE_CSF */
+ else if (addr == JOB_CONTROL_REG(JOB_IRQ_MASK)) {
+ /* ignore JOB_IRQ_MASK as it is handled by CSFFW */
+ }
+#endif /* !MALI_USE_CSF */
+ else if (addr == GPU_CONTROL_REG(GPU_IRQ_MASK)) {
+ *value = (dummy->reset_completed_mask << 8) |
+ (dummy->power_changed_mask << 9) | (1 << 7) | 1;
+ pr_debug("GPU_IRQ_MASK read %x", *value);
+ } else if (addr == GPU_CONTROL_REG(GPU_IRQ_RAWSTAT)) {
+ *value = (dummy->power_changed << 9) | (dummy->power_changed << 10) |
+ (dummy->reset_completed << 8) |
+ ((dummy->clean_caches_completed ? 1u : 0u) << 17) |
+ (dummy->prfcnt_sample_completed << 16) | hw_error_status.gpu_error_irq;
+ pr_debug("GPU_IRQ_RAWSTAT read %x", *value);
+ } else if (addr == GPU_CONTROL_REG(GPU_IRQ_STATUS)) {
+ *value = ((dummy->power_changed && (dummy->power_changed_mask & 0x1)) << 9) |
+ ((dummy->power_changed && (dummy->power_changed_mask & 0x2)) << 10) |
+ ((dummy->reset_completed & dummy->reset_completed_mask) << 8) |
+ (((dummy->clean_caches_completed &&
+ dummy->clean_caches_completed_irq_enabled) ?
+ 1u :
+ 0u)
+ << 17) |
+ (dummy->prfcnt_sample_completed << 16) | hw_error_status.gpu_error_irq;
+ pr_debug("GPU_IRQ_STAT read %x", *value);
+ } else if (addr == GPU_CONTROL_REG(GPU_STATUS)) {
+ *value = 0;
+#if !MALI_USE_CSF
+ } else if (addr == GPU_CONTROL_REG(LATEST_FLUSH)) {
+ *value = 0;
+#endif
+ } else if (addr == GPU_CONTROL_REG(GPU_FAULTSTATUS)) {
+ *value = hw_error_status.gpu_fault_status;
+ } else if (addr == GPU_CONTROL_REG(L2_CONFIG)) {
+ *value = dummy->l2_config;
+ } else if ((addr >= GPU_CONTROL_REG(SHADER_PRESENT_LO)) &&
+ (addr <= GPU_CONTROL_REG(L2_MMU_CONFIG))) {
+ switch (addr) {
+ case GPU_CONTROL_REG(SHADER_PRESENT_LO):
+ case GPU_CONTROL_REG(SHADER_PRESENT_HI):
+ case GPU_CONTROL_REG(TILER_PRESENT_LO):
+ case GPU_CONTROL_REG(TILER_PRESENT_HI):
+ case GPU_CONTROL_REG(L2_PRESENT_LO):
+ case GPU_CONTROL_REG(L2_PRESENT_HI):
+ case GPU_CONTROL_REG(STACK_PRESENT_LO):
+ case GPU_CONTROL_REG(STACK_PRESENT_HI):
+ *value = get_implementation_register(addr);
+ break;
+ case GPU_CONTROL_REG(SHADER_READY_LO):
+ *value = (dummy->power_on >> 0x02) &
+ get_implementation_register(
+ GPU_CONTROL_REG(SHADER_PRESENT_LO));
+ break;
+ case GPU_CONTROL_REG(TILER_READY_LO):
+ *value = (dummy->power_on >> 0x01) &
+ get_implementation_register(
+ GPU_CONTROL_REG(TILER_PRESENT_LO));
+ break;
+ case GPU_CONTROL_REG(L2_READY_LO):
+ *value = dummy->power_on &
+ get_implementation_register(
+ GPU_CONTROL_REG(L2_PRESENT_LO));
+ break;
+ case GPU_CONTROL_REG(STACK_READY_LO):
+ *value = dummy->stack_power_on_lo &
+ get_implementation_register(
+ GPU_CONTROL_REG(STACK_PRESENT_LO));
+ break;
+
+ case GPU_CONTROL_REG(SHADER_READY_HI):
+ case GPU_CONTROL_REG(TILER_READY_HI):
+ case GPU_CONTROL_REG(L2_READY_HI):
+ case GPU_CONTROL_REG(STACK_READY_HI):
+ *value = 0;
+ break;
+
+ case GPU_CONTROL_REG(SHADER_PWRTRANS_LO):
+ case GPU_CONTROL_REG(SHADER_PWRTRANS_HI):
+ case GPU_CONTROL_REG(TILER_PWRTRANS_LO):
+ case GPU_CONTROL_REG(TILER_PWRTRANS_HI):
+ case GPU_CONTROL_REG(L2_PWRTRANS_LO):
+ case GPU_CONTROL_REG(L2_PWRTRANS_HI):
+ case GPU_CONTROL_REG(STACK_PWRTRANS_LO):
+ case GPU_CONTROL_REG(STACK_PWRTRANS_HI):
+ *value = 0;
+ break;
+
+ case GPU_CONTROL_REG(SHADER_PWRACTIVE_LO):
+ case GPU_CONTROL_REG(SHADER_PWRACTIVE_HI):
+ case GPU_CONTROL_REG(TILER_PWRACTIVE_LO):
+ case GPU_CONTROL_REG(TILER_PWRACTIVE_HI):
+ case GPU_CONTROL_REG(L2_PWRACTIVE_LO):
+ case GPU_CONTROL_REG(L2_PWRACTIVE_HI):
+ *value = 0;
+ break;
+
+#if !MALI_USE_CSF
+ case GPU_CONTROL_REG(JM_CONFIG):
+#else /* !MALI_USE_CSF */
+ case GPU_CONTROL_REG(CSF_CONFIG):
+#endif /* !MALI_USE_CSF */
+
+ case GPU_CONTROL_REG(SHADER_CONFIG):
+ case GPU_CONTROL_REG(TILER_CONFIG):
+ case GPU_CONTROL_REG(L2_MMU_CONFIG):
+ *value = 0;
+ break;
+
+ case GPU_CONTROL_REG(COHERENCY_FEATURES):
+ *value = BIT(0) | BIT(1); /* ace_lite and ace, respectively. */
+ break;
+ case GPU_CONTROL_REG(COHERENCY_ENABLE):
+ *value = dummy->coherency_enable;
+ break;
+
+ case GPU_CONTROL_REG(THREAD_TLS_ALLOC):
+ *value = 0;
+ break;
+
+ default:
+ model_error_log(KBASE_CORE,
+ "Dummy model register access: Reading unknown control reg 0x%x\n",
+ addr);
+ break;
+ }
+#if !MALI_USE_CSF
+ } else if ((addr >= JOB_CONTROL_REG(JOB_SLOT0)) &&
+ (addr < (JOB_CONTROL_REG(JOB_SLOT15) + 0x80))) {
+ int slot_idx = (addr >> 7) & 0xf;
+ int sub_reg = addr & 0x7F;
+
+ KBASE_DEBUG_ASSERT(slot_idx < NUM_SLOTS);
+ switch (sub_reg) {
+ case JS_HEAD_NEXT_LO:
+ *value = (u32) ((hw_error_status.current_jc) &
+ 0xFFFFFFFF);
+ break;
+ case JS_HEAD_NEXT_HI:
+ *value = (u32) (hw_error_status.current_jc >> 32);
+ break;
+ case JS_STATUS:
+ if (hw_error_status.js_status[slot_idx])
+ *value = hw_error_status.js_status[slot_idx];
+ else /* 0x08 means active, 0x00 idle */
+ *value = (dummy->slots[slot_idx].job_active)
+ << 3;
+ break;
+ case JS_COMMAND_NEXT:
+ *value = dummy->slots[slot_idx].job_queued;
+ break;
+
+ /* The dummy model does not implement these registers;
+ * avoid printing error messages
+ */
+ case JS_HEAD_HI:
+ case JS_HEAD_LO:
+ case JS_TAIL_HI:
+ case JS_TAIL_LO:
+ case JS_FLUSH_ID_NEXT:
+ break;
+
+ default:
+ model_error_log(KBASE_CORE,
+ "Dummy model register access: unknown job slot reg 0x%02X being read\n",
+ sub_reg);
+ break;
+ }
+#endif /* !MALI_USE_CSF */
+ } else if (addr == GPU_CONTROL_REG(AS_PRESENT)) {
+ *value = dummy->control_reg_values->as_present;
+#if !MALI_USE_CSF
+ } else if (addr == GPU_CONTROL_REG(JS_PRESENT)) {
+ *value = 0x7;
+#endif /* !MALI_USE_CSF */
+ } else if (addr >= GPU_CONTROL_REG(TEXTURE_FEATURES_0) &&
+ addr <= GPU_CONTROL_REG(TEXTURE_FEATURES_3)) {
+ switch (addr) {
+ case GPU_CONTROL_REG(TEXTURE_FEATURES_0):
+ *value = 0xfffff;
+ break;
+
+ case GPU_CONTROL_REG(TEXTURE_FEATURES_1):
+ *value = 0xffff;
+ break;
+
+ case GPU_CONTROL_REG(TEXTURE_FEATURES_2):
+ *value = 0x9f81ffff;
+ break;
+
+ case GPU_CONTROL_REG(TEXTURE_FEATURES_3):
+ *value = 0;
+ break;
+ }
+#if !MALI_USE_CSF
+ } else if (addr >= GPU_CONTROL_REG(JS0_FEATURES) &&
+ addr <= GPU_CONTROL_REG(JS15_FEATURES)) {
+ switch (addr) {
+ case GPU_CONTROL_REG(JS0_FEATURES):
+ *value = 0x20e;
+ break;
+
+ case GPU_CONTROL_REG(JS1_FEATURES):
+ *value = 0x1fe;
+ break;
+
+ case GPU_CONTROL_REG(JS2_FEATURES):
+ *value = 0x7e;
+ break;
+
+ default:
+ *value = 0;
+ break;
+ }
+#endif /* !MALI_USE_CSF */
+ } else if (addr >= GPU_CONTROL_REG(L2_FEATURES)
+ && addr <= GPU_CONTROL_REG(MMU_FEATURES)) {
+ switch (addr) {
+ case GPU_CONTROL_REG(L2_FEATURES):
+ *value = 0x6100206;
+ break;
+
+ case GPU_CONTROL_REG(CORE_FEATURES):
+ *value = dummy->control_reg_values->core_features;
+ break;
+
+ case GPU_CONTROL_REG(TILER_FEATURES):
+ *value = dummy->control_reg_values->tiler_features;
+ break;
+
+ case GPU_CONTROL_REG(MEM_FEATURES):
+ /* Bit 0: Core group is coherent */
+ *value = 0x01;
+ /* Bits 11:8: L2 slice count - 1 */
+ *value |= (hweight64(DUMMY_IMPLEMENTATION_L2_PRESENT) - 1) << 8;
+ break;
+
+ case GPU_CONTROL_REG(MMU_FEATURES):
+ *value = dummy->control_reg_values->mmu_features;
+ break;
+ }
+ } else if (addr >= GPU_CONTROL_REG(THREAD_MAX_THREADS)
+ && addr <= GPU_CONTROL_REG(THREAD_FEATURES)) {
+ switch (addr) {
+ case GPU_CONTROL_REG(THREAD_FEATURES):
+ *value = dummy->control_reg_values->thread_features
+ | (IMPLEMENTATION_MODEL << 30);
+ break;
+ case GPU_CONTROL_REG(THREAD_MAX_BARRIER_SIZE):
+ *value = dummy->control_reg_values->thread_max_barrier_size;
+ break;
+ case GPU_CONTROL_REG(THREAD_MAX_WORKGROUP_SIZE):
+ *value = dummy->control_reg_values->thread_max_workgroup_size;
+ break;
+ case GPU_CONTROL_REG(THREAD_MAX_THREADS):
+ *value = dummy->control_reg_values->thread_max_threads;
+ break;
+ }
+ } else if (addr >= GPU_CONTROL_REG(CYCLE_COUNT_LO)
+ && addr <= GPU_CONTROL_REG(TIMESTAMP_HI)) {
+ *value = 0;
+ } else if (addr >= MMU_AS_REG(0, AS_TRANSTAB_LO)
+ && addr <= MMU_AS_REG(15, AS_STATUS)) {
+ int mem_addr_space = (addr - MMU_AS_REG(0, AS_TRANSTAB_LO))
+ >> 6;
+
+ switch (addr & 0x3F) {
+ case AS_TRANSTAB_LO:
+ *value = (u32)
+ (hw_error_status.as_transtab[mem_addr_space] &
+ 0xffffffff);
+ break;
+
+ case AS_TRANSTAB_HI:
+ *value = (u32)
+ (hw_error_status.as_transtab[mem_addr_space] >>
+ 32);
+ break;
+
+ case AS_STATUS:
+ *value = 0;
+ break;
+
+ case AS_FAULTSTATUS:
+ if (mem_addr_space == hw_error_status.faulty_mmu_as)
+ *value = hw_error_status.as_faultstatus[
+ hw_error_status.faulty_mmu_as];
+ else
+ *value = 0;
+ break;
+
+ case AS_LOCKADDR_LO:
+ case AS_LOCKADDR_HI:
+ case AS_MEMATTR_LO:
+ case AS_MEMATTR_HI:
+ case AS_TRANSCFG_LO:
+ case AS_TRANSCFG_HI:
+ /* Read ignored */
+ *value = 0;
+ break;
+
+ default:
+ model_error_log(KBASE_CORE,
+ "Dummy model register access: Reading unsupported MMU #%d register 0x%x. Returning 0\n",
+ mem_addr_space, addr);
+ *value = 0;
+ break;
+ }
+ } else if (addr == MMU_REG(MMU_IRQ_MASK)) {
+ *value = hw_error_status.mmu_irq_mask;
+ } else if (addr == MMU_REG(MMU_IRQ_RAWSTAT)) {
+ *value = hw_error_status.mmu_irq_rawstat;
+ } else if (addr == MMU_REG(MMU_IRQ_STATUS)) {
+ *value = hw_error_status.mmu_irq_mask &
+ hw_error_status.mmu_irq_rawstat;
+ }
+#if MALI_USE_CSF
+ else if (addr == IPA_CONTROL_REG(STATUS)) {
+ *value = (ipa_control_timer_enabled << 31);
+ } else if ((addr >= IPA_CONTROL_REG(VALUE_CSHW_REG_LO(0))) &&
+ (addr <= IPA_CONTROL_REG(VALUE_CSHW_REG_HI(
+ IPA_CTL_MAX_VAL_CNT_IDX)))) {
+ u32 counter_index =
+ (addr - IPA_CONTROL_REG(VALUE_CSHW_REG_LO(0))) >> 3;
+ bool is_low_word =
+ !((addr - IPA_CONTROL_REG(VALUE_CSHW_REG_LO(0))) & 7);
+
+ *value = gpu_model_get_prfcnt_value(KBASE_IPA_CORE_TYPE_CSHW,
+ counter_index, is_low_word);
+ } else if ((addr >= IPA_CONTROL_REG(VALUE_MEMSYS_REG_LO(0))) &&
+ (addr <= IPA_CONTROL_REG(VALUE_MEMSYS_REG_HI(
+ IPA_CTL_MAX_VAL_CNT_IDX)))) {
+ u32 counter_index =
+ (addr - IPA_CONTROL_REG(VALUE_MEMSYS_REG_LO(0))) >> 3;
+ bool is_low_word =
+ !((addr - IPA_CONTROL_REG(VALUE_MEMSYS_REG_LO(0))) & 7);
+
+ *value = gpu_model_get_prfcnt_value(KBASE_IPA_CORE_TYPE_MEMSYS,
+ counter_index, is_low_word);
+ } else if ((addr >= IPA_CONTROL_REG(VALUE_TILER_REG_LO(0))) &&
+ (addr <= IPA_CONTROL_REG(VALUE_TILER_REG_HI(
+ IPA_CTL_MAX_VAL_CNT_IDX)))) {
+ u32 counter_index =
+ (addr - IPA_CONTROL_REG(VALUE_TILER_REG_LO(0))) >> 3;
+ bool is_low_word =
+ !((addr - IPA_CONTROL_REG(VALUE_TILER_REG_LO(0))) & 7);
+
+ *value = gpu_model_get_prfcnt_value(KBASE_IPA_CORE_TYPE_TILER,
+ counter_index, is_low_word);
+ } else if ((addr >= IPA_CONTROL_REG(VALUE_SHADER_REG_LO(0))) &&
+ (addr <= IPA_CONTROL_REG(VALUE_SHADER_REG_HI(
+ IPA_CTL_MAX_VAL_CNT_IDX)))) {
+ u32 counter_index =
+ (addr - IPA_CONTROL_REG(VALUE_SHADER_REG_LO(0))) >> 3;
+ bool is_low_word =
+ !((addr - IPA_CONTROL_REG(VALUE_SHADER_REG_LO(0))) & 7);
+
+ *value = gpu_model_get_prfcnt_value(KBASE_IPA_CORE_TYPE_SHADER,
+ counter_index, is_low_word);
+ }
+#endif
+ else if (addr == GPU_CONTROL_REG(GPU_FEATURES_LO)) {
+ *value = dummy->control_reg_values->gpu_features_lo;
+ } else if (addr == GPU_CONTROL_REG(GPU_FEATURES_HI)) {
+ *value = dummy->control_reg_values->gpu_features_hi;
+ } else {
+ model_error_log(KBASE_CORE,
+ "Dummy model register access: Reading unsupported register 0x%x. Returning 0\n",
+ addr);
+ *value = 0;
+ }
+
+ CSTD_UNUSED(dummy);
+
+ return 1;
+}
+
+static u32 set_user_sample_core_type(u64 *counters,
+ u32 *usr_data_start, u32 usr_data_offset,
+ u32 usr_data_size, u32 core_count)
+{
+ u32 sample_size;
+ u32 *usr_data = NULL;
+
+ sample_size =
+ core_count * KBASE_DUMMY_MODEL_COUNTER_PER_CORE * sizeof(u32);
+
+ if ((usr_data_size >= usr_data_offset) &&
+ (sample_size <= usr_data_size - usr_data_offset))
+ usr_data = usr_data_start + (usr_data_offset / sizeof(u32));
+
+ if (!usr_data)
+ model_error_log(KBASE_CORE, "Unable to set counter sample 1");
+ else {
+ u32 loop_cnt = core_count * KBASE_DUMMY_MODEL_COUNTER_PER_CORE;
+ u32 i;
+
+ for (i = 0; i < loop_cnt; i++) {
+ if (copy_from_user(&counters[i], &usr_data[i],
+ sizeof(u32))) {
+ model_error_log(KBASE_CORE, "Unable to set counter sample 2");
+ break;
+ }
+ }
+ }
+
+ return usr_data_offset + sample_size;
+}
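+
+/* The returned offset is fed back into the next call so that successive core
+ * types are unpacked from one contiguous user buffer (see
+ * gpu_model_set_dummy_prfcnt_sample() below).
+ */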
+
+static u32 set_kernel_sample_core_type(u64 *counters,
+ u64 *usr_data_start, u32 usr_data_offset,
+ u32 usr_data_size, u32 core_count)
+{
+ u32 sample_size;
+ u64 *usr_data = NULL;
+
+ sample_size =
+ core_count * KBASE_DUMMY_MODEL_COUNTER_PER_CORE * sizeof(u64);
+
+ if ((usr_data_size >= usr_data_offset) &&
+ (sample_size <= usr_data_size - usr_data_offset))
+ usr_data = usr_data_start + (usr_data_offset / sizeof(u64));
+
+ if (!usr_data)
+ model_error_log(KBASE_CORE, "Unable to set kernel counter sample 1");
+ else
+ memcpy(counters, usr_data, sample_size);
+
+ return usr_data_offset + sample_size;
+}
+
+/* Counter values injected through the ioctl are 32 bits wide */
+void gpu_model_set_dummy_prfcnt_sample(u32 *usr_data, u32 usr_data_size)
+{
+ u32 offset = 0;
+
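+ /* Blocks are consumed in dump-buffer order: front-end (JM or CSHW),
+ * then tiler, memory system (L2) and shader cores. The kernel-sample
+ * variant below uses the same order.
+ */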
+#if !MALI_USE_CSF
+ offset = set_user_sample_core_type(performance_counters.jm_counters,
+ usr_data, offset, usr_data_size, 1);
+#else
+ offset = set_user_sample_core_type(performance_counters.cshw_counters,
+ usr_data, offset, usr_data_size, 1);
+#endif /* !MALI_USE_CSF */
+ offset = set_user_sample_core_type(performance_counters.tiler_counters,
+ usr_data, offset, usr_data_size,
+ hweight64(DUMMY_IMPLEMENTATION_TILER_PRESENT));
+ offset = set_user_sample_core_type(performance_counters.l2_counters,
+ usr_data, offset, usr_data_size,
+ KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS);
+ offset = set_user_sample_core_type(performance_counters.shader_counters,
+ usr_data, offset, usr_data_size,
+ KBASE_DUMMY_MODEL_MAX_SHADER_CORES);
+}
+
+/* Counter values injected through kutf are 64 bits wide */
+void gpu_model_set_dummy_prfcnt_kernel_sample(u64 *usr_data, u32 usr_data_size)
+{
+ u32 offset = 0;
+
+#if !MALI_USE_CSF
+ offset = set_kernel_sample_core_type(performance_counters.jm_counters,
+ usr_data, offset, usr_data_size, 1);
+#else
+ offset = set_kernel_sample_core_type(performance_counters.cshw_counters,
+ usr_data, offset, usr_data_size, 1);
+#endif /* !MALI_USE_CSF */
+ offset = set_kernel_sample_core_type(performance_counters.tiler_counters,
+ usr_data, offset, usr_data_size,
+ hweight64(DUMMY_IMPLEMENTATION_TILER_PRESENT));
+ offset = set_kernel_sample_core_type(performance_counters.l2_counters,
+ usr_data, offset, usr_data_size,
+ hweight64(performance_counters.l2_present));
+ offset = set_kernel_sample_core_type(performance_counters.shader_counters,
+ usr_data, offset, usr_data_size,
+ hweight64(performance_counters.shader_present));
+}
+KBASE_EXPORT_TEST_API(gpu_model_set_dummy_prfcnt_kernel_sample);
+
+void gpu_model_get_dummy_prfcnt_cores(struct kbase_device *kbdev,
+ u64 *l2_present, u64 *shader_present)
+{
+ if (shader_present)
+ *shader_present = performance_counters.shader_present;
+ if (l2_present)
+ *l2_present = performance_counters.l2_present;
+}
+KBASE_EXPORT_TEST_API(gpu_model_get_dummy_prfcnt_cores);
+
+void gpu_model_set_dummy_prfcnt_cores(struct kbase_device *kbdev,
+ u64 l2_present, u64 shader_present)
+{
+ if (WARN_ON(!l2_present || !shader_present
+ || hweight64(l2_present) > KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS
+ || hweight64(shader_present) > KBASE_DUMMY_MODEL_MAX_SHADER_CORES))
+ return;
+
+ performance_counters.l2_present = l2_present;
+ performance_counters.shader_present = shader_present;
+
+ /* Update the GPU properties used by vinstr to calculate the counter
+ * dump buffer size.
+ */
+ kbdev->gpu_props.props.l2_props.num_l2_slices = hweight64(l2_present);
+ kbdev->gpu_props.props.coherency_info.group[0].core_mask = shader_present;
+ kbdev->gpu_props.curr_config.l2_slices = hweight64(l2_present);
+ kbdev->gpu_props.curr_config.shader_present = shader_present;
+}
+KBASE_EXPORT_TEST_API(gpu_model_set_dummy_prfcnt_cores);
+
+void gpu_model_set_dummy_prfcnt_base_cpu(u32 *base, struct kbase_device *kbdev,
+ struct tagged_addr *pages,
+ size_t page_count)
+{
+ performance_counters.prfcnt_base_cpu = base;
+ performance_counters.kbdev = kbdev;
+ performance_counters.pages = pages;
+ performance_counters.page_count = page_count;
+}
+
+int gpu_model_control(void *model,
+ struct kbase_model_control_params *params)
+{
+ struct dummy_model_t *dummy = (struct dummy_model_t *)model;
+ int i;
+
+ if (params->command == KBASE_MC_DISABLE_JOBS) {
+ for (i = 0; i < NUM_SLOTS; i++)
+ dummy->slots[i].job_disabled = params->value;
+ } else {
+ return -EINVAL;
+ }
+
+ midgard_model_update(dummy);
+ midgard_model_get_outputs(dummy);
+
+ return 0;
+}
diff --git a/mali_kbase/backend/gpu/mali_kbase_model_dummy.h b/mali_kbase/backend/gpu/mali_kbase_model_dummy.h
new file mode 100644
index 0000000..e092134
--- /dev/null
+++ b/mali_kbase/backend/gpu/mali_kbase_model_dummy.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+/*
+ * Dummy Model interface
+ */
+
+#ifndef _KBASE_MODEL_DUMMY_H_
+#define _KBASE_MODEL_DUMMY_H_
+
+#include <uapi/gpu/arm/midgard/backend/gpu/mali_kbase_model_dummy.h>
+
+#define model_error_log(module, ...) pr_err(__VA_ARGS__)
+
+#define NUM_SLOTS 4 /* number of job slots */
+
+/* Error mask codes */
+/* Each bit of errors_mask is associated with a specific error:
+ * NON-FAULT STATUS CODES: only the following are implemented, since the others
+ * represent normal working statuses
+ */
+#define KBASE_JOB_INTERRUPTED (1<<0)
+#define KBASE_JOB_STOPPED (1<<1)
+#define KBASE_JOB_TERMINATED (1<<2)
+
+/* JOB EXCEPTIONS: */
+#define KBASE_JOB_CONFIG_FAULT (1<<3)
+#define KBASE_JOB_POWER_FAULT (1<<4)
+#define KBASE_JOB_READ_FAULT (1<<5)
+#define KBASE_JOB_WRITE_FAULT (1<<6)
+#define KBASE_JOB_AFFINITY_FAULT (1<<7)
+#define KBASE_JOB_BUS_FAULT (1<<8)
+#define KBASE_INSTR_INVALID_PC (1<<9)
+#define KBASE_INSTR_INVALID_ENC (1<<10)
+#define KBASE_INSTR_TYPE_MISMATCH (1<<11)
+#define KBASE_INSTR_OPERAND_FAULT (1<<12)
+#define KBASE_INSTR_TLS_FAULT (1<<13)
+#define KBASE_INSTR_BARRIER_FAULT (1<<14)
+#define KBASE_INSTR_ALIGN_FAULT (1<<15)
+#define KBASE_DATA_INVALID_FAULT (1<<16)
+#define KBASE_TILE_RANGE_FAULT (1<<17)
+#define KBASE_ADDR_RANGE_FAULT (1<<18)
+#define KBASE_OUT_OF_MEMORY (1<<19)
+#define KBASE_UNKNOWN (1<<20)
+
+/* GPU EXCEPTIONS:*/
+#define KBASE_DELAYED_BUS_FAULT (1<<21)
+#define KBASE_SHAREABILITY_FAULT (1<<22)
+
+/* MMU EXCEPTIONS:*/
+#define KBASE_TRANSLATION_FAULT (1<<23)
+#define KBASE_PERMISSION_FAULT (1<<24)
+#define KBASE_TRANSTAB_BUS_FAULT (1<<25)
+#define KBASE_ACCESS_FLAG (1<<26)
+
+/* generic useful bitmasks */
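+/* (X << 1) - Y, where X and Y are single bits and Y <= X, yields a mask of
+ * every bit from Y up to X inclusive, so each macro below covers the
+ * contiguous bit range of one error class.
+ */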
+#define IS_A_JOB_ERROR ((KBASE_UNKNOWN << 1) - KBASE_JOB_INTERRUPTED)
+#define IS_A_MMU_ERROR ((KBASE_ACCESS_FLAG << 1) - KBASE_TRANSLATION_FAULT)
+#define IS_A_GPU_ERROR (KBASE_DELAYED_BUS_FAULT|KBASE_SHAREABILITY_FAULT)
+
+/* number of possible MMU address spaces */
+#define NUM_MMU_AS 16 /* total number of MMU address spaces as in
+ * MMU_IRQ_RAWSTAT register
+ */
+
+/* Forward declaration */
+struct kbase_device;
+
+/*
+ * The structures and functions below are used to trigger the simulation
+ * of a faulty HW condition for a specific job chain atom
+ */
+
+struct kbase_error_params {
+ u64 jc;
+ u32 errors_mask;
+ u32 mmu_table_level;
+ u16 faulty_mmu_as;
+ u16 padding[3];
+};
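+
+/* Illustrative sketch: to inject, for example, a translation fault on MMU
+ * address space 3 at table level 2 for the atom whose job chain address is
+ * held in jc, one would fill the parameters and pass them to
+ * job_atom_inject_error() (declared below):
+ *
+ *	struct kbase_error_params params = {
+ *		.jc = jc,
+ *		.errors_mask = KBASE_TRANSLATION_FAULT,
+ *		.mmu_table_level = 2,
+ *		.faulty_mmu_as = 3,
+ *	};
+ *	job_atom_inject_error(&params);
+ */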
+
+enum kbase_model_control_command {
+ /* Disable/Enable job completion in the dummy model */
+ KBASE_MC_DISABLE_JOBS
+};
+
+/* struct to control dummy model behavior */
+struct kbase_model_control_params {
+ s32 command;
+ s32 value;
+};
+
+/* struct to track faulty atoms */
+struct kbase_error_atom {
+ struct kbase_error_params params;
+ struct kbase_error_atom *next;
+};
+
+/* struct to track the system error state */
+struct error_status_t {
+ u32 errors_mask;
+ u32 mmu_table_level;
+ int faulty_mmu_as;
+
+ u64 current_jc;
+ int current_job_slot;
+
+ u32 job_irq_rawstat;
+ u32 job_irq_status;
+ u32 js_status[NUM_SLOTS];
+
+ u32 mmu_irq_mask;
+ u32 mmu_irq_rawstat;
+
+ u32 gpu_error_irq;
+ u32 gpu_fault_status;
+
+ u32 as_faultstatus[NUM_MMU_AS];
+ u32 as_command[NUM_MMU_AS];
+ u64 as_transtab[NUM_MMU_AS];
+};
+
+void *midgard_model_create(const void *config);
+void midgard_model_destroy(void *h);
+u8 midgard_model_write_reg(void *h, u32 addr, u32 value);
+u8 midgard_model_read_reg(void *h, u32 addr,
+ u32 * const value);
+void gpu_generate_error(void);
+void midgard_set_error(int job_slot);
+int job_atom_inject_error(struct kbase_error_params *params);
+int gpu_model_control(void *h,
+ struct kbase_model_control_params *params);
+
+void gpu_model_set_dummy_prfcnt_sample(u32 *usr_data, u32 usr_data_size);
+void gpu_model_set_dummy_prfcnt_kernel_sample(u64 *usr_data, u32 usr_data_size);
+void gpu_model_get_dummy_prfcnt_cores(struct kbase_device *kbdev,
+ u64 *l2_present, u64 *shader_present);
+void gpu_model_set_dummy_prfcnt_cores(struct kbase_device *kbdev,
+ u64 l2_present, u64 shader_present);
+void gpu_model_set_dummy_prfcnt_base_cpu(u32 *base, struct kbase_device *kbdev,
+ struct tagged_addr *pages,
+ size_t page_count);
+/* Clear the counter values array maintained by the dummy model */
+void gpu_model_clear_prfcnt_values(void);
+
+enum gpu_dummy_irq {
+ GPU_DUMMY_JOB_IRQ,
+ GPU_DUMMY_GPU_IRQ,
+ GPU_DUMMY_MMU_IRQ
+};
+
+void gpu_device_raise_irq(void *model,
+ enum gpu_dummy_irq irq);
+void gpu_device_set_data(void *model, void *data);
+void *gpu_device_get_data(void *model);
+
+extern struct error_status_t hw_error_status;
+
+#endif
diff --git a/mali_kbase/backend/gpu/mali_kbase_model_error_generator.c b/mali_kbase/backend/gpu/mali_kbase_model_error_generator.c
new file mode 100644
index 0000000..dfa7f62
--- /dev/null
+++ b/mali_kbase/backend/gpu/mali_kbase_model_error_generator.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/*
+ *
+ * (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#include <mali_kbase.h>
+#include <linux/random.h>
+#include "backend/gpu/mali_kbase_model_dummy.h"
+
+/* all the error conditions supported by the model */
+#define TOTAL_FAULTS 27
+/* maximum number of levels in the MMU translation table tree */
+#define MAX_MMU_TABLE_LEVEL 4
+/* worst case scenario is <1 MMU fault + 1 job fault + 2 GPU faults> */
+#define MAX_CONCURRENT_FAULTS 3
+
+static struct kbase_error_atom *error_track_list;
+
+unsigned int rand_seed;
+
+/* The following error probabilities are set quite high in order to stress the driver */
+unsigned int error_probability = 50; /* to be set between 0 and 100 */
+/* probability of having multiple errors, given that there is an error */
+unsigned int multiple_error_probability = 50;
+
+void gpu_generate_error(void)
+{
+ unsigned int errors_num = 0;
+
+ /* is there at least one error? */
+ if ((prandom_u32() % 100) < error_probability) {
+ /* pick up a faulty mmu address space */
+ hw_error_status.faulty_mmu_as = prandom_u32() % NUM_MMU_AS;
+ /* pick up an mmu table level */
+ hw_error_status.mmu_table_level =
+ 1 + (prandom_u32() % MAX_MMU_TABLE_LEVEL);
+ hw_error_status.errors_mask =
+ (u32)(1 << (prandom_u32() % TOTAL_FAULTS));
+
+ /* are there one or more additional errors? */
+ if ((prandom_u32() % 100) < multiple_error_probability) {
+ errors_num = 1 + (prandom_u32() %
+ (MAX_CONCURRENT_FAULTS - 1));
+ while (errors_num-- > 0) {
+ u32 temp_mask;
+
+ temp_mask = (u32)(
+ 1 << (prandom_u32() % TOTAL_FAULTS));
+ /* below we check that no bit of the same error
+ * type is set again in the error mask
+ */
+ if ((temp_mask & IS_A_JOB_ERROR) &&
+ (hw_error_status.errors_mask &
+ IS_A_JOB_ERROR)) {
+ errors_num++;
+ continue;
+ }
+ if ((temp_mask & IS_A_MMU_ERROR) &&
+ (hw_error_status.errors_mask &
+ IS_A_MMU_ERROR)) {
+ errors_num++;
+ continue;
+ }
+ if ((temp_mask & IS_A_GPU_ERROR) &&
+ (hw_error_status.errors_mask &
+ IS_A_GPU_ERROR)) {
+ errors_num++;
+ continue;
+ }
+ /* this error mask is already set */
+ if ((hw_error_status.errors_mask | temp_mask) ==
+ hw_error_status.errors_mask) {
+ errors_num++;
+ continue;
+ }
+ hw_error_status.errors_mask |= temp_mask;
+ }
+ }
+ }
+}
+
+int job_atom_inject_error(struct kbase_error_params *params)
+{
+ struct kbase_error_atom *new_elem;
+
+ KBASE_DEBUG_ASSERT(params);
+
+ new_elem = kzalloc(sizeof(*new_elem), GFP_KERNEL);
+
+ if (!new_elem) {
+ model_error_log(KBASE_CORE,
+ "\njob_atom_inject_error: kzalloc failed for new_elem\n"
+ );
+ return -ENOMEM;
+ }
+ new_elem->params.jc = params->jc;
+ new_elem->params.errors_mask = params->errors_mask;
+ new_elem->params.mmu_table_level = params->mmu_table_level;
+ new_elem->params.faulty_mmu_as = params->faulty_mmu_as;
+
+ /* append the new atom to the circular error list below */
+ if (error_track_list == NULL) { /* no elements yet */
+ error_track_list = new_elem;
+ new_elem->next = error_track_list;
+ } else {
+ struct kbase_error_atom *walker = error_track_list;
+
+ while (walker->next != error_track_list)
+ walker = walker->next;
+
+ new_elem->next = error_track_list;
+ walker->next = new_elem;
+ }
+ return 0;
+}
+
+void midgard_set_error(int job_slot)
+{
+#ifdef CONFIG_MALI_ERROR_INJECT_RANDOM
+ gpu_generate_error();
+#else
+ struct kbase_error_atom *walker, *auxiliar;
+
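+ /* Search the circular list for an injected error whose job chain
+ * address matches the atom currently being run, apply its parameters
+ * to hw_error_status and remove the entry from the list.
+ */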
+ if (error_track_list != NULL) {
+ walker = error_track_list->next;
+ auxiliar = error_track_list;
+ do {
+ if (walker->params.jc == hw_error_status.current_jc) {
+ /* found a faulty atom matching with the
+ * current one
+ */
+ hw_error_status.errors_mask =
+ walker->params.errors_mask;
+ hw_error_status.mmu_table_level =
+ walker->params.mmu_table_level;
+ hw_error_status.faulty_mmu_as =
+ walker->params.faulty_mmu_as;
+ hw_error_status.current_job_slot = job_slot;
+
+ if (walker->next == walker) {
+ /* only one element */
+ kfree(error_track_list);
+ error_track_list = NULL;
+ } else {
+ auxiliar->next = walker->next;
+ if (walker == error_track_list)
+ error_track_list = walker->next;
+
+ kfree(walker);
+ }
+ break;
+ }
+ auxiliar = walker;
+ walker = walker->next;
+ } while (auxiliar->next != error_track_list);
+ }
+#endif /* CONFIG_MALI_ERROR_INJECT_RANDOM */
+}
diff --git a/mali_kbase/backend/gpu/mali_kbase_model_linux.c b/mali_kbase/backend/gpu/mali_kbase_model_linux.c
new file mode 100644
index 0000000..ed5d4ce
--- /dev/null
+++ b/mali_kbase/backend/gpu/mali_kbase_model_linux.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/*
+ *
+ * (C) COPYRIGHT 2010, 2012-2015, 2017-2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+/*
+ * Model interface
+ */
+
+#include <mali_kbase.h>
+#include <gpu/mali_kbase_gpu_regmap.h>
+#include <backend/gpu/mali_kbase_model_dummy.h>
+#include "backend/gpu/mali_kbase_model_linux.h"
+#include "device/mali_kbase_device.h"
+#include "mali_kbase_irq_internal.h"
+
+#include <linux/kthread.h>
+
+struct model_irq_data {
+ struct kbase_device *kbdev;
+ struct work_struct work;
+};
+
+static void serve_job_irq(struct work_struct *work)
+{
+ struct model_irq_data *data = container_of(work, struct model_irq_data,
+ work);
+ struct kbase_device *kbdev = data->kbdev;
+
+ /* Make sure no worker is already serving this IRQ */
+ while (atomic_cmpxchg(&kbdev->serving_job_irq, 1, 0) == 1) {
+ u32 val;
+
+ while ((val = kbase_reg_read(kbdev,
+ JOB_CONTROL_REG(JOB_IRQ_STATUS)))) {
+ unsigned long flags;
+
+ /* Handle the IRQ */
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+#if MALI_USE_CSF
+ kbase_csf_interrupt(kbdev, val);
+#else
+ kbase_job_done(kbdev, val);
+#endif
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ }
+ }
+
+ kmem_cache_free(kbdev->irq_slab, data);
+}
+
+static void serve_gpu_irq(struct work_struct *work)
+{
+ struct model_irq_data *data = container_of(work, struct model_irq_data,
+ work);
+ struct kbase_device *kbdev = data->kbdev;
+
+ /* Make sure no worker is already serving this IRQ */
+ while (atomic_cmpxchg(&kbdev->serving_gpu_irq, 1, 0) == 1) {
+ u32 val;
+
+ while ((val = kbase_reg_read(kbdev,
+ GPU_CONTROL_REG(GPU_IRQ_STATUS)))) {
+ /* Handle the IRQ */
+ kbase_gpu_interrupt(kbdev, val);
+ }
+ }
+
+ kmem_cache_free(kbdev->irq_slab, data);
+}
+
+static void serve_mmu_irq(struct work_struct *work)
+{
+ struct model_irq_data *data = container_of(work, struct model_irq_data,
+ work);
+ struct kbase_device *kbdev = data->kbdev;
+
+ /* Make sure no worker is already serving this IRQ */
+ if (atomic_cmpxchg(&kbdev->serving_mmu_irq, 1, 0) == 1) {
+ u32 val;
+
+ while ((val = kbase_reg_read(kbdev,
+ MMU_REG(MMU_IRQ_STATUS)))) {
+ /* Handle the IRQ */
+ kbase_mmu_interrupt(kbdev, val);
+ }
+ }
+
+ kmem_cache_free(kbdev->irq_slab, data);
+}
+
+void gpu_device_raise_irq(void *model,
+ enum gpu_dummy_irq irq)
+{
+ struct model_irq_data *data;
+ struct kbase_device *kbdev = gpu_device_get_data(model);
+
+ KBASE_DEBUG_ASSERT(kbdev);
+
+ data = kmem_cache_alloc(kbdev->irq_slab, GFP_ATOMIC);
+ if (data == NULL)
+ return;
+
+ data->kbdev = kbdev;
+
+ switch (irq) {
+ case GPU_DUMMY_JOB_IRQ:
+ INIT_WORK(&data->work, serve_job_irq);
+ atomic_set(&kbdev->serving_job_irq, 1);
+ break;
+ case GPU_DUMMY_GPU_IRQ:
+ INIT_WORK(&data->work, serve_gpu_irq);
+ atomic_set(&kbdev->serving_gpu_irq, 1);
+ break;
+ case GPU_DUMMY_MMU_IRQ:
+ INIT_WORK(&data->work, serve_mmu_irq);
+ atomic_set(&kbdev->serving_mmu_irq, 1);
+ break;
+ default:
+ dev_warn(kbdev->dev, "Unknown IRQ");
+ kmem_cache_free(kbdev->irq_slab, data);
+ /* data has been freed, so it must not be queued below */
+ return;
+ }
+ queue_work(kbdev->irq_workq, &data->work);
+}
+
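+/* On this backend, register accesses are routed to the dummy model rather
+ * than to real hardware, serialised by reg_op_lock.
+ */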
+void kbase_reg_write(struct kbase_device *kbdev, u32 offset, u32 value)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&kbdev->reg_op_lock, flags);
+ midgard_model_write_reg(kbdev->model, offset, value);
+ spin_unlock_irqrestore(&kbdev->reg_op_lock, flags);
+}
+
+KBASE_EXPORT_TEST_API(kbase_reg_write);
+
+u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset)
+{
+ unsigned long flags;
+ u32 val;
+
+ spin_lock_irqsave(&kbdev->reg_op_lock, flags);
+ midgard_model_read_reg(kbdev->model, offset, &val);
+ spin_unlock_irqrestore(&kbdev->reg_op_lock, flags);
+
+ return val;
+}
+
+KBASE_EXPORT_TEST_API(kbase_reg_read);
+
+/**
+ * kbase_is_gpu_removed - Has the GPU been removed.
+ * @kbdev: Kbase device pointer
+ *
+ * This function would return true if the GPU has been removed.
+ * It is stubbed here.
+ *
+ * Return: Always false
+ */
+bool kbase_is_gpu_removed(struct kbase_device *kbdev)
+{
+ return false;
+}
+
+int kbase_install_interrupts(struct kbase_device *kbdev)
+{
+ KBASE_DEBUG_ASSERT(kbdev);
+
+ atomic_set(&kbdev->serving_job_irq, 0);
+ atomic_set(&kbdev->serving_gpu_irq, 0);
+ atomic_set(&kbdev->serving_mmu_irq, 0);
+
+ kbdev->irq_workq = alloc_ordered_workqueue("dummy irq queue", 0);
+ if (kbdev->irq_workq == NULL)
+ return -ENOMEM;
+
+ kbdev->irq_slab = kmem_cache_create("dummy_irq_slab",
+ sizeof(struct model_irq_data), 0, 0, NULL);
+ if (kbdev->irq_slab == NULL) {
+ destroy_workqueue(kbdev->irq_workq);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void kbase_release_interrupts(struct kbase_device *kbdev)
+{
+ KBASE_DEBUG_ASSERT(kbdev);
+ destroy_workqueue(kbdev->irq_workq);
+ kmem_cache_destroy(kbdev->irq_slab);
+}
+
+void kbase_synchronize_irqs(struct kbase_device *kbdev)
+{
+ KBASE_DEBUG_ASSERT(kbdev);
+ flush_workqueue(kbdev->irq_workq);
+}
+
+KBASE_EXPORT_TEST_API(kbase_synchronize_irqs);
+
+int kbase_set_custom_irq_handler(struct kbase_device *kbdev,
+ irq_handler_t custom_handler,
+ int irq_type)
+{
+ return 0;
+}
+
+KBASE_EXPORT_TEST_API(kbase_set_custom_irq_handler);
+
+irqreturn_t kbase_gpu_irq_test_handler(int irq, void *data, u32 val)
+{
+ if (!val)
+ return IRQ_NONE;
+
+ return IRQ_HANDLED;
+}
+
+KBASE_EXPORT_TEST_API(kbase_gpu_irq_test_handler);
+
+int kbase_gpu_device_create(struct kbase_device *kbdev)
+{
+ kbdev->model = midgard_model_create(NULL);
+ if (kbdev->model == NULL)
+ return -ENOMEM;
+
+ gpu_device_set_data(kbdev->model, kbdev);
+
+ spin_lock_init(&kbdev->reg_op_lock);
+
+ dev_warn(kbdev->dev, "Using Dummy Model");
+
+ return 0;
+}
+
+void kbase_gpu_device_destroy(struct kbase_device *kbdev)
+{
+ midgard_model_destroy(kbdev->model);
+}
diff --git a/mali_kbase/backend/gpu/mali_kbase_model_linux.h b/mali_kbase/backend/gpu/mali_kbase_model_linux.h
new file mode 100644
index 0000000..dcb2e7c
--- /dev/null
+++ b/mali_kbase/backend/gpu/mali_kbase_model_linux.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+/*
+ * Model interface
+ */
+
+#ifndef _KBASE_MODEL_LINUX_H_
+#define _KBASE_MODEL_LINUX_H_
+
+int kbase_gpu_device_create(struct kbase_device *kbdev);
+void kbase_gpu_device_destroy(struct kbase_device *kbdev);
+
+#endif /* _KBASE_MODEL_LINUX_H_ */
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c
index 5df7f67..8711a6c 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c
@@ -568,11 +568,14 @@ static void kbase_pm_hwcnt_disable_worker(struct work_struct *data)
* when system suspend takes place.
* The function first waits for the @gpu_poweroff_wait_work to complete, which
* could have been enqueued after the last PM reference was released.
+ *
+ * Return: 0 on success, negative value otherwise.
*/
-static void kbase_pm_do_poweroff_sync(struct kbase_device *kbdev)
+static int kbase_pm_do_poweroff_sync(struct kbase_device *kbdev)
{
struct kbase_pm_backend_data *backend = &kbdev->pm.backend;
unsigned long flags;
+ int ret = 0;
WARN_ON(kbdev->pm.active_count);
@@ -581,8 +584,8 @@ static void kbase_pm_do_poweroff_sync(struct kbase_device *kbdev)
kbase_pm_lock(kbdev);
spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
WARN_ON(backend->poweroff_wait_in_progress);
+ WARN_ON(backend->gpu_sleep_mode_active);
if (backend->gpu_powered) {
- int ret;
backend->mcu_desired = false;
backend->l2_desired = false;
@@ -591,17 +594,11 @@ static void kbase_pm_do_poweroff_sync(struct kbase_device *kbdev)
ret = kbase_pm_wait_for_desired_state(kbdev);
if (ret) {
- dev_warn(kbdev->dev, "Wait failed on synchronous power off");
- kbase_pm_unlock(kbdev);
- /* Wait for the completion of reset, triggered due to
- * the previous failure.
- */
- kbase_reset_gpu_wait(kbdev);
- /* Wait again for the poweroff work which could have
- * been enqueued by the GPU reset worker.
- */
- kbase_pm_wait_for_poweroff_work_complete(kbdev);
- kbase_pm_lock(kbdev);
+ dev_warn(
+ kbdev->dev,
+ "Wait for pm state change failed on synchronous power off");
+ ret = -EBUSY;
+ goto out;
}
/* Due to the power policy, GPU could have been kept active
@@ -614,12 +611,19 @@ static void kbase_pm_do_poweroff_sync(struct kbase_device *kbdev)
backend->gpu_idled = true;
}
- kbase_pm_clock_off(kbdev);
+ if (!kbase_pm_clock_off(kbdev)) {
+ dev_warn(
+ kbdev->dev,
+ "Failed to turn off GPU clocks on synchronous power off, MMU faults pending");
+ ret = -EBUSY;
+ }
} else {
spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
}
+out:
kbase_pm_unlock(kbdev);
+ return ret;
}
#endif
@@ -793,7 +797,7 @@ void kbase_hwaccess_pm_halt(struct kbase_device *kbdev)
KBASE_DEBUG_ASSERT(kbdev != NULL);
#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME)
- kbase_pm_do_poweroff_sync(kbdev);
+ WARN_ON(kbase_pm_do_poweroff_sync(kbdev));
#else
mutex_lock(&kbdev->pm.lock);
kbase_pm_do_poweroff(kbdev);
@@ -902,10 +906,14 @@ void kbase_hwaccess_pm_gpu_idle(struct kbase_device *kbdev)
kbase_pm_update_active(kbdev);
}
-void kbase_hwaccess_pm_suspend(struct kbase_device *kbdev)
+int kbase_hwaccess_pm_suspend(struct kbase_device *kbdev)
{
+ int ret = 0;
+
#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME)
- kbase_pm_do_poweroff_sync(kbdev);
+ ret = kbase_pm_do_poweroff_sync(kbdev);
+ if (ret)
+ return ret;
#else
/* Force power off the GPU and all cores (regardless of policy), only
* after the PM active count reaches zero (otherwise, we risk turning it
@@ -929,6 +937,8 @@ void kbase_hwaccess_pm_suspend(struct kbase_device *kbdev)
if (kbdev->pm.backend.callback_power_suspend)
kbdev->pm.backend.callback_power_suspend(kbdev);
+
+ return ret;
}
void kbase_hwaccess_pm_resume(struct kbase_device *kbdev)
@@ -1044,7 +1054,12 @@ static int pm_handle_mcu_sleep_on_runtime_suspend(struct kbase_device *kbdev)
ret = kbase_pm_force_mcu_wakeup_after_sleep(kbdev);
if (ret) {
- dev_warn(kbdev->dev, "Wait for MCU wake up failed on runtime suspend");
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ dev_warn(
+ kbdev->dev,
+ "Waiting for MCU to wake up failed on runtime suspend");
+ kbdev->pm.backend.gpu_wakeup_override = false;
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
return ret;
}
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_ca.c b/mali_kbase/backend/gpu/mali_kbase_pm_ca.c
index efc620f..803ba4d 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_ca.c
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_ca.c
@@ -26,6 +26,9 @@
#include <mali_kbase.h>
#include <mali_kbase_pm.h>
#include <backend/gpu/mali_kbase_pm_internal.h>
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+#include <backend/gpu/mali_kbase_model_dummy.h>
+#endif /* CONFIG_MALI_NO_MALI */
#include <mali_kbase_dummy_job_wa.h>
int kbase_pm_ca_init(struct kbase_device *kbdev)
@@ -120,7 +123,9 @@ u64 kbase_pm_ca_get_instr_core_mask(struct kbase_device *kbdev)
{
lockdep_assert_held(&kbdev->hwaccess_lock);
-#if MALI_USE_CSF
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ return (((1ull) << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1);
+#elif MALI_USE_CSF
return kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_SHADER);
#else
return kbdev->pm.backend.pm_shaders_core_mask;
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_ca.h b/mali_kbase/backend/gpu/mali_kbase_pm_ca.h
index 8d169c3..90dcaf5 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_ca.h
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_ca.h
@@ -29,10 +29,10 @@
/**
* kbase_pm_ca_init - Initialize core availability framework
*
- * Must be called before calling any other core availability function
- *
* @kbdev: The kbase device structure for the device (must be a valid pointer)
*
+ * Must be called before calling any other core availability function
+ *
* Return: 0 if the core availability framework was successfully initialized,
* -errno otherwise
*/
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_ca_devfreq.h b/mali_kbase/backend/gpu/mali_kbase_pm_ca_devfreq.h
index 41f3c14..d1e4b53 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_ca_devfreq.h
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_ca_devfreq.h
@@ -30,12 +30,12 @@
/**
* struct kbasep_pm_ca_policy_devfreq - Private structure for devfreq ca policy
*
- * This contains data that is private to the devfreq core availability
- * policy.
- *
* @cores_desired: Cores that the policy wants to be available
* @cores_enabled: Cores that the policy is currently returning as available
* @cores_used: Cores currently powered or transitioning
+ *
+ * This contains data that is private to the devfreq core availability
+ * policy.
*/
struct kbasep_pm_ca_policy_devfreq {
u64 cores_desired;
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_coarse_demand.h b/mali_kbase/backend/gpu/mali_kbase_pm_coarse_demand.h
index 5e3f17e..a947e8f 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_coarse_demand.h
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_coarse_demand.h
@@ -52,10 +52,8 @@
/**
* struct kbasep_pm_policy_coarse_demand - Private structure for coarse demand
* policy
- *
- * This contains data that is private to the coarse demand power policy.
- *
* @dummy: Dummy member - no state needed
+ * This contains data that is private to the coarse demand power policy.
*/
struct kbasep_pm_policy_coarse_demand {
int dummy;
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_defs.h b/mali_kbase/backend/gpu/mali_kbase_pm_defs.h
index 52877f5..c7efe23 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_defs.h
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_defs.h
@@ -40,6 +40,11 @@ struct kbase_jd_atom;
/**
* enum kbase_pm_core_type - The types of core in a GPU.
*
+ * @KBASE_PM_CORE_L2: The L2 cache
+ * @KBASE_PM_CORE_SHADER: Shader cores
+ * @KBASE_PM_CORE_TILER: Tiler cores
+ * @KBASE_PM_CORE_STACK: Core stacks
+ *
* These enumerated values are used in calls to
* - kbase_pm_get_present_cores()
* - kbase_pm_get_active_cores()
@@ -49,11 +54,6 @@ struct kbase_jd_atom;
* They specify which type of core should be acted on. These values are set in
* a manner that allows core_type_to_reg() function to be simpler and more
* efficient.
- *
- * @KBASE_PM_CORE_L2: The L2 cache
- * @KBASE_PM_CORE_SHADER: Shader cores
- * @KBASE_PM_CORE_TILER: Tiler cores
- * @KBASE_PM_CORE_STACK: Core stacks
*/
enum kbase_pm_core_type {
KBASE_PM_CORE_L2 = L2_PRESENT_LO,
@@ -215,9 +215,6 @@ union kbase_pm_policy_data {
/**
* struct kbase_pm_backend_data - Data stored per device for power management.
*
- * This structure contains data for the power management framework. There is one
- * instance of this structure per device in the system.
- *
* @pm_current_policy: The policy that is currently actively controlling the
* power state.
* @pm_policy_data: Private data for current PM policy. This is automatically
@@ -324,6 +321,10 @@ union kbase_pm_policy_data {
* @policy_change_lock: Used to serialize the policy change calls. In CSF case,
* the change of policy may involve the scheduler to
* suspend running CSGs and then reconfigure the MCU.
+ * @core_idle_wq: Workqueue for executing the @core_idle_work.
+ * @core_idle_work: Work item used to wait for undesired cores to become inactive.
+ * The work item is enqueued when Host controls the power for
+ * shader cores and down scaling of cores is performed.
* @gpu_sleep_supported: Flag to indicate that if GPU sleep feature can be
* supported by the kernel driver or not. If this
* flag is not set, then HW state is directly saved
@@ -389,6 +390,9 @@ union kbase_pm_policy_data {
* @gpu_clock_control_work: work item to set GPU clock during L2 power cycle
* using gpu_clock_control
*
+ * This structure contains data for the power management framework. There is one
+ * instance of this structure per device in the system.
+ *
* Note:
* During an IRQ, @pm_current_policy can be NULL when the policy is being
* changed with kbase_pm_set_policy(). The change is protected under
@@ -455,6 +459,8 @@ struct kbase_pm_backend_data {
bool policy_change_clamp_state_to_off;
unsigned int csf_pm_sched_flags;
struct mutex policy_change_lock;
+ struct workqueue_struct *core_idle_wq;
+ struct work_struct core_idle_work;
#ifdef KBASE_PM_RUNTIME
bool gpu_sleep_supported;
@@ -547,9 +553,6 @@ enum kbase_pm_policy_event {
/**
* struct kbase_pm_policy - Power policy structure.
*
- * Each power policy exposes a (static) instance of this structure which
- * contains function pointers to the policy's methods.
- *
* @name: The name of this policy
* @init: Function called when the policy is selected
* @term: Function called when the policy is unselected
@@ -567,6 +570,8 @@ enum kbase_pm_policy_event {
* Pre-defined required flags exist for each of the
* ARM released policies, such as 'always_on', 'coarse_demand'
* and etc.
+ * Each power policy exposes a (static) instance of this structure which
+ * contains function pointers to the policy's methods.
*/
struct kbase_pm_policy {
char *name;
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c
index d65c684..81c922f 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c
@@ -54,6 +54,10 @@
#include <csf/ipa_control/mali_kbase_csf_ipa_control.h>
#endif
+#if MALI_USE_CSF
+#include <linux/delay.h>
+#endif
+
#include <linux/of.h>
#ifdef CONFIG_MALI_CORESTACK
@@ -72,16 +76,16 @@ KBASE_EXPORT_TEST_API(corestack_driver_control);
/**
* enum kbasep_pm_action - Actions that can be performed on a core.
*
- * This enumeration is private to the file. Its values are set to allow
- * core_type_to_reg() function, which decodes this enumeration, to be simpler
- * and more efficient.
- *
* @ACTION_PRESENT: The cores that are present
* @ACTION_READY: The cores that are ready
* @ACTION_PWRON: Power on the cores specified
* @ACTION_PWROFF: Power off the cores specified
* @ACTION_PWRTRANS: The cores that are transitioning
* @ACTION_PWRACTIVE: The cores that are active
+ *
+ * This enumeration is private to the file. Its values are set to allow
+ * core_type_to_reg() function, which decodes this enumeration, to be simpler
+ * and more efficient.
*/
enum kbasep_pm_action {
ACTION_PRESENT = 0,
@@ -221,14 +225,14 @@ void kbase_pm_protected_l2_override(struct kbase_device *kbdev, bool override)
/**
* core_type_to_reg - Decode a core type and action to a register.
*
+ * @core_type: The type of core
+ * @action: The type of action
+ *
* Given a core type (defined by kbase_pm_core_type) and an action (defined
* by kbasep_pm_action) this function will return the register offset that
* will perform the action on the core type. The register returned is the _LO
* register and an offset must be applied to use the _HI register.
*
- * @core_type: The type of core
- * @action: The type of action
- *
* Return: The register offset of the _LO register that performs an action of
* type @action on a core of type @core_type.
*/
@@ -291,14 +295,14 @@ static void mali_cci_flush_l2(struct kbase_device *kbdev)
/**
* kbase_pm_invoke - Invokes an action on a core set
*
- * This function performs the action given by @action on a set of cores of a
- * type given by @core_type. It is a static function used by
- * kbase_pm_transition_core_type()
- *
* @kbdev: The kbase device structure of the device
* @core_type: The type of core that the action should be performed on
* @cores: A bit mask of cores to perform the action on (low 32 bits)
* @action: The action to perform on the cores
+ *
+ * This function performs the action given by @action on a set of cores of a
+ * type given by @core_type. It is a static function used by
+ * kbase_pm_transition_core_type()
*/
static void kbase_pm_invoke(struct kbase_device *kbdev,
enum kbase_pm_core_type core_type,
@@ -376,15 +380,15 @@ static void kbase_pm_invoke(struct kbase_device *kbdev,
/**
* kbase_pm_get_state - Get information about a core set
*
+ * @kbdev: The kbase device structure of the device
+ * @core_type: The type of core that the should be queried
+ * @action: The property of the cores to query
+ *
* This function gets information (chosen by @action) about a set of cores of
* a type given by @core_type. It is a static function used by
* kbase_pm_get_active_cores(), kbase_pm_get_trans_cores() and
* kbase_pm_get_ready_cores().
*
- * @kbdev: The kbase device structure of the device
- * @core_type: The type of core that the should be queried
- * @action: The property of the cores to query
- *
* Return: A bit mask specifying the state of the cores
*/
static u64 kbase_pm_get_state(struct kbase_device *kbdev,
@@ -753,17 +757,17 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev)
if (!kbase_pm_is_mcu_desired(kbdev))
backend->mcu_state = KBASE_MCU_ON_HWCNT_DISABLE;
else if (kbdev->csf.firmware_hctl_core_pwr) {
- /* Host control add additional Cores to be active */
- if (backend->shaders_desired_mask & ~shaders_ready) {
+ /* Host control: scale cores up/down as needed */
+ if (backend->shaders_desired_mask != shaders_ready) {
backend->hwcnt_desired = false;
if (!backend->hwcnt_disabled)
kbase_pm_trigger_hwcnt_disable(kbdev);
backend->mcu_state =
KBASE_MCU_HCTL_MCU_ON_RECHECK;
}
- } else if (kbase_pm_handle_mcu_core_attr_update(kbdev))
- kbdev->pm.backend.mcu_state =
- KBASE_MCU_ON_CORE_ATTR_UPDATE_PEND;
+ } else if (kbase_pm_handle_mcu_core_attr_update(kbdev)) {
+ backend->mcu_state = KBASE_MCU_ON_CORE_ATTR_UPDATE_PEND;
+ }
break;
case KBASE_MCU_HCTL_MCU_ON_RECHECK:
@@ -787,16 +791,54 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev)
ACTION_PWRON);
backend->mcu_state =
KBASE_MCU_HCTL_SHADERS_PEND_ON;
+
+ } else if (~backend->shaders_desired_mask & shaders_ready) {
+ kbase_csf_firmware_update_core_attr(kbdev, false, true,
+ backend->shaders_desired_mask);
+ backend->mcu_state = KBASE_MCU_HCTL_CORES_DOWN_SCALE_NOTIFY_PEND;
} else {
backend->mcu_state =
KBASE_MCU_HCTL_SHADERS_PEND_ON;
}
break;
+ case KBASE_MCU_HCTL_CORES_DOWN_SCALE_NOTIFY_PEND:
+ if (kbase_csf_firmware_core_attr_updated(kbdev)) {
+ /* queue the work item that waits for the cores to go idle */
+ queue_work(backend->core_idle_wq, &backend->core_idle_work);
+ backend->mcu_state = KBASE_MCU_HCTL_CORE_INACTIVE_PEND;
+ }
+ break;
+
+ case KBASE_MCU_HCTL_CORE_INACTIVE_PEND:
+ {
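+ /* The undesired cores are powered off only once none of
+ * them is still executing work.
+ */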
+ u64 active_cores = kbase_pm_get_active_cores(
+ kbdev,
+ KBASE_PM_CORE_SHADER);
+ u64 cores_to_disable = shaders_ready &
+ ~backend->shaders_desired_mask;
+
+ if (!(cores_to_disable & active_cores)) {
+ kbase_pm_invoke(kbdev, KBASE_PM_CORE_SHADER,
+ cores_to_disable,
+ ACTION_PWROFF);
+ backend->shaders_avail = backend->shaders_desired_mask;
+ backend->mcu_state = KBASE_MCU_HCTL_SHADERS_CORE_OFF_PEND;
+ }
+ }
+ break;
+
+ case KBASE_MCU_HCTL_SHADERS_CORE_OFF_PEND:
+ if (!shaders_trans && shaders_ready == backend->shaders_avail) {
+ /* Cores now stable */
+ backend->pm_shaders_core_mask = shaders_ready;
+ backend->mcu_state = KBASE_MCU_ON_HWCNT_ENABLE;
+ }
+ break;
+
case KBASE_MCU_ON_CORE_ATTR_UPDATE_PEND:
if (kbase_csf_firmware_core_attr_updated(kbdev)) {
- backend->shaders_avail =
- backend->shaders_desired_mask;
+ backend->shaders_avail = backend->shaders_desired_mask;
backend->mcu_state = KBASE_MCU_ON;
}
break;
@@ -832,6 +874,8 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev)
case KBASE_MCU_ON_PEND_HALT:
if (kbase_csf_firmware_mcu_halted(kbdev)) {
+ KBASE_KTRACE_ADD(kbdev, MCU_HALTED, NULL,
+ kbase_csf_ktrace_gpu_cycle_cnt(kbdev));
if (kbdev->csf.firmware_hctl_core_pwr)
backend->mcu_state =
KBASE_MCU_HCTL_SHADERS_READY_OFF;
@@ -875,6 +919,8 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev)
case KBASE_MCU_ON_PEND_SLEEP:
if (kbase_csf_firmware_is_mcu_in_sleep(kbdev)) {
+ KBASE_KTRACE_ADD(kbdev, MCU_IN_SLEEP, NULL,
+ kbase_csf_ktrace_gpu_cycle_cnt(kbdev));
backend->mcu_state = KBASE_MCU_IN_SLEEP;
kbase_pm_enable_db_mirror_interrupt(kbdev);
kbase_csf_scheduler_reval_idleness_post_sleep(kbdev);
@@ -884,6 +930,8 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev)
case KBASE_MCU_IN_SLEEP:
if (kbase_pm_is_mcu_desired(kbdev) &&
backend->l2_state == KBASE_L2_ON) {
+ KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_WAKEUP(
+ kbdev, kbase_backend_get_cycle_cnt(kbdev));
kbase_pm_enable_mcu_db_notification(kbdev);
kbase_pm_disable_db_mirror_interrupt(kbdev);
backend->mcu_state = KBASE_MCU_ON_HWCNT_ENABLE;
@@ -910,6 +958,33 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev)
return 0;
}
+
+static void core_idle_worker(struct work_struct *work)
+{
+ struct kbase_device *kbdev =
+ container_of(work, struct kbase_device, pm.backend.core_idle_work);
+ struct kbase_pm_backend_data *backend = &kbdev->pm.backend;
+ unsigned long flags;
+
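+ /* Poll, sleeping 1 ms between checks, until the cores selected for
+ * power-off are no longer active, then kick the PM state machine so
+ * that they can actually be powered off.
+ */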
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ while (backend->gpu_powered && (backend->mcu_state == KBASE_MCU_HCTL_CORE_INACTIVE_PEND)) {
+ const unsigned int core_inactive_wait_ms = 1;
+ u64 active_cores = kbase_pm_get_active_cores(kbdev, KBASE_PM_CORE_SHADER);
+ u64 shaders_ready = kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_SHADER);
+ u64 cores_to_disable = shaders_ready & ~backend->shaders_desired_mask;
+
+ if (!(cores_to_disable & active_cores)) {
+ kbase_pm_update_state(kbdev);
+ break;
+ }
+
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ msleep(core_inactive_wait_ms);
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+}
#endif
static const char *kbase_l2_core_state_to_string(enum kbase_l2_core_state state)
@@ -925,6 +1000,23 @@ static const char *kbase_l2_core_state_to_string(enum kbase_l2_core_state state)
return strings[state];
}
+#if !MALI_USE_CSF
+/* On powering on the L2, the tracked kctx becomes stale and can be cleared.
+ * This enables the backend to spare the START_FLUSH.INV_SHADER_OTHER
+ * operation on the first submitted katom after the L2 powering on.
+ */
+static void kbase_pm_l2_clear_backend_slot_submit_kctx(struct kbase_device *kbdev)
+{
+ int js;
+
+ lockdep_assert_held(&kbdev->hwaccess_lock);
+
+ /* Clear the slots' last katom submission kctx */
+ for (js = 0; js < kbdev->gpu_props.num_job_slots; js++)
+ kbdev->hwaccess.backend.slot_rb[js].last_kctx_tagged = SLOT_RB_NULL_TAG_VAL;
+}
+#endif
+
static int kbase_pm_l2_update_state(struct kbase_device *kbdev)
{
struct kbase_pm_backend_data *backend = &kbdev->pm.backend;
@@ -1015,6 +1107,8 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev)
kbase_pm_invoke(kbdev, KBASE_PM_CORE_L2,
l2_present & ~1,
ACTION_PWRON);
+ /* Clear backend slot submission kctx */
+ kbase_pm_l2_clear_backend_slot_submit_kctx(kbdev);
#else
/* With CSF firmware, Host driver doesn't need to
* handle power management with both shader and tiler cores.
@@ -1217,7 +1311,7 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev)
* powered off.
*/
kbase_gpu_start_cache_clean_nolock(
- kbdev);
+ kbdev, GPU_COMMAND_CACHE_CLN_INV_L2);
#if !MALI_USE_CSF
KBASE_KTRACE_ADD(kbdev, PM_CORES_CHANGE_AVAILABLE_TILER, NULL, 0u);
#else
@@ -1594,10 +1688,12 @@ static int kbase_pm_shaders_update_state(struct kbase_device *kbdev)
break;
case KBASE_SHADERS_WAIT_FINISHED_CORESTACK_ON:
- shader_poweroff_timer_queue_cancel(kbdev);
+ if (!backend->partial_shaderoff)
+ shader_poweroff_timer_queue_cancel(kbdev);
if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_TTRX_921)) {
- kbase_gpu_start_cache_clean_nolock(kbdev);
+ kbase_gpu_start_cache_clean_nolock(
+ kbdev, GPU_COMMAND_CACHE_CLN_INV_L2);
backend->shaders_state =
KBASE_SHADERS_L2_FLUSHING_CORESTACK_ON;
} else {
@@ -1895,11 +1991,24 @@ int kbase_pm_state_machine_init(struct kbase_device *kbdev)
stt->default_ticks = DEFAULT_PM_POWEROFF_TICK_SHADER;
stt->configured_ticks = stt->default_ticks;
+#if MALI_USE_CSF
+ kbdev->pm.backend.core_idle_wq = alloc_workqueue("coreoff_wq", WQ_HIGHPRI | WQ_UNBOUND, 1);
+ if (!kbdev->pm.backend.core_idle_wq) {
+ destroy_workqueue(stt->wq);
+ return -ENOMEM;
+ }
+
+ INIT_WORK(&kbdev->pm.backend.core_idle_work, core_idle_worker);
+#endif
+
return 0;
}
void kbase_pm_state_machine_term(struct kbase_device *kbdev)
{
+#if MALI_USE_CSF
+ destroy_workqueue(kbdev->pm.backend.core_idle_wq);
+#endif
hrtimer_cancel(&kbdev->pm.backend.shader_tick_timer.timer);
destroy_workqueue(kbdev->pm.backend.shader_tick_timer.wq);
}
@@ -2419,9 +2528,9 @@ void kbase_pm_reset_done(struct kbase_device *kbdev)
/**
* kbase_pm_wait_for_reset - Wait for a reset to happen
*
- * Wait for the %RESET_COMPLETED IRQ to occur, then reset the waiting state.
- *
* @kbdev: Kbase device
+ *
+ * Wait for the %RESET_COMPLETED IRQ to occur, then reset the waiting state.
*/
static void kbase_pm_wait_for_reset(struct kbase_device *kbdev)
{
@@ -2889,6 +2998,7 @@ exit:
/**
* kbase_pm_request_gpu_cycle_counter_do_request - Request cycle counters
+ * @kbdev: The kbase device structure of the device
*
* Increase the count of cycle counter users and turn the cycle counters on if
* they were previously off
@@ -2899,8 +3009,6 @@ exit:
*
* When this function is called the l2 cache must be on - i.e., the GPU must be
* on.
- *
- * @kbdev: The kbase device structure of the device
*/
static void
kbase_pm_request_gpu_cycle_counter_do_request(struct kbase_device *kbdev)
@@ -2918,11 +3026,13 @@ kbase_pm_request_gpu_cycle_counter_do_request(struct kbase_device *kbdev)
/* This might happen after GPU reset.
* Then counter needs to be kicked.
*/
+#if !IS_ENABLED(CONFIG_MALI_NO_MALI)
if (!(kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_STATUS)) &
GPU_STATUS_CYCLE_COUNT_ACTIVE)) {
kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND),
GPU_COMMAND_CYCLE_COUNT_START);
}
+#endif
}
spin_unlock_irqrestore(
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_internal.h b/mali_kbase/backend/gpu/mali_kbase_pm_internal.h
index ef26c16..97e8607 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_internal.h
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_internal.h
@@ -35,18 +35,18 @@
/**
* kbase_pm_dev_idle - The GPU is idle.
*
- * The OS may choose to turn off idle devices
- *
* @kbdev: The kbase device structure for the device (must be a valid pointer)
+ *
+ * The OS may choose to turn off idle devices
*/
void kbase_pm_dev_idle(struct kbase_device *kbdev);
/**
* kbase_pm_dev_activate - The GPU is active.
*
- * The OS should avoid opportunistically turning off the GPU while it is active
- *
* @kbdev: The kbase device structure for the device (must be a valid pointer)
+ *
+ * The OS should avoid opportunistically turning off the GPU while it is active
*/
void kbase_pm_dev_activate(struct kbase_device *kbdev);
@@ -54,14 +54,14 @@ void kbase_pm_dev_activate(struct kbase_device *kbdev);
* kbase_pm_get_present_cores - Get details of the cores that are present in
* the device.
*
- * This function can be called by the active power policy to return a bitmask of
- * the cores (of a specified type) present in the GPU device and also a count of
- * the number of cores.
- *
* @kbdev: The kbase device structure for the device (must be a valid
* pointer)
* @type: The type of core (see the enum kbase_pm_core_type enumeration)
*
+ * This function can be called by the active power policy to return a bitmask of
+ * the cores (of a specified type) present in the GPU device and also a count of
+ * the number of cores.
+ *
* Return: The bit mask of cores present
*/
u64 kbase_pm_get_present_cores(struct kbase_device *kbdev,
@@ -71,13 +71,13 @@ u64 kbase_pm_get_present_cores(struct kbase_device *kbdev,
* kbase_pm_get_active_cores - Get details of the cores that are currently
* active in the device.
*
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
+ * @type: The type of core (see the enum kbase_pm_core_type enumeration)
+ *
* This function can be called by the active power policy to return a bitmask of
* the cores (of a specified type) that are actively processing work (i.e.
* turned on *and* busy).
*
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
- * @type: The type of core (see the enum kbase_pm_core_type enumeration)
- *
* Return: The bit mask of active cores
*/
u64 kbase_pm_get_active_cores(struct kbase_device *kbdev,
@@ -87,13 +87,13 @@ u64 kbase_pm_get_active_cores(struct kbase_device *kbdev,
* kbase_pm_get_trans_cores - Get details of the cores that are currently
* transitioning between power states.
*
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
+ * @type: The type of core (see the enum kbase_pm_core_type enumeration)
+ *
* This function can be called by the active power policy to return a bitmask of
* the cores (of a specified type) that are currently transitioning between
* power states.
*
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
- * @type: The type of core (see the enum kbase_pm_core_type enumeration)
- *
* Return: The bit mask of transitioning cores
*/
u64 kbase_pm_get_trans_cores(struct kbase_device *kbdev,
@@ -103,13 +103,13 @@ u64 kbase_pm_get_trans_cores(struct kbase_device *kbdev,
* kbase_pm_get_ready_cores - Get details of the cores that are currently
* powered and ready for jobs.
*
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
+ * @type: The type of core (see the enum kbase_pm_core_type enumeration)
+ *
* This function can be called by the active power policy to return a bitmask of
* the cores (of a specified type) that are powered and ready for jobs (they may
* or may not be currently executing jobs).
*
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
- * @type: The type of core (see the enum kbase_pm_core_type enumeration)
- *
* Return: The bit mask of ready cores
*/
u64 kbase_pm_get_ready_cores(struct kbase_device *kbdev,
@@ -119,13 +119,13 @@ u64 kbase_pm_get_ready_cores(struct kbase_device *kbdev,
* kbase_pm_clock_on - Turn the clock for the device on, and enable device
* interrupts.
*
- * This function can be used by a power policy to turn the clock for the GPU on.
- * It should be modified during integration to perform the necessary actions to
- * ensure that the GPU is fully powered and clocked.
- *
* @kbdev: The kbase device structure for the device (must be a valid
* pointer)
* @is_resume: true if clock on due to resume after suspend, false otherwise
+ *
+ * This function can be used by a power policy to turn the clock for the GPU on.
+ * It should be modified during integration to perform the necessary actions to
+ * ensure that the GPU is fully powered and clocked.
*/
void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume);
@@ -133,6 +133,9 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume);
* kbase_pm_clock_off - Disable device interrupts, and turn the clock for the
* device off.
*
+ * @kbdev: The kbase device structure for the device (must be a valid
+ * pointer)
+ *
* This function can be used by a power policy to turn the clock for the GPU
* off. It should be modified during integration to perform the necessary
* actions to turn the clock off (if this is possible in the integration).
@@ -141,9 +144,6 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume);
* then this function would usually be invoked from the runtime suspend
* callback function.
*
- * @kbdev: The kbase device structure for the device (must be a valid
- * pointer)
- *
* Return: true if clock was turned off, or
* false if clock can not be turned off due to pending page/bus fault
* workers. Caller must flush MMU workqueues and retry
@@ -153,22 +153,22 @@ bool kbase_pm_clock_off(struct kbase_device *kbdev);
/**
* kbase_pm_enable_interrupts - Enable interrupts on the device.
*
- * Interrupts are also enabled after a call to kbase_pm_clock_on().
- *
* @kbdev: The kbase device structure for the device (must be a valid pointer)
+ *
+ * Interrupts are also enabled after a call to kbase_pm_clock_on().
*/
void kbase_pm_enable_interrupts(struct kbase_device *kbdev);
/**
* kbase_pm_disable_interrupts - Disable interrupts on the device.
*
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
+ *
* This prevents delivery of Power Management interrupts to the CPU so that
* kbase_pm_update_state() will not be called from the IRQ handler
* until kbase_pm_enable_interrupts() or kbase_pm_clock_on() is called.
*
* Interrupts are also disabled after a call to kbase_pm_clock_off().
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbase_pm_disable_interrupts(struct kbase_device *kbdev);
@@ -176,9 +176,9 @@ void kbase_pm_disable_interrupts(struct kbase_device *kbdev);
* kbase_pm_disable_interrupts_nolock - Version of kbase_pm_disable_interrupts()
* that does not take the hwaccess_lock
*
- * Caller must hold the hwaccess_lock.
- *
* @kbdev: The kbase device structure for the device (must be a valid pointer)
+ *
+ * Caller must hold the hwaccess_lock.
*/
void kbase_pm_disable_interrupts_nolock(struct kbase_device *kbdev);
@@ -197,12 +197,11 @@ int kbase_pm_init_hw(struct kbase_device *kbdev, unsigned int flags);
/**
* kbase_pm_reset_done - The GPU has been reset successfully.
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* This function must be called by the GPU interrupt handler when the
* RESET_COMPLETED bit is set. It signals to the power management initialization
* code that the GPU has been successfully reset.
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbase_pm_reset_done(struct kbase_device *kbdev);
@@ -210,6 +209,7 @@ void kbase_pm_reset_done(struct kbase_device *kbdev);
/**
* kbase_pm_wait_for_desired_state - Wait for the desired power state to be
* reached
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* Wait for the L2 and MCU state machines to reach the states corresponding
* to the values of 'kbase_pm_is_l2_desired' and 'kbase_pm_is_mcu_desired'.
@@ -224,8 +224,6 @@ void kbase_pm_reset_done(struct kbase_device *kbdev);
* power off in progress and kbase_pm_context_active() was called instead of
* kbase_csf_scheduler_pm_active().
*
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
- *
* Return: 0 on success, error code on error
*/
int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev);
@@ -233,6 +231,7 @@ int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev);
/**
* kbase_pm_wait_for_desired_state - Wait for the desired power state to be
* reached
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* Wait for the L2 and shader power state machines to reach the states
* corresponding to the values of 'l2_desired' and 'shaders_desired'.
@@ -248,8 +247,6 @@ int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev);
* must ensure that this is not the case by, for example, calling
* kbase_pm_wait_for_poweroff_work_complete()
*
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
- *
* Return: 0 on success, error code on error
*/
int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev);
@@ -258,6 +255,8 @@ int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev);
/**
* kbase_pm_wait_for_l2_powered - Wait for the L2 cache to be powered on
*
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
+ *
* Wait for the L2 to be powered on, and for the L2 and the state machines of
* its dependent stack components to stabilise.
*
@@ -266,8 +265,6 @@ int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev);
* Unlike kbase_pm_update_state(), the caller must not hold hwaccess_lock,
* because this function will take that lock itself.
*
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
- *
* Return: 0 on success, error code on error
*/
int kbase_pm_wait_for_l2_powered(struct kbase_device *kbdev);
@@ -276,13 +273,12 @@ int kbase_pm_wait_for_l2_powered(struct kbase_device *kbdev);
* kbase_pm_update_dynamic_cores_onoff - Update the L2 and shader power state
* machines after changing shader core
* availability
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* It can be called in any status, so need to check the l2 and shader core
* power status in this function or it will break shader/l2 state machine
*
* Caller must hold hwaccess_lock
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbase_pm_update_dynamic_cores_onoff(struct kbase_device *kbdev);
@@ -318,22 +314,21 @@ void kbase_pm_state_machine_term(struct kbase_device *kbdev);
* kbase_pm_update_cores_state - Update the desired state of shader cores from
* the Power Policy, and begin any power
* transitions.
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* This function will update the desired_xx_state members of
* struct kbase_pm_device_data by calling into the current Power Policy. It will
 * then begin power transitions to make the hardware achieve the desired shader
* core state.
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbase_pm_update_cores_state(struct kbase_device *kbdev);
/**
* kbasep_pm_metrics_init - Initialize the metrics gathering framework.
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* This must be called before other metric gathering APIs are called.
*
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* Return: 0 on success, error code on error
*/
@@ -341,29 +336,27 @@ int kbasep_pm_metrics_init(struct kbase_device *kbdev);
/**
* kbasep_pm_metrics_term - Terminate the metrics gathering framework.
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* This must be called when metric gathering is no longer required. It is an
* error to call any metrics gathering function (other than
* kbasep_pm_metrics_init()) after calling this function.
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbasep_pm_metrics_term(struct kbase_device *kbdev);
/**
* kbase_pm_report_vsync - Function to be called by the frame buffer driver to
* update the vsync metric.
+ * @kbdev: The kbase device structure for the device (must be a
+ * valid pointer)
+ * @buffer_updated: True if the buffer has been updated on this VSync,
+ * false otherwise
*
* This function should be called by the frame buffer driver to update whether
* the system is hitting the vsync target or not. buffer_updated should be true
* if the vsync corresponded with a new frame being displayed, otherwise it
* should be false. This function does not need to be called every vsync, but
* only when the value of @buffer_updated differs from a previous call.
- *
- * @kbdev: The kbase device structure for the device (must be a
- * valid pointer)
- * @buffer_updated: True if the buffer has been updated on this VSync,
- * false otherwise
*/
void kbase_pm_report_vsync(struct kbase_device *kbdev, int buffer_updated);
@@ -381,6 +374,7 @@ void kbase_pm_get_dvfs_action(struct kbase_device *kbdev);
/**
* kbase_pm_request_gpu_cycle_counter - Mark that the GPU cycle counter is
* needed
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* If the caller is the first caller then the GPU cycle counters will be enabled
* along with the l2 cache
@@ -388,13 +382,13 @@ void kbase_pm_get_dvfs_action(struct kbase_device *kbdev);
* The GPU must be powered when calling this function (i.e.
* kbase_pm_context_active() must have been called).
*
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbase_pm_request_gpu_cycle_counter(struct kbase_device *kbdev);
/**
* kbase_pm_request_gpu_cycle_counter_l2_is_on - Mark GPU cycle counter is
* needed (l2 cache already on)
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* This is a version of the above function
* (kbase_pm_request_gpu_cycle_counter()) suitable for being called when the
@@ -405,14 +399,13 @@ void kbase_pm_request_gpu_cycle_counter(struct kbase_device *kbdev);
* The GPU must be powered when calling this function (i.e.
* kbase_pm_context_active() must have been called) and the l2 cache must be
* powered on.
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbase_pm_request_gpu_cycle_counter_l2_is_on(struct kbase_device *kbdev);
/**
* kbase_pm_release_gpu_cycle_counter - Mark that the GPU cycle counter is no
* longer in use
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* If the caller is the last caller then the GPU cycle counters will be
* disabled. A request must have been made before a call to this.
@@ -420,18 +413,15 @@ void kbase_pm_request_gpu_cycle_counter_l2_is_on(struct kbase_device *kbdev);
* Caller must not hold the hwaccess_lock, as it will be taken in this function.
* If the caller is already holding this lock then
* kbase_pm_release_gpu_cycle_counter_nolock() must be used instead.
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbase_pm_release_gpu_cycle_counter(struct kbase_device *kbdev);
/**
* kbase_pm_release_gpu_cycle_counter_nolock - Version of kbase_pm_release_gpu_cycle_counter()
* that does not take hwaccess_lock
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* Caller must hold the hwaccess_lock.
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbase_pm_release_gpu_cycle_counter_nolock(struct kbase_device *kbdev);
@@ -458,12 +448,11 @@ void kbase_pm_wait_for_gpu_power_down(struct kbase_device *kbdev);
/**
* kbase_pm_runtime_init - Initialize runtime-pm for Mali GPU platform device
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* Setup the power management callbacks and initialize/enable the runtime-pm
* for the Mali GPU platform device, using the callback function. This must be
* called before the kbase_pm_register_access_enable() function.
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
int kbase_pm_runtime_init(struct kbase_device *kbdev);
@@ -476,6 +465,7 @@ void kbase_pm_runtime_term(struct kbase_device *kbdev);
/**
* kbase_pm_register_access_enable - Enable access to GPU registers
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* Enables access to the GPU registers before power management has powered up
* the GPU with kbase_pm_powerup().
@@ -486,13 +476,12 @@ void kbase_pm_runtime_term(struct kbase_device *kbdev);
*
* This should only be used before power management is powered up with
* kbase_pm_powerup()
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbase_pm_register_access_enable(struct kbase_device *kbdev);
/**
* kbase_pm_register_access_disable - Disable early register access
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
* Disables access to the GPU registers enabled earlier by a call to
* kbase_pm_register_access_enable().
@@ -503,8 +492,6 @@ void kbase_pm_register_access_enable(struct kbase_device *kbdev);
*
* This should only be used before power management is powered up with
* kbase_pm_powerup()
- *
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
*/
void kbase_pm_register_access_disable(struct kbase_device *kbdev);
@@ -515,6 +502,7 @@ void kbase_pm_register_access_disable(struct kbase_device *kbdev);
/**
* kbase_pm_metrics_is_active - Check if the power management metrics
* collection is active.
+ * @kbdev: The kbase device structure for the device (must be a valid pointer)
*
 * Note that this returns whether the power management metrics collection was
 * active at the time of calling; it is possible that after the call the metrics
@@ -522,7 +510,6 @@ void kbase_pm_register_access_disable(struct kbase_device *kbdev);
*
* The caller must handle the consequence that the state may have changed.
*
- * @kbdev: The kbase device structure for the device (must be a valid pointer)
* Return: true if metrics collection was active else false.
*/
bool kbase_pm_metrics_is_active(struct kbase_device *kbdev);
@@ -558,12 +545,13 @@ void kbase_pm_get_dvfs_metrics(struct kbase_device *kbdev,
/**
* kbase_platform_dvfs_event - Report utilisation to DVFS code for CSF GPU
*
- * Function provided by platform specific code when DVFS is enabled to allow
- * the power management metrics system to report utilisation.
- *
* @kbdev: The kbase device structure for the device (must be a
* valid pointer)
* @utilisation: The current calculated utilisation by the metrics system.
+ *
+ * Function provided by platform specific code when DVFS is enabled to allow
+ * the power management metrics system to report utilisation.
+ *
 * Return: 0 on failure and non-zero on success.
*/
int kbase_platform_dvfs_event(struct kbase_device *kbdev, u32 utilisation);
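As a hedged illustration of the contract documented above: platform integration code supplies this function when DVFS is enabled. The sketch below assumes a hypothetical my_platform_set_freq() helper; only the prototype and the return convention come from this header.

int kbase_platform_dvfs_event(struct kbase_device *kbdev, u32 utilisation)
{
	/* Hypothetical platform hook: feed the utilisation figure computed by
	 * the metrics system into the platform's own DVFS governor.
	 */
	my_platform_set_freq(kbdev->dev, utilisation);

	/* Non-zero reports success, per the Return description above. */
	return 1;
}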
@@ -571,15 +559,15 @@ int kbase_platform_dvfs_event(struct kbase_device *kbdev, u32 utilisation);
/**
* kbase_platform_dvfs_event - Report utilisation to DVFS code for JM GPU
*
- * Function provided by platform specific code when DVFS is enabled to allow
- * the power management metrics system to report utilisation.
- *
* @kbdev: The kbase device structure for the device (must be a
* valid pointer)
* @utilisation: The current calculated utilisation by the metrics system.
* @util_gl_share: The current calculated gl share of utilisation.
* @util_cl_share: The current calculated cl share of utilisation per core
* group.
+ *
+ * Function provided by platform specific code when DVFS is enabled to allow
+ * the power management metrics system to report utilisation.
+ *
 * Return: 0 on failure and non-zero on success.
*/
int kbase_platform_dvfs_event(struct kbase_device *kbdev, u32 utilisation,
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h b/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h
index 96f196f..5e57c9d 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h
@@ -25,37 +25,47 @@
* this header file. This header file can be included multiple times in the
* same compilation unit with different definitions of KBASEP_MCU_STATE().
*
- * @OFF: The MCU is powered off.
- * @PEND_ON_RELOAD: The warm boot of MCU or cold boot of MCU (with
- * firmware reloading) is in progress.
- * @ON_GLB_REINIT_PEND: The MCU is enabled and Global configuration
- * requests have been sent to the firmware.
- * @ON_HWCNT_ENABLE: The Global requests have completed and MCU is now
- * ready for use and hwcnt is being enabled.
- * @ON: The MCU is active and hwcnt has been enabled.
- * @ON_CORE_ATTR_UPDATE_PEND: The MCU is active and mask of enabled shader cores
- * is being updated.
- * @ON_HWCNT_DISABLE: The MCU is on and hwcnt is being disabled.
- * @ON_HALT: The MCU is on and hwcnt has been disabled, MCU
- * halt would be triggered.
- * @ON_PEND_HALT: MCU halt in progress, confirmation pending.
- * @POWER_DOWN: MCU halted operations, pending being disabled.
- * @PEND_OFF: MCU is being disabled, pending on powering off.
- * @RESET_WAIT: The GPU is resetting, MCU state is unknown.
- * @HCTL_SHADERS_PEND_ON: Global configuration requests sent to the firmware
- * have completed and shaders have been requested to
- * power on.
- * @HCTL_CORES_NOTIFY_PEND: Shader cores have powered up and firmware is being
- * notified of the mask of enabled shader cores.
- * @HCTL_MCU_ON_RECHECK: MCU is on and hwcnt disabling is triggered
- * and checks are done to increase the number of
- * enabled cores.
- * @HCTL_SHADERS_READY_OFF: MCU has halted and cores need to be powered down
- * @HCTL_SHADERS_PEND_OFF: Cores are transitioning to power down.
- * @ON_SLEEP_INITIATE: MCU is on and hwcnt has been disabled and MCU
- * is being put to sleep.
- * @ON_PEND_SLEEP: MCU sleep is in progress.
- * @IN_SLEEP: Sleep request is completed and MCU has halted.
+ * @OFF: The MCU is powered off.
+ * @PEND_ON_RELOAD: The warm boot of MCU or cold boot of MCU (with
+ * firmware reloading) is in progress.
+ * @ON_GLB_REINIT_PEND: The MCU is enabled and Global configuration
+ * requests have been sent to the firmware.
+ * @ON_HWCNT_ENABLE: The Global requests have completed and MCU is now
+ * ready for use and hwcnt is being enabled.
+ * @ON: The MCU is active and hwcnt has been enabled.
+ * @ON_CORE_ATTR_UPDATE_PEND: The MCU is active and mask of enabled shader cores
+ * is being updated.
+ * @ON_HWCNT_DISABLE: The MCU is on and hwcnt is being disabled.
+ * @ON_HALT: The MCU is on and hwcnt has been disabled, MCU
+ * halt would be triggered.
+ * @ON_PEND_HALT: MCU halt in progress, confirmation pending.
+ * @POWER_DOWN: MCU halted operations, pending being disabled.
+ * @PEND_OFF: MCU is being disabled, pending on powering off.
+ * @RESET_WAIT: The GPU is resetting, MCU state is unknown.
+ * @HCTL_SHADERS_PEND_ON: Global configuration requests sent to the firmware
+ * have completed and shaders have been requested to
+ * power on.
+ * @HCTL_CORES_NOTIFY_PEND: Shader cores have powered up and firmware is being
+ * notified of the mask of enabled shader cores.
+ * @HCTL_MCU_ON_RECHECK: MCU is on and hwcnt disabling is triggered
+ * and checks are done to update the number of
+ * enabled cores.
+ * @HCTL_SHADERS_READY_OFF: MCU has halted and cores need to be powered down
+ * @HCTL_SHADERS_PEND_OFF: Cores are transitioning to power down.
+ * @HCTL_CORES_DOWN_SCALE_NOTIFY_PEND: Firmware has been informed to stop using
+ *                                    specific cores due to a core_mask change
+ *                                    request. After the ACK from FW, the driver
+ *                                    waits for the undesired cores to become
+ *                                    inactive.
+ * @HCTL_CORE_INACTIVE_PEND: Waiting for specific cores to become inactive.
+ * Once the cores become inactive their power down
+ * will be initiated.
+ * @HCTL_SHADERS_CORE_OFF_PEND: Waiting for specific cores to complete the
+ * transition to power down. Once powered down,
+ * HW counters will be re-enabled.
+ * @ON_SLEEP_INITIATE: MCU is on and hwcnt has been disabled and MCU
+ * is being put to sleep.
+ * @ON_PEND_SLEEP: MCU sleep is in progress.
+ * @IN_SLEEP: Sleep request is completed and MCU has halted.
*/
KBASEP_MCU_STATE(OFF)
KBASEP_MCU_STATE(PEND_ON_RELOAD)
@@ -75,6 +85,9 @@ KBASEP_MCU_STATE(HCTL_CORES_NOTIFY_PEND)
KBASEP_MCU_STATE(HCTL_MCU_ON_RECHECK)
KBASEP_MCU_STATE(HCTL_SHADERS_READY_OFF)
KBASEP_MCU_STATE(HCTL_SHADERS_PEND_OFF)
+KBASEP_MCU_STATE(HCTL_CORES_DOWN_SCALE_NOTIFY_PEND)
+KBASEP_MCU_STATE(HCTL_CORE_INACTIVE_PEND)
+KBASEP_MCU_STATE(HCTL_SHADERS_CORE_OFF_PEND)
/* Additional MCU states to support GPU sleep feature */
KBASEP_MCU_STATE(ON_SLEEP_INITIATE)
KBASEP_MCU_STATE(ON_PEND_SLEEP)
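As the comment at the top of this header notes, KBASEP_MCU_STATE() is deliberately left undefined so that each includer can expand the list differently. A minimal sketch of that X-macro pattern is shown below; the names kbase_mcu_state and mcu_state_names are illustrative, not necessarily the driver's own.

/* Expand the state list once as an enum ... */
enum kbase_mcu_state {
#define KBASEP_MCU_STATE(n) KBASE_MCU_##n,
#include "mali_kbase_pm_mcu_states.h"
#undef KBASEP_MCU_STATE
};

/* ... and once more as a matching table of names for debug/trace output. */
static const char *const mcu_state_names[] = {
#define KBASEP_MCU_STATE(n) #n,
#include "mali_kbase_pm_mcu_states.h"
#undef KBASEP_MCU_STATE
};

With this pattern, adding the three HCTL_*_PEND states above updates every such expansion in one place.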
diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_policy.c b/mali_kbase/backend/gpu/mali_kbase_pm_policy.c
index 7b126a1..bc05bd7 100644
--- a/mali_kbase/backend/gpu/mali_kbase_pm_policy.c
+++ b/mali_kbase/backend/gpu/mali_kbase_pm_policy.c
@@ -36,8 +36,13 @@
#include <linux/of.h>
static const struct kbase_pm_policy *const all_policy_list[] = {
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ &kbase_pm_always_on_policy_ops,
&kbase_pm_coarse_demand_policy_ops,
- &kbase_pm_always_on_policy_ops
+#else /* CONFIG_MALI_NO_MALI */
+ &kbase_pm_coarse_demand_policy_ops,
+ &kbase_pm_always_on_policy_ops,
+#endif /* CONFIG_MALI_NO_MALI */
};
void kbase_pm_policy_init(struct kbase_device *kbdev)
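A hedged sketch of why the ordering matters, assuming (as is common in kbase) that the first entry of all_policy_list[] is taken as the default policy during kbase_pm_policy_init(); the helper name below is illustrative only.

/* With the #if above, CONFIG_MALI_NO_MALI builds default to always_on,
 * while builds for real hardware keep coarse_demand as the default.
 */
static const struct kbase_pm_policy *default_pm_policy(void)
{
	return all_policy_list[0];
}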
diff --git a/mali_kbase/backend/gpu/mali_kbase_time.c b/mali_kbase/backend/gpu/mali_kbase_time.c
index 92a366b..51812ee 100644
--- a/mali_kbase/backend/gpu/mali_kbase_time.c
+++ b/mali_kbase/backend/gpu/mali_kbase_time.c
@@ -67,6 +67,9 @@ void kbase_backend_get_gpu_time_norequest(struct kbase_device *kbdev,
*/
static bool timedwait_cycle_count_active(struct kbase_device *kbdev)
{
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ return true;
+#else
bool success = false;
const unsigned int timeout = 100;
const unsigned long remaining = jiffies + msecs_to_jiffies(timeout);
@@ -79,6 +82,7 @@ static bool timedwait_cycle_count_active(struct kbase_device *kbdev)
}
}
return success;
+#endif
}
#endif
diff --git a/mali_kbase/context/backend/mali_kbase_context_csf.c b/mali_kbase/context/backend/mali_kbase_context_csf.c
index 1ce806f..7d45a08 100644
--- a/mali_kbase/context/backend/mali_kbase_context_csf.c
+++ b/mali_kbase/context/backend/mali_kbase_context_csf.c
@@ -48,6 +48,7 @@ void kbase_context_debugfs_init(struct kbase_context *const kctx)
kbase_csf_queue_group_debugfs_init(kctx);
kbase_csf_kcpu_debugfs_init(kctx);
kbase_csf_tiler_heap_debugfs_init(kctx);
+ kbase_csf_tiler_heap_total_debugfs_init(kctx);
kbase_csf_cpu_queue_debugfs_init(kctx);
}
KBASE_EXPORT_SYMBOL(kbase_context_debugfs_init);
diff --git a/mali_kbase/context/mali_kbase_context.c b/mali_kbase/context/mali_kbase_context.c
index 85f4c0a..9eaf69a 100644
--- a/mali_kbase/context/mali_kbase_context.c
+++ b/mali_kbase/context/mali_kbase_context.c
@@ -163,8 +163,6 @@ int kbase_context_common_init(struct kbase_context *kctx)
kctx->id = atomic_add_return(1, &(kctx->kbdev->ctx_num)) - 1;
- mutex_init(&kctx->legacy_hwcnt_lock);
-
mutex_lock(&kctx->kbdev->kctx_list_lock);
err = kbase_insert_kctx_to_process(kctx);
diff --git a/mali_kbase/csf/Kbuild b/mali_kbase/csf/Kbuild
index 765e419..29983fb 100644
--- a/mali_kbase/csf/Kbuild
+++ b/mali_kbase/csf/Kbuild
@@ -33,10 +33,12 @@ mali_kbase-y += \
csf/mali_kbase_csf_kcpu_debugfs.o \
csf/mali_kbase_csf_protected_memory.o \
csf/mali_kbase_csf_tiler_heap_debugfs.o \
- csf/mali_kbase_csf_cpu_queue_debugfs.o
+ csf/mali_kbase_csf_cpu_queue_debugfs.o \
+ csf/mali_kbase_csf_event.o
mali_kbase-$(CONFIG_MALI_REAL_HW) += csf/mali_kbase_csf_firmware.o
+mali_kbase-$(CONFIG_MALI_NO_MALI) += csf/mali_kbase_csf_firmware_no_mali.o
ifeq ($(KBUILD_EXTMOD),)
# in-tree
diff --git a/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c b/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c
index ce6d546..546e18d 100644
--- a/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c
+++ b/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c
@@ -43,7 +43,7 @@
#define COMMAND_PROTECTED_ACK ((u32)4)
#define COMMAND_RESET_ACK ((u32)5)
-/**
+/*
* Default value for the TIMER register of the IPA Control interface,
* expressed in milliseconds.
*
@@ -53,22 +53,22 @@
*/
#define TIMER_DEFAULT_VALUE_MS ((u32)10) /* 10 milliseconds */
-/**
+/*
* Number of timer events per second.
*/
#define TIMER_EVENTS_PER_SECOND ((u32)1000 / TIMER_DEFAULT_VALUE_MS)
-/**
+/*
* Maximum number of loops polling the GPU before we assume the GPU has hung.
*/
#define IPA_INACTIVE_MAX_LOOPS ((unsigned int)8000000)
-/**
+/*
* Number of bits used to configure a performance counter in SELECT registers.
*/
#define IPA_CONTROL_SELECT_BITS_PER_CNT ((u64)8)
-/**
+/*
* Maximum value of a performance counter.
*/
#define MAX_PRFCNT_VALUE (((u64)1 << 48) - 1)
@@ -251,9 +251,15 @@ static inline void calc_prfcnt_delta(struct kbase_device *kbdev,
delta_value *= prfcnt->scaling_factor;
- if (!WARN_ON_ONCE(kbdev->csf.ipa_control.cur_gpu_rate == 0))
- if (prfcnt->gpu_norm)
- delta_value = div_u64(delta_value, kbdev->csf.ipa_control.cur_gpu_rate);
+ if (kbdev->csf.ipa_control.cur_gpu_rate == 0) {
+ static bool warned;
+
+ if (!warned) {
+ dev_warn(kbdev->dev, "%s: GPU freq is unexpectedly 0", __func__);
+ warned = true;
+ }
+ } else if (prfcnt->gpu_norm)
+ delta_value = div_u64(delta_value, kbdev->csf.ipa_control.cur_gpu_rate);
prfcnt->latest_raw_value = raw_value;
@@ -791,7 +797,7 @@ int kbase_ipa_control_query(struct kbase_device *kbdev, const void *client,
ipa_ctrl = &kbdev->csf.ipa_control;
session = (struct kbase_ipa_control_session *)client;
- if (WARN_ON(!session->active)) {
+ if (!session->active) {
dev_err(kbdev->dev,
"%s: attempt to query inactive session", __func__);
return -EINVAL;
diff --git a/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.h b/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.h
index 348a52f..0469c48 100644
--- a/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.h
+++ b/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.h
@@ -24,7 +24,7 @@
#include <mali_kbase.h>
-/**
+/*
* Maximum index accepted to configure an IPA Control performance counter.
*/
#define KBASE_IPA_CONTROL_CNT_MAX_IDX ((u8)64 * 3)
diff --git a/mali_kbase/csf/mali_kbase_csf.c b/mali_kbase/csf/mali_kbase_csf.c
index 142e5a8..8b70349 100644
--- a/mali_kbase/csf/mali_kbase_csf.c
+++ b/mali_kbase/csf/mali_kbase_csf.c
@@ -33,30 +33,12 @@
#include "mali_kbase_csf_timeout.h"
#include <csf/ipa_control/mali_kbase_csf_ipa_control.h>
#include <mali_kbase_hwaccess_time.h>
+#include "mali_kbase_csf_event.h"
#define CS_REQ_EXCEPTION_MASK (CS_REQ_FAULT_MASK | CS_REQ_FATAL_MASK)
#define CS_ACK_EXCEPTION_MASK (CS_ACK_FAULT_MASK | CS_ACK_FATAL_MASK)
#define POWER_DOWN_LATEST_FLUSH_VALUE ((u32)1)
-/**
- * struct kbase_csf_event - CSF event callback.
- *
- * This structure belongs to the list of events which is part of a Kbase
- * context, and describes a callback function with a custom parameter to pass
- * to it when a CSF event is signalled.
- *
- * @link: Link to the rest of the list.
- * @kctx: Pointer to the Kbase context this event belongs to.
- * @callback: Callback function to call when a CSF event is signalled.
- * @param: Parameter to pass to the callback function.
- */
-struct kbase_csf_event {
- struct list_head link;
- struct kbase_context *kctx;
- kbase_csf_event_callback *callback;
- void *param;
-};
-
const u8 kbasep_csf_queue_group_priority_to_relative[BASE_QUEUE_GROUP_PRIORITY_COUNT] = {
KBASE_QUEUE_GROUP_PRIORITY_HIGH,
KBASE_QUEUE_GROUP_PRIORITY_MEDIUM,
@@ -530,24 +512,24 @@ static int csf_queue_register_internal(struct kbase_context *kctx,
if (reg_ex && reg_ex->ex_buffer_size) {
int buf_pages = (reg_ex->ex_buffer_size +
(1 << PAGE_SHIFT) - 1) >> PAGE_SHIFT;
+ struct kbase_va_region *region_ex =
+ kbase_region_tracker_find_region_enclosing_address(kctx,
+ reg_ex->ex_buffer_base);
- region = kbase_region_tracker_find_region_enclosing_address(
- kctx, reg_ex->ex_buffer_base);
- if (kbase_is_region_invalid_or_free(region)) {
+ if (kbase_is_region_invalid_or_free(region_ex)) {
ret = -ENOENT;
goto out_unlock_vm;
}
- if (buf_pages > (region->nr_pages -
- ((reg_ex->ex_buffer_base >> PAGE_SHIFT) -
- region->start_pfn))) {
+ if (buf_pages > (region_ex->nr_pages -
+ ((reg_ex->ex_buffer_base >> PAGE_SHIFT) - region_ex->start_pfn))) {
ret = -EINVAL;
goto out_unlock_vm;
}
- region = kbase_region_tracker_find_region_enclosing_address(
- kctx, reg_ex->ex_offset_var_addr);
- if (kbase_is_region_invalid_or_free(region)) {
+ region_ex = kbase_region_tracker_find_region_enclosing_address(
+ kctx, reg_ex->ex_offset_var_addr);
+ if (kbase_is_region_invalid_or_free(region_ex)) {
ret = -ENOENT;
goto out_unlock_vm;
}
@@ -582,6 +564,8 @@ static int csf_queue_register_internal(struct kbase_context *kctx,
queue->sb_status = 0;
queue->blocked_reason = CS_STATUS_BLOCKED_REASON_REASON_UNBLOCKED;
+ atomic_set(&queue->pending, 0);
+
INIT_LIST_HEAD(&queue->link);
INIT_LIST_HEAD(&queue->error.link);
INIT_WORK(&queue->oom_event_work, oom_event_worker);
@@ -589,6 +573,7 @@ static int csf_queue_register_internal(struct kbase_context *kctx,
list_add(&queue->link, &kctx->csf.queue_list);
region->flags |= KBASE_REG_NO_USER_FREE;
+ region->user_data = queue;
/* Initialize the cs_trace configuration parameters, When buffer_size
* is 0, trace is disabled. Here we only update the fields when
@@ -669,8 +654,6 @@ void kbase_csf_queue_terminate(struct kbase_context *kctx,
queue = find_queue(kctx, term->buffer_gpu_addr);
if (queue) {
- unsigned long flags;
-
/* As the GPU queue has been terminated by the
* user space, undo the actions that were performed when the
* queue was registered i.e. remove the queue from the per
@@ -687,19 +670,18 @@ void kbase_csf_queue_terminate(struct kbase_context *kctx,
/* After this the Userspace would be able to free the
* memory for GPU queue. In case the Userspace missed
* terminating the queue, the cleanup will happen on
- * context termination where teardown of region tracker
+ * context termination where tear down of region tracker
* would free up the GPU queue memory.
*/
queue->queue_reg->flags &= ~KBASE_REG_NO_USER_FREE;
+ queue->queue_reg->user_data = NULL;
}
kbase_gpu_vm_unlock(kctx);
- spin_lock_irqsave(&kctx->csf.event_lock, flags);
dev_dbg(kctx->kbdev->dev,
"Remove any pending command queue fatal from context %pK\n",
(void *)kctx);
- list_del_init(&queue->error.link);
- spin_unlock_irqrestore(&kctx->csf.event_lock, flags);
+ kbase_csf_event_remove_error(kctx, &queue->error);
release_queue(queue);
}
@@ -781,6 +763,48 @@ static struct kbase_queue_group *get_bound_queue_group(
return group;
}
+/**
+ * pending_submission_worker() - Work item to process pending kicked GPU command queues.
+ *
+ * @work: Pointer to pending_submission_work.
+ *
+ * This function starts all pending queues, for which the work
+ * was previously submitted via ioctl call from application thread.
+ * If the queue is already scheduled and resident, it will be started
+ * right away, otherwise once the group is made resident.
+ */
+static void pending_submission_worker(struct work_struct *work)
+{
+ struct kbase_context *kctx =
+ container_of(work, struct kbase_context, csf.pending_submission_work);
+ struct kbase_device *kbdev = kctx->kbdev;
+ struct kbase_queue *queue;
+ int err = kbase_reset_gpu_prevent_and_wait(kbdev);
+
+ if (err) {
+ dev_err(kbdev->dev, "Unsuccessful GPU reset detected when kicking queue ");
+ return;
+ }
+
+ mutex_lock(&kctx->csf.lock);
+
+ /* Iterate through the queue list and schedule the pending ones for submission. */
+ list_for_each_entry(queue, &kctx->csf.queue_list, link) {
+ if (atomic_cmpxchg(&queue->pending, 1, 0) == 1) {
+ struct kbase_queue_group *group = get_bound_queue_group(queue);
+
+ if (!group || queue->bind_state != KBASE_CSF_QUEUE_BOUND)
+ dev_dbg(kbdev->dev, "queue is not bound to a group");
+ else
+ WARN_ON(kbase_csf_scheduler_queue_start(queue));
+ }
+ }
+
+ mutex_unlock(&kctx->csf.lock);
+
+ kbase_reset_gpu_allow(kbdev);
+}
+
void kbase_csf_ring_csg_doorbell(struct kbase_device *kbdev, int slot)
{
if (WARN_ON(slot < 0))
@@ -846,40 +870,44 @@ void kbase_csf_ring_cs_kernel_doorbell(struct kbase_device *kbdev,
kbase_csf_ring_csg_doorbell(kbdev, csg_nr);
}
+static void enqueue_gpu_submission_work(struct kbase_context *const kctx)
+{
+ queue_work(system_highpri_wq, &kctx->csf.pending_submission_work);
+}
+
int kbase_csf_queue_kick(struct kbase_context *kctx,
struct kbase_ioctl_cs_queue_kick *kick)
{
struct kbase_device *kbdev = kctx->kbdev;
- struct kbase_queue_group *group;
- struct kbase_queue *queue;
+ bool trigger_submission = false;
+ struct kbase_va_region *region;
int err = 0;
- err = kbase_reset_gpu_prevent_and_wait(kbdev);
- if (err) {
- dev_warn(
- kbdev->dev,
- "Unsuccessful GPU reset detected when kicking queue (buffer_addr=0x%.16llx)",
- kick->buffer_gpu_addr);
- return err;
- }
-
- mutex_lock(&kctx->csf.lock);
- queue = find_queue(kctx, kick->buffer_gpu_addr);
- if (!queue)
- err = -EINVAL;
+	/* GPU work submission happens asynchronously to avoid contention on the
+	 * scheduler lock, which would otherwise block the application thread. For
+	 * this reason the vm_lock is used here to look up the queue, based on its
+	 * buffer_gpu_addr, from the context's list of active va_regions.
+	 * Once the target queue is found, its pending flag is set to one atomically,
+	 * avoiding a race between the submission ioctl thread and the work item.
+	 */
+ kbase_gpu_vm_lock(kctx);
+ region = kbase_region_tracker_find_region_enclosing_address(kctx, kick->buffer_gpu_addr);
+ if (!kbase_is_region_invalid_or_free(region)) {
+ struct kbase_queue *queue = region->user_data;
- if (!err) {
- group = get_bound_queue_group(queue);
- if (!group) {
- dev_err(kctx->kbdev->dev, "queue not bound\n");
- err = -EINVAL;
+ if (queue) {
+ atomic_cmpxchg(&queue->pending, 0, 1);
+ trigger_submission = true;
}
+ } else {
+ dev_dbg(kbdev->dev,
+ "Attempt to kick GPU queue without a valid command buffer region");
+ err = -EFAULT;
}
+ kbase_gpu_vm_unlock(kctx);
- if (!err)
- err = kbase_csf_scheduler_queue_start(queue);
- mutex_unlock(&kctx->csf.lock);
- kbase_reset_gpu_allow(kbdev);
+ if (likely(trigger_submission))
+ enqueue_gpu_submission_work(kctx);
return err;
}
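The reworked kick path above and pending_submission_worker() hand work over through the per-queue pending flag with a claim/consume handshake. The self-contained userspace C11 snippet below illustrates that handshake; it is not driver code and the names are illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int pending;

/* ioctl path: publish work by flipping pending 0 -> 1. */
static void kick(void)
{
	int expected = 0;

	atomic_compare_exchange_strong(&pending, &expected, 1);
}

/* work item: consume work by flipping pending 1 -> 0. */
static bool worker_claim(void)
{
	int expected = 1;

	return atomic_compare_exchange_strong(&pending, &expected, 0);
}

int main(void)
{
	kick();
	kick();                                  /* coalesced with the first kick */
	printf("claimed: %d\n", worker_claim()); /* prints 1 */
	printf("claimed: %d\n", worker_claim()); /* prints 0: nothing pending */
	return 0;
}

Because a second kick before the worker runs fails the 0 -> 1 exchange, repeated kicks coalesce, and a kick racing with the worker is neither lost nor handled twice.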
@@ -1310,6 +1338,7 @@ static int create_queue_group(struct kbase_context *const kctx,
group->doorbell_nr = KBASEP_USER_DB_NR_INVALID;
group->faulted = false;
+
group->group_uid = generate_group_uid();
create->out.group_uid = group->group_uid;
@@ -1343,6 +1372,7 @@ static int create_queue_group(struct kbase_context *const kctx,
return group_handle;
}
+
int kbase_csf_queue_group_create(struct kbase_context *const kctx,
union kbase_ioctl_cs_queue_group_create *const create)
{
@@ -1368,6 +1398,9 @@ int kbase_csf_queue_group_create(struct kbase_context *const kctx,
"No CSG has at least %d CSs",
create->in.cs_min);
err = -EINVAL;
+ } else if (create->in.reserved) {
+ dev_warn(kctx->kbdev->dev, "Reserved field was set to non-0");
+ err = -EINVAL;
} else {
/* For the CSG which satisfies the condition for having
* the needed number of CSs, check whether it also conforms
@@ -1517,6 +1550,19 @@ static void cancel_queue_group_events(struct kbase_queue_group *group)
cancel_work_sync(&group->protm_event_work);
}
+static void remove_pending_group_fatal_error(struct kbase_queue_group *group)
+{
+ struct kbase_context *kctx = group->kctx;
+
+ dev_dbg(kctx->kbdev->dev,
+ "Remove any pending group fatal error from context %pK\n",
+ (void *)group->kctx);
+
+ kbase_csf_event_remove_error(kctx, &group->error_tiler_oom);
+ kbase_csf_event_remove_error(kctx, &group->error_timeout);
+ kbase_csf_event_remove_error(kctx, &group->error_fatal);
+}
+
void kbase_csf_queue_group_terminate(struct kbase_context *kctx,
u8 group_handle)
{
@@ -1539,19 +1585,7 @@ void kbase_csf_queue_group_terminate(struct kbase_context *kctx,
group = find_queue_group(kctx, group_handle);
if (group) {
- unsigned long flags;
-
- spin_lock_irqsave(&kctx->csf.event_lock, flags);
-
- dev_dbg(kbdev->dev,
- "Remove any pending group fatal error from context %pK\n",
- (void *)group->kctx);
-
- list_del_init(&group->error_tiler_oom.link);
- list_del_init(&group->error_timeout.link);
- list_del_init(&group->error_fatal.link);
- spin_unlock_irqrestore(&kctx->csf.event_lock, flags);
-
+ remove_pending_group_fatal_error(group);
term_queue_group(group);
kctx->csf.queue_groups[group_handle] = NULL;
}
@@ -1603,48 +1637,6 @@ int kbase_csf_queue_group_suspend(struct kbase_context *kctx,
return err;
}
-/**
- * add_error() - Add an error to the list of errors to report to user space
- *
- * @kctx: Address of a base context associated with a GPU address space.
- * @error: Address of the item to be added to the context's pending error list.
- * @data: Error data to be returned to userspace.
- *
- * Does not wake up the event queue blocking a user thread in kbase_poll. This
- * is to make it more efficient to add multiple errors.
- *
- * The added error must not already be on the context's list of errors waiting
- * to be reported (e.g. because a previous error concerning the same object has
- * not yet been reported).
- */
-static void add_error(struct kbase_context *const kctx,
- struct kbase_csf_notification *const error,
- struct base_csf_notification const *const data)
-{
- unsigned long flags;
-
- if (WARN_ON(!kctx))
- return;
-
- if (WARN_ON(!error))
- return;
-
- if (WARN_ON(!data))
- return;
-
- spin_lock_irqsave(&kctx->csf.event_lock, flags);
-
- if (!WARN_ON(!list_empty(&error->link))) {
- error->data = *data;
- list_add_tail(&error->link, &kctx->csf.error_list);
- dev_dbg(kctx->kbdev->dev,
- "Added error %pK of type %d in context %pK\n",
- (void *)error, data->type, (void *)kctx);
- }
-
- spin_unlock_irqrestore(&kctx->csf.event_lock, flags);
-}
-
void kbase_csf_add_group_fatal_error(
struct kbase_queue_group *const group,
struct base_gpu_queue_group_error const *const err_payload)
@@ -1667,7 +1659,7 @@ void kbase_csf_add_group_fatal_error(
}
};
- add_error(group->kctx, &group->error_fatal, &error);
+ kbase_csf_event_add_error(group->kctx, &group->error_fatal, &error);
}
void kbase_csf_active_queue_groups_reset(struct kbase_device *kbdev,
@@ -1708,12 +1700,11 @@ int kbase_csf_ctx_init(struct kbase_context *kctx)
struct kbase_device *kbdev = kctx->kbdev;
int err = -ENOMEM;
- INIT_LIST_HEAD(&kctx->csf.event_callback_list);
INIT_LIST_HEAD(&kctx->csf.queue_list);
INIT_LIST_HEAD(&kctx->csf.link);
- INIT_LIST_HEAD(&kctx->csf.error_list);
- spin_lock_init(&kctx->csf.event_lock);
+ kbase_csf_event_init(kctx);
+
kctx->csf.user_reg_vma = NULL;
mutex_lock(&kbdev->pm.lock);
/* The inode information for /dev/malixx file is not available at the
@@ -1744,9 +1735,11 @@ int kbase_csf_ctx_init(struct kbase_context *kctx)
if (likely(!err)) {
err = kbase_csf_tiler_heap_context_init(kctx);
- if (likely(!err))
+ if (likely(!err)) {
mutex_init(&kctx->csf.lock);
- else
+ INIT_WORK(&kctx->csf.pending_submission_work,
+ pending_submission_worker);
+ } else
kbase_csf_kcpu_queue_context_term(kctx);
}
@@ -1829,7 +1822,6 @@ void kbase_csf_ctx_term(struct kbase_context *kctx)
* for queue groups & kcpu queues, hence no need to explicitly remove
* those debugfs files.
*/
- kbase_csf_event_wait_remove_all(kctx);
/* Wait for a GPU reset if it is happening, prevent it if not happening */
err = kbase_reset_gpu_prevent_and_wait(kbdev);
@@ -1841,13 +1833,20 @@ void kbase_csf_ctx_term(struct kbase_context *kctx)
else
reset_prevented = true;
+ cancel_work_sync(&kctx->csf.pending_submission_work);
+
mutex_lock(&kctx->csf.lock);
+
/* Iterate through the queue groups that were not terminated by
* userspace and issue the term request to firmware for them.
*/
for (i = 0; i < MAX_QUEUE_GROUP_NUM; i++) {
- if (kctx->csf.queue_groups[i])
- term_queue_group(kctx->csf.queue_groups[i]);
+ struct kbase_queue_group *group = kctx->csf.queue_groups[i];
+
+ if (group) {
+ remove_pending_group_fatal_error(group);
+ term_queue_group(group);
+ }
}
mutex_unlock(&kctx->csf.lock);
@@ -1910,185 +1909,19 @@ void kbase_csf_ctx_term(struct kbase_context *kctx)
kbase_csf_tiler_heap_context_term(kctx);
kbase_csf_kcpu_queue_context_term(kctx);
kbase_csf_scheduler_context_term(kctx);
+ kbase_csf_event_term(kctx);
mutex_destroy(&kctx->csf.lock);
}
-int kbase_csf_event_wait_add(struct kbase_context *kctx,
- kbase_csf_event_callback *callback, void *param)
-{
- int err = -ENOMEM;
- struct kbase_csf_event *event =
- kzalloc(sizeof(struct kbase_csf_event), GFP_KERNEL);
-
- if (event) {
- unsigned long flags;
-
- event->kctx = kctx;
- event->callback = callback;
- event->param = param;
-
- spin_lock_irqsave(&kctx->csf.event_lock, flags);
- list_add_tail(&event->link, &kctx->csf.event_callback_list);
- dev_dbg(kctx->kbdev->dev,
- "Added event handler %pK with param %pK\n", event,
- event->param);
- spin_unlock_irqrestore(&kctx->csf.event_lock, flags);
-
- err = 0;
- }
-
- return err;
-}
-
-void kbase_csf_event_wait_remove(struct kbase_context *kctx,
- kbase_csf_event_callback *callback, void *param)
-{
- struct kbase_csf_event *event;
- unsigned long flags;
-
- spin_lock_irqsave(&kctx->csf.event_lock, flags);
-
- list_for_each_entry(event, &kctx->csf.event_callback_list, link) {
- if ((event->callback == callback) && (event->param == param)) {
- list_del(&event->link);
- dev_dbg(kctx->kbdev->dev,
- "Removed event handler %pK with param %pK\n",
- event, event->param);
- kfree(event);
- break;
- }
- }
- spin_unlock_irqrestore(&kctx->csf.event_lock, flags);
-}
-
-bool kbase_csf_read_error(struct kbase_context *kctx,
- struct base_csf_notification *event_data)
-{
- bool got_event = true;
- struct kbase_csf_notification *error_data = NULL;
- unsigned long flags;
-
- spin_lock_irqsave(&kctx->csf.event_lock, flags);
-
- if (likely(!list_empty(&kctx->csf.error_list))) {
- error_data = list_first_entry(&kctx->csf.error_list,
- struct kbase_csf_notification, link);
- list_del_init(&error_data->link);
- *event_data = error_data->data;
- dev_dbg(kctx->kbdev->dev, "Dequeued error %pK in context %pK\n",
- (void *)error_data, (void *)kctx);
- } else {
- got_event = false;
- }
-
- spin_unlock_irqrestore(&kctx->csf.event_lock, flags);
-
- return got_event;
-}
-
-bool kbase_csf_error_pending(struct kbase_context *kctx)
-{
- bool event_pended = false;
- unsigned long flags;
-
- spin_lock_irqsave(&kctx->csf.event_lock, flags);
- event_pended = !list_empty(&kctx->csf.error_list);
- dev_dbg(kctx->kbdev->dev, "%s error is pending in context %pK\n",
- event_pended ? "An" : "No", (void *)kctx);
- spin_unlock_irqrestore(&kctx->csf.event_lock, flags);
-
- return event_pended;
-}
-
-static void sync_update_notify_gpu(struct kbase_context *kctx)
-{
- bool can_notify_gpu;
- unsigned long flags;
-
- spin_lock_irqsave(&kctx->kbdev->hwaccess_lock, flags);
- can_notify_gpu = kctx->kbdev->pm.backend.gpu_powered;
-#ifdef KBASE_PM_RUNTIME
- if (kctx->kbdev->pm.backend.gpu_sleep_mode_active)
- can_notify_gpu = false;
-#endif
-
- if (can_notify_gpu) {
- kbase_csf_ring_doorbell(kctx->kbdev, CSF_KERNEL_DOORBELL_NR);
- KBASE_KTRACE_ADD(kctx->kbdev, SYNC_UPDATE_EVENT_NOTIFY_GPU, kctx, 0u);
- }
-
- spin_unlock_irqrestore(&kctx->kbdev->hwaccess_lock, flags);
-}
-
-void kbase_csf_event_signal(struct kbase_context *kctx, bool notify_gpu)
-{
- struct kbase_csf_event *event, *next_event;
- unsigned long flags;
-
- dev_dbg(kctx->kbdev->dev,
- "Signal event (%s GPU notify) for context %pK\n",
- notify_gpu ? "with" : "without", (void *)kctx);
-
- /* First increment the signal count and wake up event thread.
- */
- atomic_set(&kctx->event_count, 1);
- kbase_event_wakeup(kctx);
-
- /* Signal the CSF firmware. This is to ensure that pending command
- * stream synch object wait operations are re-evaluated.
- * Write to GLB_DOORBELL would suffice as spec says that all pending
- * synch object wait operations are re-evaluated on a write to any
- * CS_DOORBELL/GLB_DOORBELL register.
- */
- if (notify_gpu)
- sync_update_notify_gpu(kctx);
-
- /* Now invoke the callbacks registered on backend side.
- * Allow item removal inside the loop, if requested by the callback.
- */
- spin_lock_irqsave(&kctx->csf.event_lock, flags);
-
- list_for_each_entry_safe(
- event, next_event, &kctx->csf.event_callback_list, link) {
- enum kbase_csf_event_callback_action action;
-
- dev_dbg(kctx->kbdev->dev,
- "Calling event handler %pK with param %pK\n",
- (void *)event, event->param);
- action = event->callback(event->param);
- if (action == KBASE_CSF_EVENT_CALLBACK_REMOVE) {
- list_del(&event->link);
- kfree(event);
- }
- }
-
- spin_unlock_irqrestore(&kctx->csf.event_lock, flags);
-}
-
-void kbase_csf_event_wait_remove_all(struct kbase_context *kctx)
-{
- struct kbase_csf_event *event, *next_event;
- unsigned long flags;
-
- spin_lock_irqsave(&kctx->csf.event_lock, flags);
-
- list_for_each_entry_safe(
- event, next_event, &kctx->csf.event_callback_list, link) {
- list_del(&event->link);
- dev_dbg(kctx->kbdev->dev,
- "Removed event handler %pK with param %pK\n",
- (void *)event, event->param);
- kfree(event);
- }
-
- spin_unlock_irqrestore(&kctx->csf.event_lock, flags);
-}
-
/**
* handle_oom_event - Handle the OoM event generated by the firmware for the
* CSI.
*
+ * @kctx: Pointer to the kbase context in which the tiler heap was initialized.
+ * @stream: Pointer to the structure containing info provided by the firmware
+ * about the CSI.
+ *
* This function will handle the OoM event request from the firmware for the
* CS. It will retrieve the address of heap context and heap's
* statistics (like number of render passes in-flight) from the CS's kernel
@@ -2097,10 +1930,6 @@ void kbase_csf_event_wait_remove_all(struct kbase_context *kctx)
* It will also update the CS's kernel input page with the address
* of a new chunk that was allocated.
*
- * @kctx: Pointer to the kbase context in which the tiler heap was initialized.
- * @stream: Pointer to the structure containing info provided by the firmware
- * about the CSI.
- *
* Return: 0 if successfully handled the request, otherwise a negative error
* code on failure.
*/
@@ -2171,7 +2000,9 @@ static void report_tiler_oom_error(struct kbase_queue_group *group)
BASE_GPU_QUEUE_GROUP_ERROR_TILER_HEAP_OOM,
} } } };
- add_error(group->kctx, &group->error_tiler_oom, &error);
+ kbase_csf_event_add_error(group->kctx,
+ &group->error_tiler_oom,
+ &error);
kbase_event_wakeup(group->kctx);
}
@@ -2316,7 +2147,7 @@ static void report_group_timeout_error(struct kbase_queue_group *const group)
"Notify the event notification thread, forward progress timeout (%llu cycles)\n",
kbase_csf_timeout_get(group->kctx->kbdev));
- add_error(group->kctx, &group->error_timeout, &error);
+ kbase_csf_event_add_error(group->kctx, &group->error_timeout, &error);
kbase_event_wakeup(group->kctx);
}
@@ -2452,7 +2283,7 @@ static void report_queue_fatal_error(struct kbase_queue *const queue,
}
};
- add_error(queue->kctx, &queue->error, &error);
+ kbase_csf_event_add_error(queue->kctx, &queue->error, &error);
kbase_event_wakeup(queue->kctx);
}
@@ -3008,6 +2839,7 @@ void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val)
if ((glb_req ^ glb_ack) & GLB_REQ_IDLE_EVENT_MASK) {
int non_idle_offslot_grps;
bool can_suspend_on_idle;
+
dev_dbg(kbdev->dev, "Idle-hysteresis event flagged");
kbase_csf_firmware_global_input_mask(
global_iface, GLB_REQ, glb_ack,
diff --git a/mali_kbase/csf/mali_kbase_csf.h b/mali_kbase/csf/mali_kbase_csf.h
index 640d2ed..e3db81d 100644
--- a/mali_kbase/csf/mali_kbase_csf.h
+++ b/mali_kbase/csf/mali_kbase_csf.h
@@ -26,6 +26,7 @@
#include "mali_kbase_csf_scheduler.h"
#include "mali_kbase_csf_firmware.h"
#include "mali_kbase_csf_protected_memory.h"
+#include "mali_kbase_hwaccess_time.h"
/* Indicate invalid CS h/w interface
*/
@@ -47,129 +48,6 @@
#define FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER (5)
/**
- * enum kbase_csf_event_callback_action - return type for CSF event callbacks.
- *
- * @KBASE_CSF_EVENT_CALLBACK_FIRST: Never set explicitly.
- * It doesn't correspond to any action or type of event callback.
- *
- * @KBASE_CSF_EVENT_CALLBACK_KEEP: The callback will remain registered.
- *
- * @KBASE_CSF_EVENT_CALLBACK_REMOVE: The callback will be removed
- * immediately upon return.
- *
- * @KBASE_CSF_EVENT_CALLBACK_LAST: Never set explicitly.
- * It doesn't correspond to any action or type of event callback.
- */
-enum kbase_csf_event_callback_action {
- KBASE_CSF_EVENT_CALLBACK_FIRST = 0,
- KBASE_CSF_EVENT_CALLBACK_KEEP,
- KBASE_CSF_EVENT_CALLBACK_REMOVE,
- KBASE_CSF_EVENT_CALLBACK_LAST,
-};
-
-/**
- * kbase_csf_event_callback_action - type for callback functions to be
- * called upon CSF events.
- *
- * This is the type of callback functions that can be registered
- * for CSF events. These function calls shall be triggered by any call
- * to kbase_csf_event_signal.
- *
- * @param: Generic parameter to pass to the callback function.
- *
- * Return: KBASE_CSF_EVENT_CALLBACK_KEEP if the callback should remain
- * registered, or KBASE_CSF_EVENT_CALLBACK_REMOVE if it should be removed.
- */
-typedef enum kbase_csf_event_callback_action kbase_csf_event_callback(void *param);
-
-/**
- * kbase_csf_event_wait_add - Add a CSF event callback
- *
- * This function adds an event callback to the list of CSF event callbacks
- * belonging to a given Kbase context, to be triggered when a CSF event is
- * signalled by kbase_csf_event_signal.
- *
- * @kctx: The Kbase context the @callback should be registered to.
- * @callback: The callback function to register.
- * @param: Custom parameter to be passed to the @callback function.
- *
- * Return: 0 on success, or negative on failure.
- */
-int kbase_csf_event_wait_add(struct kbase_context *kctx,
- kbase_csf_event_callback *callback, void *param);
-
-/**
- * kbase_csf_event_wait_remove - Remove a CSF event callback
- *
- * This function removes an event callback from the list of CSF event callbacks
- * belonging to a given Kbase context.
- *
- * @kctx: The kbase context the @callback should be removed from.
- * @callback: The callback function to remove.
- * @param: Custom parameter that would have been passed to the @p callback
- * function.
- */
-void kbase_csf_event_wait_remove(struct kbase_context *kctx,
- kbase_csf_event_callback *callback, void *param);
-
-/**
- * kbase_csf_event_wait_remove_all - Removes all CSF event callbacks
- *
- * This function empties the list of CSF event callbacks belonging to a given
- * Kbase context.
- *
- * @kctx: The kbase context for which CSF event callbacks have to be removed.
- */
-void kbase_csf_event_wait_remove_all(struct kbase_context *kctx);
-
-/**
- * kbase_csf_read_error - Read CS fatal error
- *
- * This function takes the CS fatal error from context's ordered
- * error_list, copies its contents to @event_data.
- *
- * @kctx: The kbase context to read fatal error from
- * @event_data: Caller-provided buffer to copy the fatal error to
- *
- * Return: true if fatal error is read successfully.
- */
-bool kbase_csf_read_error(struct kbase_context *kctx,
- struct base_csf_notification *event_data);
-
-/**
- * kbase_csf_error_pending - Check whether fatal error is pending
- *
- * @kctx: The kbase context to check fatal error upon.
- *
- * Return: true if fatal error is pending.
- */
-bool kbase_csf_error_pending(struct kbase_context *kctx);
-
-/**
- * kbase_csf_event_signal - Signal a CSF event
- *
- * This function triggers all the CSF event callbacks that are registered to
- * a given Kbase context, and also signals the event handling thread of
- * userspace driver waiting for the CSF event.
- *
- * @kctx: The kbase context whose CSF event callbacks shall be triggered.
- * @notify_gpu: Flag to indicate if CSF firmware should be notified of the
- * signaling of event that happened on the Driver side, either
- * the signal came from userspace or from kcpu queues.
- */
-void kbase_csf_event_signal(struct kbase_context *kctx, bool notify_gpu);
-
-static inline void kbase_csf_event_signal_notify_gpu(struct kbase_context *kctx)
-{
- kbase_csf_event_signal(kctx, true);
-}
-
-static inline void kbase_csf_event_signal_cpu_only(struct kbase_context *kctx)
-{
- kbase_csf_event_signal(kctx, false);
-}
-
-/**
* kbase_csf_ctx_init - Initialize the CSF interface for a GPU address space.
*
* @kctx: Pointer to the kbase context which is being initialized.
@@ -182,11 +60,11 @@ int kbase_csf_ctx_init(struct kbase_context *kctx);
* kbase_csf_ctx_handle_fault - Terminate queue groups & notify fault upon
* GPU bus fault, MMU page fault or similar.
*
- * This function terminates all GPU command queue groups in the context and
- * notifies the event notification thread of the fault.
- *
* @kctx: Pointer to faulty kbase context.
* @fault: Pointer to the fault.
+ *
+ * This function terminates all GPU command queue groups in the context and
+ * notifies the event notification thread of the fault.
*/
void kbase_csf_ctx_handle_fault(struct kbase_context *kctx,
struct kbase_fault *fault);
@@ -194,10 +72,10 @@ void kbase_csf_ctx_handle_fault(struct kbase_context *kctx,
/**
* kbase_csf_ctx_term - Terminate the CSF interface for a GPU address space.
*
+ * @kctx: Pointer to the kbase context which is being terminated.
+ *
* This function terminates any remaining CSGs and CSs which weren't destroyed
* before context termination.
- *
- * @kctx: Pointer to the kbase context which is being terminated.
*/
void kbase_csf_ctx_term(struct kbase_context *kctx);
@@ -246,14 +124,14 @@ void kbase_csf_queue_terminate(struct kbase_context *kctx,
* kbase_csf_alloc_command_stream_user_pages - Allocate resources for a
* GPU command queue.
*
- * This function allocates a pair of User mode input/output pages for a
- * GPU command queue and maps them in the shared interface segment of MCU
- * firmware address space. Also reserves a hardware doorbell page for the queue.
- *
* @kctx: Pointer to the kbase context within which the resources
* for the queue are being allocated.
* @queue: Pointer to the queue for which to allocate resources.
*
+ * This function allocates a pair of User mode input/output pages for a
+ * GPU command queue and maps them in the shared interface segment of MCU
+ * firmware address space. Also reserves a hardware doorbell page for the queue.
+ *
* Return: 0 on success, or negative on failure.
*/
int kbase_csf_alloc_command_stream_user_pages(struct kbase_context *kctx,
@@ -294,9 +172,9 @@ void kbase_csf_queue_unbind_stopped(struct kbase_queue *queue);
/**
* kbase_csf_queue_kick - Schedule a GPU command queue on the firmware
*
- * @kctx: The kbase context.
- * @kick: Pointer to the struct which specifies the queue
- * that needs to be scheduled.
+ * @kctx: The kbase context.
+ * @kick: Pointer to the struct which specifies the queue
+ * that needs to be scheduled.
*
* Return: 0 on success, or negative on failure.
*/
@@ -307,12 +185,12 @@ int kbase_csf_queue_kick(struct kbase_context *kctx,
* kbase_csf_queue_group_handle_is_valid - Find if the given queue group handle
* is valid.
*
- * This function is used to determine if the queue group handle is valid.
- *
* @kctx: The kbase context under which the queue group exists.
* @group_handle: Handle for the group which uniquely identifies it within
* the context with which it was created.
*
+ * This function is used to determine if the queue group handle is valid.
+ *
* Return: 0 on success, or negative on failure.
*/
int kbase_csf_queue_group_handle_is_valid(struct kbase_context *kctx,
@@ -359,8 +237,6 @@ void kbase_csf_term_descheduled_queue_group(struct kbase_queue_group *group);
/**
* kbase_csf_queue_group_suspend - Suspend a GPU command queue group
*
- * This function is used to suspend a queue group and copy the suspend buffer.
- *
* @kctx: The kbase context for which the queue group is to be
* suspended.
* @sus_buf: Pointer to the structure which contains details of the
@@ -368,6 +244,8 @@ void kbase_csf_term_descheduled_queue_group(struct kbase_queue_group *group);
* @group_handle: Handle for the group which uniquely identifies it within
* the context within which it was created.
*
+ * This function is used to suspend a queue group and copy the suspend buffer.
+ *
* Return: 0 on success or negative value if failed to suspend
* queue group and copy suspend buffer contents.
*/
@@ -397,12 +275,12 @@ void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val);
* the update of userspace mapping of HW
* doorbell page.
*
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
* The function creates a file and allocates a dummy page to facilitate the
* update of userspace mapping to point to the dummy page instead of the real
* HW doorbell page after the suspend of queue group.
*
- * @kbdev: Instance of a GPU platform device that implements a CSF interface.
- *
* Return: 0 on success, or negative on failure.
*/
int kbase_csf_doorbell_mapping_init(struct kbase_device *kbdev);
@@ -420,14 +298,14 @@ void kbase_csf_doorbell_mapping_term(struct kbase_device *kbdev);
* instead of the User register page after
* the GPU power down.
*
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
* The function allocates a dummy page which is used to replace the User
* register page in the userspace mapping after the power down of GPU.
* On the power up of GPU, the mapping is updated to point to the real
* User register page. The mapping is used to allow access to LATEST_FLUSH
* register from userspace.
*
- * @kbdev: Instance of a GPU platform device that implements a CSF interface.
- *
* Return: 0 on success, or negative on failure.
*/
int kbase_csf_setup_dummy_user_reg_page(struct kbase_device *kbdev);
@@ -443,10 +321,10 @@ void kbase_csf_free_dummy_user_reg_page(struct kbase_device *kbdev);
/**
* kbase_csf_ring_csg_doorbell - ring the doorbell for a CSG interface.
*
- * The function kicks a notification on the CSG interface to firmware.
- *
* @kbdev: Instance of a GPU platform device that implements a CSF interface.
* @slot: Index of CSG interface for ringing the door-bell.
+ *
+ * The function kicks a notification on the CSG interface to firmware.
*/
void kbase_csf_ring_csg_doorbell(struct kbase_device *kbdev, int slot);
@@ -454,10 +332,10 @@ void kbase_csf_ring_csg_doorbell(struct kbase_device *kbdev, int slot);
* kbase_csf_ring_csg_slots_doorbell - ring the doorbell for a set of CSG
* interfaces.
*
- * The function kicks a notification on a set of CSG interfaces to firmware.
- *
* @kbdev: Instance of a GPU platform device that implements a CSF interface.
* @slot_bitmap: bitmap for the given slots, slot-0 on bit-0, etc.
+ *
+ * The function kicks a notification on a set of CSG interfaces to firmware.
*/
void kbase_csf_ring_csg_slots_doorbell(struct kbase_device *kbdev,
u32 slot_bitmap);
@@ -466,9 +344,6 @@ void kbase_csf_ring_csg_slots_doorbell(struct kbase_device *kbdev,
* kbase_csf_ring_cs_kernel_doorbell - ring the kernel doorbell for a CSI
* assigned to a GPU queue
*
- * The function sends a doorbell interrupt notification to the firmware for
- * a CSI assigned to a GPU queue.
- *
* @kbdev: Instance of a GPU platform device that implements a CSF interface.
* @csi_index: ID of the CSI assigned to the GPU queue.
* @csg_nr: Index of the CSG slot assigned to the queue
@@ -479,6 +354,9 @@ void kbase_csf_ring_csg_slots_doorbell(struct kbase_device *kbdev,
* The flag is supposed be false only when the input page
* for bound GPU queues is programmed at the time of
* starting/resuming the group on a CSG slot.
+ *
+ * The function sends a doorbell interrupt notification to the firmware for
+ * a CSI assigned to a GPU queue.
*/
void kbase_csf_ring_cs_kernel_doorbell(struct kbase_device *kbdev,
int csi_index, int csg_nr,
@@ -488,11 +366,11 @@ void kbase_csf_ring_cs_kernel_doorbell(struct kbase_device *kbdev,
* kbase_csf_ring_cs_user_doorbell - ring the user doorbell allocated for a
* queue.
*
- * The function kicks a notification to the firmware on the doorbell assigned
- * to the queue.
- *
* @kbdev: Instance of a GPU platform device that implements a CSF interface.
* @queue: Pointer to the queue for ringing the door-bell.
+ *
+ * The function kicks a notification to the firmware on the doorbell assigned
+ * to the queue.
*/
void kbase_csf_ring_cs_user_doorbell(struct kbase_device *kbdev,
struct kbase_queue *queue);
@@ -563,5 +441,23 @@ static inline u8 kbase_csf_priority_queue_group_priority_to_relative(u8 priority
return kbasep_csf_queue_group_priority_to_relative[priority];
}
-
+/**
+ * kbase_csf_ktrace_gpu_cycle_cnt - Wrapper to retrieve the GPU cycle counter
+ * value for Ktrace purpose.
+ *
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
+ * This function is just a wrapper to retrieve the GPU cycle counter value, to
+ * avoid any overhead on Release builds where Ktrace is disabled by default.
+ *
+ * Return: Snapshot of the GPU cycle count register.
+ */
+static inline u64 kbase_csf_ktrace_gpu_cycle_cnt(struct kbase_device *kbdev)
+{
+#if KBASE_KTRACE_ENABLE
+ return kbase_backend_get_cycle_cnt(kbdev);
+#else
+ return 0;
+#endif
+}
#endif /* _KBASE_CSF_H_ */
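The callback API removed from this header above is relocated to mali_kbase_csf_event.h, which this patch adds but which is not shown in this hunk. Assuming the functions kept their names across the move, a hedged usage sketch of registering a callback that stays armed until explicitly removed:

/* Illustrative callback: runs on every kbase_csf_event_signal() for the
 * context it was registered against.
 */
static enum kbase_csf_event_callback_action my_event_cb(void *param)
{
	struct kbase_context *kctx = param;

	dev_dbg(kctx->kbdev->dev, "CSF event signalled\n");
	return KBASE_CSF_EVENT_CALLBACK_KEEP;
}

static int register_my_event_cb(struct kbase_context *kctx)
{
	/* Returns 0 on success, negative on failure, per the (moved) kernel-doc. */
	return kbase_csf_event_wait_add(kctx, my_event_cb, kctx);
}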
diff --git a/mali_kbase/csf/mali_kbase_csf_defs.h b/mali_kbase/csf/mali_kbase_csf_defs.h
index de471eb..0712648 100644
--- a/mali_kbase/csf/mali_kbase_csf_defs.h
+++ b/mali_kbase/csf/mali_kbase_csf_defs.h
@@ -30,6 +30,7 @@
#include <linux/wait.h>
#include "mali_kbase_csf_firmware.h"
+#include "mali_kbase_csf_event.h"
/* Maximum number of KCPU command queues to be created per GPU address space.
*/
@@ -331,6 +332,7 @@ struct kbase_csf_notification {
* queue.
* @cs_fatal_info: Records additional information about the CS fatal event.
* @cs_fatal: Records information about the CS fatal event.
+ * @pending:        Indicates whether the queue has newly submitted work.
*/
struct kbase_queue {
struct kbase_context *kctx;
@@ -364,6 +366,7 @@ struct kbase_queue {
struct work_struct fatal_event_work;
u64 cs_fatal_info;
u32 cs_fatal;
+ atomic_t pending;
};
/**
@@ -487,6 +490,7 @@ struct kbase_queue_group {
struct kbase_csf_notification error_tiler_oom;
struct work_struct timer_event_work;
+
};
/**
@@ -538,10 +542,6 @@ struct kbase_csf_cpu_queue_context {
/**
* struct kbase_csf_heap_context_allocator - Allocator of heap contexts
*
- * Heap context structures are allocated by the kernel for use by the firmware.
- * The current implementation subdivides a single GPU memory region for use as
- * a sparse array.
- *
* @kctx: Pointer to the kbase context with which this allocator is
* associated.
* @region: Pointer to a GPU memory region from which heap context structures
@@ -552,6 +552,10 @@ struct kbase_csf_cpu_queue_context {
* @lock: Lock preventing concurrent access to the @in_use bitmap.
* @in_use: Bitmap that indicates which heap context structures are currently
* allocated (in @region).
+ *
+ * Heap context structures are allocated by the kernel for use by the firmware.
+ * The current implementation subdivides a single GPU memory region for use as
+ * a sparse array.
*/
struct kbase_csf_heap_context_allocator {
struct kbase_context *kctx;
@@ -565,10 +569,6 @@ struct kbase_csf_heap_context_allocator {
* struct kbase_csf_tiler_heap_context - Object representing the tiler heaps
* context for a GPU address space.
*
- * This contains all of the CSF state relating to chunked tiler heaps for one
- * @kbase_context. It is not the same as a heap context structure allocated by
- * the kernel for use by the firmware.
- *
* @lock: Lock to prevent the concurrent access to tiler heaps (after the
* initialization), a tiler heap can be terminated whilst an OoM
* event is being handled for it.
@@ -576,6 +576,10 @@ struct kbase_csf_heap_context_allocator {
* @ctx_alloc: Allocator for heap context structures.
* @nr_of_heaps: Total number of tiler heaps that were added during the
* life time of the context.
+ *
+ * This contains all of the CSF state relating to chunked tiler heaps for one
+ * @kbase_context. It is not the same as a heap context structure allocated by
+ * the kernel for use by the firmware.
*/
struct kbase_csf_tiler_heap_context {
struct mutex lock;
@@ -617,6 +621,43 @@ struct kbase_csf_scheduler_context {
};
/**
+ * enum kbase_csf_event_callback_action - return type for CSF event callbacks.
+ *
+ * @KBASE_CSF_EVENT_CALLBACK_FIRST: Never set explicitly.
+ * It doesn't correspond to any action or type of event callback.
+ *
+ * @KBASE_CSF_EVENT_CALLBACK_KEEP: The callback will remain registered.
+ *
+ * @KBASE_CSF_EVENT_CALLBACK_REMOVE: The callback will be removed
+ * immediately upon return.
+ *
+ * @KBASE_CSF_EVENT_CALLBACK_LAST: Never set explicitly.
+ * It doesn't correspond to any action or type of event callback.
+ */
+enum kbase_csf_event_callback_action {
+ KBASE_CSF_EVENT_CALLBACK_FIRST = 0,
+ KBASE_CSF_EVENT_CALLBACK_KEEP,
+ KBASE_CSF_EVENT_CALLBACK_REMOVE,
+ KBASE_CSF_EVENT_CALLBACK_LAST,
+};
+
+/**
+ * struct kbase_csf_event - Object representing CSF event and error
+ *
+ * @callback_list: List of callbacks which are registered to serve CSF
+ * events.
+ * @error_list: List for CS fatal errors in CSF context.
+ * Link of fatal error is &struct_kbase_csf_notification.link.
+ * @lock: Lock protecting access to @callback_list and
+ * @error_list.
+ */
+struct kbase_csf_event {
+ struct list_head callback_list;
+ struct list_head error_list;
+ spinlock_t lock;
+};
+
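The new struct kbase_csf_event above groups the callback list, the error list and the lock that guards them into one object, replacing three separate members of kbase_csf_context. A minimal user-space sketch of that grouping, using a pthread mutex and bare pointers as stand-ins for the kernel's spinlock and list heads (all names hypothetical):

#include <pthread.h>
#include <stdio.h>

/* Bare singly linked node standing in for the kernel's list_head. */
struct node {
    struct node *next;
};

/*
 * Grouping the callback list, the error list and their lock into one
 * object mirrors struct kbase_csf_event: related state lives together
 * and the owning context only needs a single member.
 */
struct event_state {
    struct node *callback_list;
    struct node *error_list;
    pthread_mutex_t lock;   /* protects both lists */
};

static void event_state_init(struct event_state *ev)
{
    ev->callback_list = NULL;
    ev->error_list = NULL;
    pthread_mutex_init(&ev->lock, NULL);
}

int main(void)
{
    struct event_state ev;

    event_state_init(&ev);
    printf("callbacks empty: %d, errors empty: %d\n",
           ev.callback_list == NULL, ev.error_list == NULL);
    return 0;
}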
+/**
* struct kbase_csf_context - Object representing CSF for a GPU address space.
*
* @event_pages_head: A list of pages allocated for the event memory used by
@@ -647,10 +688,7 @@ struct kbase_csf_scheduler_context {
* userspace mapping created for them on bind operation
* hasn't been removed.
* @kcpu_queues: Kernel CPU command queues.
- * @event_lock: Lock protecting access to @event_callback_list and
- * @error_list.
- * @event_callback_list: List of callbacks which are registered to serve CSF
- * events.
+ * @event: CSF event object.
* @tiler_heaps: Chunked tiler memory heaps.
* @wq: Dedicated workqueue to process work items corresponding
* to the OoM events raised for chunked tiler heaps being
@@ -661,10 +699,7 @@ struct kbase_csf_scheduler_context {
* of the USER register page. Currently used only for sanity
* checking.
* @sched: Object representing the scheduler's context
- * @error_list: List for CS fatal errors in this context.
- * Link of fatal error is
- * &struct_kbase_csf_notification.link.
- * @event_lock needs to be held to access this list.
+ * @pending_submission_work: Work item to process pending kicked GPU command queues.
* @cpu_queue: CPU queue information. Only be available when DEBUG_FS
* is enabled.
*/
@@ -677,14 +712,13 @@ struct kbase_csf_context {
struct kbase_queue_group *queue_groups[MAX_QUEUE_GROUP_NUM];
struct list_head queue_list;
struct kbase_csf_kcpu_queue_context kcpu_queues;
- spinlock_t event_lock;
- struct list_head event_callback_list;
+ struct kbase_csf_event event;
struct kbase_csf_tiler_heap_context tiler_heaps;
struct workqueue_struct *wq;
struct list_head link;
struct vm_area_struct *user_reg_vma;
struct kbase_csf_scheduler_context sched;
- struct list_head error_list;
+ struct work_struct pending_submission_work;
#if IS_ENABLED(CONFIG_DEBUG_FS)
struct kbase_csf_cpu_queue_context cpu_queue;
#endif
@@ -882,12 +916,12 @@ struct kbase_csf_scheduler {
bool tick_timer_active;
};
-/**
+/*
* Number of GPU cycles per unit of the global progress timeout.
*/
#define GLB_PROGRESS_TIMER_TIMEOUT_SCALE ((u64)1024)
-/**
+/*
* Maximum value of the global progress timeout.
*/
#define GLB_PROGRESS_TIMER_TIMEOUT_MAX \
@@ -895,12 +929,12 @@ struct kbase_csf_scheduler {
GLB_PROGRESS_TIMER_TIMEOUT_SHIFT) * \
GLB_PROGRESS_TIMER_TIMEOUT_SCALE)
-/**
+/*
* Default GLB_PWROFF_TIMER_TIMEOUT value in unit of micro-seconds.
*/
#define DEFAULT_GLB_PWROFF_TIMEOUT_US (800)
-/**
+/*
* In typical operations, the management of the shader core power transitions
* is delegated to the MCU/firmware. However, if the host driver is configured
* to take direct control, one needs to disable the MCU firmware GLB_PWROFF
@@ -911,7 +945,7 @@ struct kbase_csf_scheduler {
/* Index of the GPU_ACTIVE counter within the CSHW counter block */
#define GPU_ACTIVE_CNT_IDX (4)
-/**
+/*
* Maximum number of sessions that can be managed by the IPA Control component.
*/
#if MALI_UNIT_TEST
@@ -937,13 +971,13 @@ enum kbase_ipa_core_type {
KBASE_IPA_CORE_TYPE_NUM
};
-/**
+/*
* Number of configurable counters per type of block on the IPA Control
* interface.
*/
#define KBASE_IPA_CONTROL_NUM_BLOCK_COUNTERS ((size_t)8)
-/**
+/*
* Total number of configurable counters existing on the IPA Control interface.
*/
#define KBASE_IPA_CONTROL_MAX_COUNTERS \
diff --git a/mali_kbase/csf/mali_kbase_csf_event.c b/mali_kbase/csf/mali_kbase_csf_event.c
new file mode 100644
index 0000000..5c86688
--- /dev/null
+++ b/mali_kbase/csf/mali_kbase_csf_event.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/*
+ *
+ * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+#include <mali_kbase.h>
+#include "mali_kbase_csf_event.h"
+
+/**
+ * struct kbase_csf_event_cb - CSF event callback.
+ *
+ * @link: Link to the rest of the list.
+ * @kctx: Pointer to the Kbase context this event belongs to.
+ * @callback: Callback function to call when a CSF event is signalled.
+ * @param: Parameter to pass to the callback function.
+ *
+ * This structure belongs to the list of events which is part of a Kbase
+ * context, and describes a callback function with a custom parameter to pass
+ * to it when a CSF event is signalled.
+ */
+struct kbase_csf_event_cb {
+ struct list_head link;
+ struct kbase_context *kctx;
+ kbase_csf_event_callback *callback;
+ void *param;
+};
+
+int kbase_csf_event_wait_add(struct kbase_context *kctx,
+ kbase_csf_event_callback *callback, void *param)
+{
+ int err = -ENOMEM;
+ struct kbase_csf_event_cb *event_cb =
+ kzalloc(sizeof(struct kbase_csf_event_cb), GFP_KERNEL);
+
+ if (event_cb) {
+ unsigned long flags;
+
+ event_cb->kctx = kctx;
+ event_cb->callback = callback;
+ event_cb->param = param;
+
+ spin_lock_irqsave(&kctx->csf.event.lock, flags);
+ list_add_tail(&event_cb->link, &kctx->csf.event.callback_list);
+ dev_dbg(kctx->kbdev->dev,
+ "Added event handler %pK with param %pK\n", event_cb,
+ event_cb->param);
+ spin_unlock_irqrestore(&kctx->csf.event.lock, flags);
+
+ err = 0;
+ }
+
+ return err;
+}
+
+void kbase_csf_event_wait_remove(struct kbase_context *kctx,
+ kbase_csf_event_callback *callback, void *param)
+{
+ struct kbase_csf_event_cb *event_cb;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kctx->csf.event.lock, flags);
+
+ list_for_each_entry(event_cb, &kctx->csf.event.callback_list, link) {
+ if ((event_cb->callback == callback) && (event_cb->param == param)) {
+ list_del(&event_cb->link);
+ dev_dbg(kctx->kbdev->dev,
+ "Removed event handler %pK with param %pK\n",
+ event_cb, event_cb->param);
+ kfree(event_cb);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&kctx->csf.event.lock, flags);
+}
+
+static void sync_update_notify_gpu(struct kbase_context *kctx)
+{
+ bool can_notify_gpu;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kctx->kbdev->hwaccess_lock, flags);
+ can_notify_gpu = kctx->kbdev->pm.backend.gpu_powered;
+#ifdef KBASE_PM_RUNTIME
+ if (kctx->kbdev->pm.backend.gpu_sleep_mode_active)
+ can_notify_gpu = false;
+#endif
+
+ if (can_notify_gpu) {
+ kbase_csf_ring_doorbell(kctx->kbdev, CSF_KERNEL_DOORBELL_NR);
+ KBASE_KTRACE_ADD(kctx->kbdev, SYNC_UPDATE_EVENT_NOTIFY_GPU, kctx, 0u);
+ }
+
+ spin_unlock_irqrestore(&kctx->kbdev->hwaccess_lock, flags);
+}
+
+void kbase_csf_event_signal(struct kbase_context *kctx, bool notify_gpu)
+{
+ struct kbase_csf_event_cb *event_cb, *next_event_cb;
+ unsigned long flags;
+
+ dev_dbg(kctx->kbdev->dev,
+ "Signal event (%s GPU notify) for context %pK\n",
+ notify_gpu ? "with" : "without", (void *)kctx);
+
+ /* First increment the signal count and wake up event thread.
+ */
+ atomic_set(&kctx->event_count, 1);
+ kbase_event_wakeup(kctx);
+
+ /* Signal the CSF firmware. This is to ensure that pending command
+ * stream synch object wait operations are re-evaluated.
+ * Write to GLB_DOORBELL would suffice as spec says that all pending
+ * synch object wait operations are re-evaluated on a write to any
+ * CS_DOORBELL/GLB_DOORBELL register.
+ */
+ if (notify_gpu)
+ sync_update_notify_gpu(kctx);
+
+ /* Now invoke the callbacks registered on backend side.
+ * Allow item removal inside the loop, if requested by the callback.
+ */
+ spin_lock_irqsave(&kctx->csf.event.lock, flags);
+
+ list_for_each_entry_safe(
+ event_cb, next_event_cb, &kctx->csf.event.callback_list, link) {
+ enum kbase_csf_event_callback_action action;
+
+ dev_dbg(kctx->kbdev->dev,
+ "Calling event handler %pK with param %pK\n",
+ (void *)event_cb, event_cb->param);
+ action = event_cb->callback(event_cb->param);
+ if (action == KBASE_CSF_EVENT_CALLBACK_REMOVE) {
+ list_del(&event_cb->link);
+ kfree(event_cb);
+ }
+ }
+
+ spin_unlock_irqrestore(&kctx->csf.event.lock, flags);
+}
+
+void kbase_csf_event_term(struct kbase_context *kctx)
+{
+ struct kbase_csf_event_cb *event_cb, *next_event_cb;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kctx->csf.event.lock, flags);
+
+ list_for_each_entry_safe(
+ event_cb, next_event_cb, &kctx->csf.event.callback_list, link) {
+ list_del(&event_cb->link);
+ dev_warn(kctx->kbdev->dev,
+ "Removed event handler %pK with param %pK\n",
+ (void *)event_cb, event_cb->param);
+ kfree(event_cb);
+ }
+
+ WARN_ON(!list_empty(&kctx->csf.event.error_list));
+
+ spin_unlock_irqrestore(&kctx->csf.event.lock, flags);
+}
+
+void kbase_csf_event_init(struct kbase_context *const kctx)
+{
+ INIT_LIST_HEAD(&kctx->csf.event.callback_list);
+ INIT_LIST_HEAD(&kctx->csf.event.error_list);
+ spin_lock_init(&kctx->csf.event.lock);
+}
+
+void kbase_csf_event_remove_error(struct kbase_context *kctx,
+ struct kbase_csf_notification *error)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&kctx->csf.event.lock, flags);
+ list_del_init(&error->link);
+ spin_unlock_irqrestore(&kctx->csf.event.lock, flags);
+}
+
+bool kbase_csf_event_read_error(struct kbase_context *kctx,
+ struct base_csf_notification *event_data)
+{
+ struct kbase_csf_notification *error_data = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kctx->csf.event.lock, flags);
+ if (likely(!list_empty(&kctx->csf.event.error_list))) {
+ error_data = list_first_entry(&kctx->csf.event.error_list,
+ struct kbase_csf_notification, link);
+ list_del_init(&error_data->link);
+ *event_data = error_data->data;
+ dev_dbg(kctx->kbdev->dev, "Dequeued error %pK in context %pK\n",
+ (void *)error_data, (void *)kctx);
+ }
+ spin_unlock_irqrestore(&kctx->csf.event.lock, flags);
+ return !!error_data;
+}
+
+void kbase_csf_event_add_error(struct kbase_context *const kctx,
+ struct kbase_csf_notification *const error,
+ struct base_csf_notification const *const data)
+{
+ unsigned long flags;
+
+ if (WARN_ON(!kctx))
+ return;
+
+ if (WARN_ON(!error))
+ return;
+
+ if (WARN_ON(!data))
+ return;
+
+ spin_lock_irqsave(&kctx->csf.event.lock, flags);
+ if (!WARN_ON(!list_empty(&error->link))) {
+ error->data = *data;
+ list_add_tail(&error->link, &kctx->csf.event.error_list);
+ dev_dbg(kctx->kbdev->dev,
+ "Added error %pK of type %d in context %pK\n",
+ (void *)error, data->type, (void *)kctx);
+ }
+ spin_unlock_irqrestore(&kctx->csf.event.lock, flags);
+}
+
+bool kbase_csf_event_error_pending(struct kbase_context *kctx)
+{
+ bool error_pending = false;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kctx->csf.event.lock, flags);
+ error_pending = !list_empty(&kctx->csf.event.error_list);
+
+ dev_dbg(kctx->kbdev->dev, "%s error is pending in context %pK\n",
+ error_pending ? "An" : "No", (void *)kctx);
+
+ spin_unlock_irqrestore(&kctx->csf.event.lock, flags);
+
+ return error_pending;
+}
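kbase_csf_event_signal() above walks the callback list with list_for_each_entry_safe() so that a callback can request its own removal mid-walk. A small user-space sketch of the same keep-or-remove callback pattern follows; it omits the spinlock the driver takes and uses invented names.

#include <stdio.h>
#include <stdlib.h>

enum cb_action { CB_KEEP, CB_REMOVE };

struct cb_node {
    struct cb_node *next;
    enum cb_action (*callback)(void *param);
    void *param;
};

/*
 * Walk the callback list the way kbase_csf_event_signal() does: the
 * "next" pointer is sampled before the callback runs, so a node that
 * asks to be removed can be freed without breaking the walk.
 */
static void signal_all(struct cb_node **head)
{
    struct cb_node **link = head;
    struct cb_node *node = *head;

    while (node) {
        struct cb_node *next = node->next;

        if (node->callback(node->param) == CB_REMOVE) {
            *link = next;   /* unlink */
            free(node);
        } else {
            link = &node->next;
        }
        node = next;
    }
}

static enum cb_action say_and_keep(void *param)
{
    printf("keep: %s\n", (const char *)param);
    return CB_KEEP;
}

static enum cb_action say_and_remove(void *param)
{
    printf("remove: %s\n", (const char *)param);
    return CB_REMOVE;
}

static struct cb_node *push(struct cb_node **head,
                            enum cb_action (*cb)(void *), void *param)
{
    struct cb_node *node = malloc(sizeof(*node));

    if (!node)
        return NULL;
    node->callback = cb;
    node->param = param;
    node->next = *head;
    *head = node;
    return node;
}

int main(void)
{
    struct cb_node *head = NULL;

    push(&head, say_and_keep, "first");
    push(&head, say_and_remove, "one-shot");
    signal_all(&head);  /* "one-shot" removes itself here */
    signal_all(&head);  /* only "first" fires again */
    return 0;
}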
diff --git a/mali_kbase/csf/mali_kbase_csf_event.h b/mali_kbase/csf/mali_kbase_csf_event.h
new file mode 100644
index 0000000..1270ef6
--- /dev/null
+++ b/mali_kbase/csf/mali_kbase_csf_event.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#ifndef _KBASE_CSF_EVENT_H_
+#define _KBASE_CSF_EVENT_H_
+
+#include <linux/types.h>
+#include <linux/wait.h>
+
+struct kbase_context;
+struct kbase_csf_event;
+enum kbase_csf_event_callback_action;
+
+/**
+ * kbase_csf_event_callback - type for callback functions to be
+ * called upon CSF events.
+ * @param: Generic parameter to pass to the callback function.
+ *
+ * This is the type of callback functions that can be registered
+ * for CSF events. These function calls shall be triggered by any call
+ * to kbase_csf_event_signal.
+ *
+ * Return: KBASE_CSF_EVENT_CALLBACK_KEEP if the callback should remain
+ * registered, or KBASE_CSF_EVENT_CALLBACK_REMOVE if it should be removed.
+ */
+typedef enum kbase_csf_event_callback_action kbase_csf_event_callback(void *param);
+
+/**
+ * kbase_csf_event_wait_add - Add a CSF event callback
+ *
+ * @kctx: The Kbase context the @callback should be registered to.
+ * @callback: The callback function to register.
+ * @param: Custom parameter to be passed to the @callback function.
+ *
+ * This function adds an event callback to the list of CSF event callbacks
+ * belonging to a given Kbase context, to be triggered when a CSF event is
+ * signalled by kbase_csf_event_signal.
+ *
+ * Return: 0 on success, or negative on failure.
+ */
+int kbase_csf_event_wait_add(struct kbase_context *kctx,
+ kbase_csf_event_callback *callback, void *param);
+
+/**
+ * kbase_csf_event_wait_remove - Remove a CSF event callback
+ *
+ * @kctx: The kbase context the @callback should be removed from.
+ * @callback: The callback function to remove.
+ * @param: Custom parameter that would have been passed to the @p callback
+ * function.
+ *
+ * This function removes an event callback from the list of CSF event callbacks
+ * belonging to a given Kbase context.
+ */
+void kbase_csf_event_wait_remove(struct kbase_context *kctx,
+ kbase_csf_event_callback *callback, void *param);
+
+/**
+ * kbase_csf_event_term - Removes all CSF event callbacks
+ *
+ * @kctx: The kbase context for which CSF event callbacks have to be removed.
+ *
+ * This function empties the list of CSF event callbacks belonging to a given
+ * Kbase context.
+ */
+void kbase_csf_event_term(struct kbase_context *kctx);
+
+/**
+ * kbase_csf_event_signal - Signal a CSF event
+ *
+ * @kctx: The kbase context whose CSF event callbacks shall be triggered.
+ * @notify_gpu: Flag to indicate if CSF firmware should be notified of the
+ *              signalling of an event that happened on the Driver side,
+ *              whether the signal came from userspace or from KCPU queues.
+ *
+ * This function triggers all the CSF event callbacks that are registered to
+ * a given Kbase context, and also signals the event handling thread of
+ * userspace driver waiting for the CSF event.
+ */
+void kbase_csf_event_signal(struct kbase_context *kctx, bool notify_gpu);
+
+static inline void kbase_csf_event_signal_notify_gpu(struct kbase_context *kctx)
+{
+ kbase_csf_event_signal(kctx, true);
+}
+
+static inline void kbase_csf_event_signal_cpu_only(struct kbase_context *kctx)
+{
+ kbase_csf_event_signal(kctx, false);
+}
+
+/**
+ * kbase_csf_event_init - Initialize event object
+ *
+ * @kctx: The kbase context whose event object will be initialized.
+ *
+ * This function initializes the event object.
+ */
+void kbase_csf_event_init(struct kbase_context *const kctx);
+
+struct kbase_csf_notification;
+struct base_csf_notification;
+/**
+ * kbase_csf_event_read_error - Read and remove an error from the event error list
+ *
+ * @kctx: The kbase context.
+ * @event_data: Caller-provided buffer to copy the fatal error to
+ *
+ * This function takes the first CS fatal error from the context's ordered
+ * error_list and copies its contents to @event_data.
+ *
+ * Return: true if an error was read out, or false if the error list is empty.
+ */
+bool kbase_csf_event_read_error(struct kbase_context *kctx,
+ struct base_csf_notification *event_data);
+
+/**
+ * kbase_csf_event_add_error - Add an error into event error list
+ *
+ * @kctx: Address of a base context associated with a GPU address space.
+ * @error: Address of the item to be added to the context's pending error list.
+ * @data: Error data to be returned to userspace.
+ *
+ * Does not wake up the event queue blocking a user thread in kbase_poll. This
+ * is to make it more efficient to add multiple errors.
+ *
+ * The added error must not already be on the context's list of errors waiting
+ * to be reported (e.g. because a previous error concerning the same object has
+ * not yet been reported).
+ *
+ */
+void kbase_csf_event_add_error(struct kbase_context *const kctx,
+ struct kbase_csf_notification *const error,
+ struct base_csf_notification const *const data);
+
+/**
+ * kbase_csf_event_remove_error - Remove an error from event error list
+ *
+ * @kctx: Address of a base context associated with a GPU address space.
+ * @error: Address of the item to be removed from the context's event error list.
+ */
+void kbase_csf_event_remove_error(struct kbase_context *kctx,
+ struct kbase_csf_notification *error);
+
+/**
+ * kbase_csf_event_error_pending - Check the error pending status
+ *
+ * @kctx: The kbase context whose fatal error status is to be checked.
+ *
+ * Return: true if there is an error in the list.
+ */
+bool kbase_csf_event_error_pending(struct kbase_context *kctx);
+#endif /* _KBASE_CSF_EVENT_H_ */
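The error-list half of this API is a guarded FIFO: add_error() queues a notification only if it is not already pending (the driver WARNs otherwise), and read_error() pops the oldest entry into a caller buffer. A self-contained user-space analogue follows, with a pthread mutex in place of the driver's spinlock, invented type names, and the already-queued case skipped silently; the event-queue wakeup is omitted.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct kbase_csf_notification. */
struct error_entry {
    struct error_entry *next;
    bool queued;        /* analogue of !list_empty(&error->link) */
    char data[32];      /* analogue of struct base_csf_notification */
};

struct error_queue {
    struct error_entry *head, *tail;
    pthread_mutex_t lock;
};

/* Append an error unless it is already pending, as add_error does. */
static void error_add(struct error_queue *q, struct error_entry *e,
                      const char *data)
{
    pthread_mutex_lock(&q->lock);
    if (!e->queued) {
        snprintf(e->data, sizeof(e->data), "%s", data);
        e->queued = true;
        e->next = NULL;
        if (q->tail)
            q->tail->next = e;
        else
            q->head = e;
        q->tail = e;
    }
    pthread_mutex_unlock(&q->lock);
}

/* Pop the oldest error into the caller's buffer; false if none pending. */
static bool error_read(struct error_queue *q, char *out, size_t out_size)
{
    struct error_entry *e;

    pthread_mutex_lock(&q->lock);
    e = q->head;
    if (e) {
        q->head = e->next;
        if (!q->head)
            q->tail = NULL;
        e->queued = false;
        snprintf(out, out_size, "%s", e->data);
    }
    pthread_mutex_unlock(&q->lock);
    return e != NULL;
}

int main(void)
{
    struct error_queue q = { .lock = PTHREAD_MUTEX_INITIALIZER };
    struct error_entry fatal = { 0 };
    char buf[32];

    error_add(&q, &fatal, "CS_FATAL");
    error_add(&q, &fatal, "CS_FATAL"); /* ignored: already queued */
    while (error_read(&q, buf, sizeof(buf)))
        printf("dequeued: %s\n", buf);
    return 0;
}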
diff --git a/mali_kbase/csf/mali_kbase_csf_firmware.c b/mali_kbase/csf/mali_kbase_csf_firmware.c
index 785555c..202c677 100644
--- a/mali_kbase/csf/mali_kbase_csf_firmware.c
+++ b/mali_kbase/csf/mali_kbase_csf_firmware.c
@@ -31,6 +31,7 @@
#include "device/mali_kbase_device.h"
#include "backend/gpu/mali_kbase_pm_internal.h"
#include "tl/mali_kbase_timeline_priv.h"
+#include "tl/mali_kbase_tracepoints.h"
#include "mali_kbase_csf_tl_reader.h"
#include "backend/gpu/mali_kbase_clk_rate_trace_mgr.h"
#include <csf/ipa_control/mali_kbase_csf_ipa_control.h>
@@ -157,8 +158,7 @@ static bool entry_optional(u32 header)
}
/**
- * struct firmware_timeline_metadata -
- * Timeline metadata item within the MCU firmware
+ * struct firmware_timeline_metadata - Timeline metadata item within the MCU firmware
*
* @node: List head linking all timeline metadata to
* kbase_device:csf.firmware_timeline_metadata.
@@ -217,10 +217,11 @@ static int wait_mcu_status_value(struct kbase_device *kbdev, u32 val)
return (max_loops == 0) ? -1 : 0;
}
-void kbase_csf_firmware_disable_mcu_wait(struct kbase_device *kbdev)
+void kbase_csf_firmware_disable_mcu(struct kbase_device *kbdev)
{
- if (wait_mcu_status_value(kbdev, MCU_CNTRL_DISABLE) < 0)
- dev_err(kbdev->dev, "MCU failed to get disabled");
+ KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_DISABLING(kbdev, kbase_backend_get_cycle_cnt(kbdev));
+
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(MCU_CONTROL), MCU_CNTRL_DISABLE);
}
static void wait_for_firmware_stop(struct kbase_device *kbdev)
@@ -229,6 +230,13 @@ static void wait_for_firmware_stop(struct kbase_device *kbdev)
/* This error shall go away once MIDJM-2371 is closed */
dev_err(kbdev->dev, "Firmware failed to stop");
}
+
+ KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_OFF(kbdev, kbase_backend_get_cycle_cnt(kbdev));
+}
+
+void kbase_csf_firmware_disable_mcu_wait(struct kbase_device *kbdev)
+{
+ wait_for_firmware_stop(kbdev);
}
static void stop_csf_firmware(struct kbase_device *kbdev)
@@ -463,16 +471,16 @@ out:
/**
* parse_memory_setup_entry() - Process an "interface memory setup" section
*
+ * @kbdev: Kbase device structure
+ * @fw: The firmware image containing the section
+ * @entry: Pointer to the start of the section
+ * @size: Size (in bytes) of the section
+ *
* Read an "interface memory setup" section from the firmware image and create
* the necessary memory region including the MMU page tables. If successful
* the interface will be added to the kbase_device:csf.firmware_interfaces list.
*
* Return: 0 if successful, negative error code on failure
- *
- * @kbdev: Kbase device structure
- * @fw: The firmware image containing the section
- * @entry: Pointer to the start of the section
- * @size: Size (in bytes) of the section
*/
static int parse_memory_setup_entry(struct kbase_device *kbdev,
const struct firmware *fw,
@@ -724,6 +732,11 @@ static int parse_timeline_metadata_entry(struct kbase_device *kbdev,
/**
* load_firmware_entry() - Process an entry from a firmware image
*
+ * @kbdev: Kbase device
+ * @fw: Firmware image containing the entry
+ * @offset: Byte offset within the image of the entry to load
+ * @header: Header word of the entry
+ *
* Read an entry from a firmware image and do any necessary work (e.g. loading
* the data into page accessible to the MCU).
*
@@ -731,11 +744,6 @@ static int parse_timeline_metadata_entry(struct kbase_device *kbdev,
* otherwise the function will fail with -EINVAL
*
* Return: 0 if successful, negative error code on failure
- *
- * @kbdev: Kbase device
- * @fw: Firmware image containing the entry
- * @offset: Byte offset within the image of the entry to load
- * @header: Header word of the entry
*/
static int load_firmware_entry(struct kbase_device *kbdev,
const struct firmware *fw,
@@ -784,18 +792,6 @@ static int load_firmware_entry(struct kbase_device *kbdev,
}
return kbase_csf_firmware_cfg_option_entry_parse(
kbdev, fw, entry, size, updatable);
- case CSF_FIRMWARE_ENTRY_TYPE_FUTF_TEST:
-#ifndef MALI_KBASE_BUILD
- /* FW UTF option */
- if (size < 2*sizeof(*entry)) {
- dev_err(kbdev->dev, "FW UTF entry too short (size=%u)\n",
- size);
- return -EINVAL;
- }
- return mali_kutf_process_fw_utf_entry(kbdev, fw->data,
- fw->size, entry);
-#endif
- break;
case CSF_FIRMWARE_ENTRY_TYPE_TRACE_BUFFER:
/* Trace buffer */
if (size < TRACE_BUFFER_ENTRY_NAME_OFFSET + sizeof(*entry)) {
@@ -1170,6 +1166,7 @@ u32 kbase_csf_firmware_csg_output(
dev_dbg(kbdev->dev, "csg output r: reg %08x val %08x\n", offset, val);
return val;
}
+KBASE_EXPORT_TEST_API(kbase_csf_firmware_csg_output);
void kbase_csf_firmware_global_input(
const struct kbase_csf_global_iface *const iface, const u32 offset,
@@ -1180,6 +1177,7 @@ void kbase_csf_firmware_global_input(
dev_dbg(kbdev->dev, "glob input w: reg %08x val %08x\n", offset, value);
input_page_write(iface->input, offset, value);
}
+KBASE_EXPORT_TEST_API(kbase_csf_firmware_global_input);
void kbase_csf_firmware_global_input_mask(
const struct kbase_csf_global_iface *const iface, const u32 offset,
@@ -1191,6 +1189,7 @@ void kbase_csf_firmware_global_input_mask(
offset, value, mask);
input_page_partial_write(iface->input, offset, value, mask);
}
+KBASE_EXPORT_TEST_API(kbase_csf_firmware_global_input_mask);
u32 kbase_csf_firmware_global_input_read(
const struct kbase_csf_global_iface *const iface, const u32 offset)
@@ -1211,6 +1210,7 @@ u32 kbase_csf_firmware_global_output(
dev_dbg(kbdev->dev, "glob output r: reg %08x val %08x\n", offset, val);
return val;
}
+KBASE_EXPORT_TEST_API(kbase_csf_firmware_global_output);
/**
* handle_internal_firmware_fatal - Handler for CS internal firmware fault.
@@ -1484,8 +1484,7 @@ bool kbase_csf_firmware_core_attr_updated(struct kbase_device *kbdev)
}
/**
- * kbase_csf_firmware_reload_worker() -
- * reload the fw image and re-enable the MCU
+ * kbase_csf_firmware_reload_worker() - reload the fw image and re-enable the MCU
* @work: CSF Work item for reloading the firmware.
*
* This helper function will reload the firmware image and re-enable the MCU.
@@ -1505,6 +1504,8 @@ static void kbase_csf_firmware_reload_worker(struct work_struct *work)
dev_info(kbdev->dev, "reloading firmware");
+ KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_RELOADING(kbdev, kbase_backend_get_cycle_cnt(kbdev));
+
/* Reload just the data sections from firmware binary image */
err = reload_fw_data_sections(kbdev);
if (err)
@@ -2017,10 +2018,6 @@ void kbase_csf_firmware_term(struct kbase_device *kbdev)
kfree(metadata);
}
-#ifndef MALI_KBASE_BUILD
- mali_kutf_fw_utf_entry_cleanup(kbdev);
-#endif
-
/* This will also free up the region allocated for the shared interface
* entry parsed from the firmware image.
*/
@@ -2144,6 +2141,8 @@ void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev)
struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface;
unsigned long flags;
+ KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_HALT(kbdev, kbase_backend_get_cycle_cnt(kbdev));
+
kbase_csf_scheduler_spin_lock(kbdev, &flags);
/* Validate there are no on-slot groups when sending the
* halt request to firmware.
@@ -2155,12 +2154,25 @@ void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev)
kbase_csf_scheduler_spin_unlock(kbdev, flags);
}
+void kbase_csf_firmware_enable_mcu(struct kbase_device *kbdev)
+{
+ KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_ENABLING(kbdev, kbase_backend_get_cycle_cnt(kbdev));
+
+ /* Trigger the boot of MCU firmware, Use the AUTO mode as
+ * otherwise on fast reset, to exit protected mode, MCU will
+ * not reboot by itself to enter normal mode.
+ */
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(MCU_CONTROL), MCU_CNTRL_AUTO);
+}
+
#ifdef KBASE_PM_RUNTIME
void kbase_csf_firmware_trigger_mcu_sleep(struct kbase_device *kbdev)
{
struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface;
unsigned long flags;
+ KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_SLEEP(kbdev, kbase_backend_get_cycle_cnt(kbdev));
+
kbase_csf_scheduler_spin_lock(kbdev, &flags);
set_global_request(global_iface, GLB_REQ_SLEEP_MASK);
dev_dbg(kbdev->dev, "Sending sleep request to MCU");
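Several hunks above bracket MCU state transitions with timeline events (FW_ENABLING, FW_DISABLING, FW_RELOADING, FW_REQUEST_HALT, FW_REQUEST_SLEEP), each stamped with the current GPU cycle count before the controlling register write or request is issued. A minimal sketch of that trace-then-write ordering, using hypothetical register and helper names rather than the driver's API:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the MCU control register and its values. */
static uint32_t mcu_control_reg;
#define MCU_CNTRL_DISABLE 0u
#define MCU_CNTRL_AUTO    1u

static uint64_t cycle_counter;

/* Trace hook analogue: record the transition with a cycle-count stamp. */
static void trace_fw_event(const char *what)
{
    cycle_counter += 123;   /* pretend the GPU counter advanced */
    printf("[%llu] %s\n", (unsigned long long)cycle_counter, what);
}

/*
 * Mirrors the shape of the enable/disable helpers after this patch:
 * emit the timeline event first, then perform the register write that
 * actually triggers the transition.
 */
static void fw_enable_mcu(void)
{
    trace_fw_event("FW_ENABLING");
    mcu_control_reg = MCU_CNTRL_AUTO;
}

static void fw_disable_mcu(void)
{
    trace_fw_event("FW_DISABLING");
    mcu_control_reg = MCU_CNTRL_DISABLE;
}

int main(void)
{
    fw_enable_mcu();
    fw_disable_mcu();
    printf("MCU_CONTROL = %u\n", (unsigned)mcu_control_reg);
    return 0;
}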
diff --git a/mali_kbase/csf/mali_kbase_csf_firmware.h b/mali_kbase/csf/mali_kbase_csf_firmware.h
index 0edcc30..f4ce33c 100644
--- a/mali_kbase/csf/mali_kbase_csf_firmware.h
+++ b/mali_kbase/csf/mali_kbase_csf_firmware.h
@@ -346,14 +346,14 @@ static inline void kbase_csf_ring_doorbell(struct kbase_device *kbdev,
/**
* kbase_csf_read_firmware_memory - Read a value in a GPU address
*
+ * @kbdev: Device pointer
+ * @gpu_addr: GPU address to read
+ * @value: output pointer to which the read value will be written.
+ *
* This function read a value in a GPU address that belongs to
* a private firmware memory region. The function assumes that the location
* is not permanently mapped on the CPU address space, therefore it maps it
* and then unmaps it to access it independently.
- *
- * @kbdev: Device pointer
- * @gpu_addr: GPU address to read
- * @value: output pointer to which the read value will be written.
*/
void kbase_csf_read_firmware_memory(struct kbase_device *kbdev,
u32 gpu_addr, u32 *value);
@@ -361,14 +361,14 @@ void kbase_csf_read_firmware_memory(struct kbase_device *kbdev,
/**
* kbase_csf_update_firmware_memory - Write a value in a GPU address
*
+ * @kbdev: Device pointer
+ * @gpu_addr: GPU address to write
+ * @value: Value to write
+ *
* This function writes a given value in a GPU address that belongs to
* a private firmware memory region. The function assumes that the destination
* is not permanently mapped on the CPU address space, therefore it maps it
* and then unmaps it to access it independently.
- *
- * @kbdev: Device pointer
- * @gpu_addr: GPU address to write
- * @value: Value to write
*/
void kbase_csf_update_firmware_memory(struct kbase_device *kbdev,
u32 gpu_addr, u32 value);
@@ -404,20 +404,20 @@ void kbase_csf_firmware_term(struct kbase_device *kbdev);
/**
* kbase_csf_firmware_ping - Send the ping request to firmware.
*
- * The function sends the ping request to firmware.
- *
* @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
+ * The function sends the ping request to firmware.
*/
void kbase_csf_firmware_ping(struct kbase_device *kbdev);
/**
* kbase_csf_firmware_ping_wait - Send the ping request to firmware and waits.
*
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
* The function sends the ping request to firmware and waits to confirm it is
* alive.
*
- * @kbdev: Instance of a GPU platform device that implements a CSF interface.
- *
* Return: 0 on success, or negative on failure.
*/
int kbase_csf_firmware_ping_wait(struct kbase_device *kbdev);
@@ -462,8 +462,12 @@ void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev);
static inline bool kbase_csf_firmware_mcu_halted(struct kbase_device *kbdev)
{
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ return true;
+#else
return (kbase_reg_read(kbdev, GPU_CONTROL_REG(MCU_STATUS)) ==
MCU_STATUS_HALTED);
+#endif /* CONFIG_MALI_NO_MALI */
}
/**
@@ -481,24 +485,14 @@ void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev);
*
* @kbdev: Instance of a GPU platform device that implements a CSF interface.
*/
-static inline void kbase_csf_firmware_enable_mcu(struct kbase_device *kbdev)
-{
- /* Trigger the boot of MCU firmware, Use the AUTO mode as
- * otherwise on fast reset, to exit protected mode, MCU will
- * not reboot by itself to enter normal mode.
- */
- kbase_reg_write(kbdev, GPU_CONTROL_REG(MCU_CONTROL), MCU_CNTRL_AUTO);
-}
+void kbase_csf_firmware_enable_mcu(struct kbase_device *kbdev);
/**
* kbase_csf_firmware_disable_mcu - Send the command to disable MCU
*
* @kbdev: Instance of a GPU platform device that implements a CSF interface.
*/
-static inline void kbase_csf_firmware_disable_mcu(struct kbase_device *kbdev)
-{
- kbase_reg_write(kbdev, GPU_CONTROL_REG(MCU_CONTROL), MCU_CNTRL_DISABLE);
-}
+void kbase_csf_firmware_disable_mcu(struct kbase_device *kbdev);
/**
* kbase_csf_firmware_disable_mcu_wait - Wait for the MCU to reach disabled
@@ -560,9 +554,9 @@ void kbase_csf_firmware_global_reinit(struct kbase_device *kbdev,
* requests, sent after the reboot of MCU firmware, have
* completed or not.
*
- * Return: true if the Global configuration requests completed otherwise false.
- *
* @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
+ * Return: true if the Global configuration requests completed otherwise false.
*/
bool kbase_csf_firmware_global_reinit_complete(struct kbase_device *kbdev);
@@ -587,17 +581,16 @@ void kbase_csf_firmware_update_core_attr(struct kbase_device *kbdev,
* request has completed or not, that was sent to update
* the core attributes.
*
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
* Return: true if the Global configuration request to update the core
* attributes has completed, otherwise false.
- *
- * @kbdev: Instance of a GPU platform device that implements a CSF interface.
*/
bool kbase_csf_firmware_core_attr_updated(struct kbase_device *kbdev);
/**
- * Request the global control block of CSF interface capabilities
- *
- * Return: Total number of CSs, summed across all groups.
+ * kbase_csf_firmware_get_glb_iface - Request the global control block of CSF
+ * interface capabilities
*
* @kbdev: Kbase device.
* @group_data: Pointer where to store all the group data
@@ -620,6 +613,8 @@ bool kbase_csf_firmware_core_attr_updated(struct kbase_device *kbdev);
* @instr_features: Instrumentation features. Bits 7:4 hold the max size
* of events. Bits 3:0 hold the offset update rate.
* (csf >= 1,1,0)
+ *
+ * Return: Total number of CSs, summed across all groups.
*/
u32 kbase_csf_firmware_get_glb_iface(
struct kbase_device *kbdev, struct basep_cs_group_control *group_data,
@@ -628,20 +623,26 @@ u32 kbase_csf_firmware_get_glb_iface(
u32 *group_num, u32 *prfcnt_size, u32 *instr_features);
/**
- * Get CSF firmware header timeline metadata content
- *
- * Return: The firmware timeline metadata content which match @p name.
+ * kbase_csf_firmware_get_timeline_metadata - Get CSF firmware header timeline
+ * metadata content
*
* @kbdev: Kbase device.
* @name: Name of the metadata which metadata content to be returned.
* @size: Metadata size if specified metadata found.
+ *
+ * Return: The firmware timeline metadata content which match @p name.
*/
const char *kbase_csf_firmware_get_timeline_metadata(struct kbase_device *kbdev,
const char *name, size_t *size);
/**
- * kbase_csf_firmware_mcu_shared_mapping_init -
- * Allocate and map MCU shared memory.
+ * kbase_csf_firmware_mcu_shared_mapping_init - Allocate and map MCU shared memory.
+ *
+ * @kbdev: Kbase device the memory mapping shall belong to.
+ * @num_pages: Number of memory pages to map.
+ * @cpu_map_properties: Either PROT_READ or PROT_WRITE.
+ * @gpu_map_properties: Either KBASE_REG_GPU_RD or KBASE_REG_GPU_WR.
+ * @csf_mapping: Object where to write metadata for the memory mapping.
*
* This helper function allocates memory and maps it on both the CPU
* and the GPU address spaces. Most of the properties of the mapping
@@ -653,12 +654,6 @@ const char *kbase_csf_firmware_get_timeline_metadata(struct kbase_device *kbdev,
* will be ignored by the function.
*
* Return: 0 if success, or an error code on failure.
- *
- * @kbdev: Kbase device the memory mapping shall belong to.
- * @num_pages: Number of memory pages to map.
- * @cpu_map_properties: Either PROT_READ or PROT_WRITE.
- * @gpu_map_properties: Either KBASE_REG_GPU_RD or KBASE_REG_GPU_WR.
- * @csf_mapping: Object where to write metadata for the memory mapping.
*/
int kbase_csf_firmware_mcu_shared_mapping_init(
struct kbase_device *kbdev,
@@ -676,35 +671,6 @@ int kbase_csf_firmware_mcu_shared_mapping_init(
void kbase_csf_firmware_mcu_shared_mapping_term(
struct kbase_device *kbdev, struct kbase_csf_mapping *csf_mapping);
-#ifndef MALI_KBASE_BUILD
-/**
- * mali_kutf_process_fw_utf_entry() - Process the "Firmware UTF tests" section
- *
- * Read "Firmware UTF tests" section from the firmware image and create
- * necessary kutf app+suite+tests.
- *
- * Return: 0 if successful, negative error code on failure. In both cases
- * caller will have to invoke mali_kutf_fw_utf_entry_cleanup for the cleanup
- *
- * @kbdev: Kbase device structure
- * @fw_data: Pointer to the start of firmware binary image loaded from disk
- * @fw_size: Size (in bytes) of the firmware image
- * @entry: Pointer to the start of the section
- */
-int mali_kutf_process_fw_utf_entry(struct kbase_device *kbdev,
- const void *fw_data, size_t fw_size, const u32 *entry);
-
-/**
- * mali_kutf_fw_utf_entry_cleanup() - Remove the Fw UTF tests debugfs entries
- *
- * Destroy the kutf apps+suites+tests created on parsing "Firmware UTF tests"
- * section from the firmware image.
- *
- * @kbdev: Kbase device structure
- */
-void mali_kutf_fw_utf_entry_cleanup(struct kbase_device *kbdev);
-#endif
-
#ifdef CONFIG_MALI_DEBUG
extern bool fw_debug;
#endif
@@ -722,11 +688,11 @@ static inline long kbase_csf_timeout_in_jiffies(const unsigned int msecs)
* kbase_csf_firmware_enable_gpu_idle_timer() - Activate the idle hysteresis
* monitoring operation
*
+ * @kbdev: Kbase device structure
+ *
* Program the firmware interface with its configured hysteresis count value
* and enable the firmware to act on it. The Caller is
* assumed to hold the kbdev->csf.scheduler.interrupt_lock.
- *
- * @kbdev: Kbase device structure
*/
void kbase_csf_firmware_enable_gpu_idle_timer(struct kbase_device *kbdev);
@@ -734,10 +700,10 @@ void kbase_csf_firmware_enable_gpu_idle_timer(struct kbase_device *kbdev);
* kbase_csf_firmware_disable_gpu_idle_timer() - Disable the idle time
* hysteresis monitoring operation
*
+ * @kbdev: Kbase device structure
+ *
* Program the firmware interface to disable the idle hysteresis timer. The
* Caller is assumed to hold the kbdev->csf.scheduler.interrupt_lock.
- *
- * @kbdev: Kbase device structure
*/
void kbase_csf_firmware_disable_gpu_idle_timer(struct kbase_device *kbdev);
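Many of the hunks in this header (and in the files that follow) only reorder kernel-doc so that the parameter list directly follows the one-line summary and the Return: section comes last. For reference, a comment laid out in that expected order might look like the illustrative example below; the function and parameter names are invented.

/**
 * example_function() - One-line summary of what the function does
 * @dev:   First parameter, documented right after the summary line.
 * @value: Second parameter.
 *
 * The free-form description follows the parameter list, and the
 * Return: section comes last.
 *
 * Return: 0 on success, or a negative error code on failure.
 */
static int example_function(void *dev, unsigned int value)
{
    (void)dev;
    (void)value;
    return 0;
}

int main(void)
{
    return example_function(0, 0);
}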
diff --git a/mali_kbase/csf/mali_kbase_csf_firmware_cfg.c b/mali_kbase/csf/mali_kbase_csf_firmware_cfg.c
index f00acb1..70bf26a 100644
--- a/mali_kbase/csf/mali_kbase_csf_firmware_cfg.c
+++ b/mali_kbase/csf/mali_kbase_csf_firmware_cfg.c
@@ -29,10 +29,6 @@
/**
* struct firmware_config - Configuration item within the MCU firmware
*
- * The firmware may expose configuration options. Each option has a name, the
- * address where the option is controlled and the minimum and maximum values
- * that the option can take.
- *
* @node: List head linking all options to
* kbase_device:csf.firmware_config
* @kbdev: Pointer to the Kbase device
@@ -47,6 +43,10 @@
* @min: The lowest legal value of the configuration option
* @max: The maximum legal value of the configuration option
* @cur_val: The current value of the configuration option
+ *
+ * The firmware may expose configuration options. Each option has a name, the
+ * address where the option is controlled and the minimum and maximum values
+ * that the option can take.
*/
struct firmware_config {
struct list_head node;
diff --git a/mali_kbase/csf/mali_kbase_csf_firmware_cfg.h b/mali_kbase/csf/mali_kbase_csf_firmware_cfg.h
index 080c154..c2d2fc5 100644
--- a/mali_kbase/csf/mali_kbase_csf_firmware_cfg.h
+++ b/mali_kbase/csf/mali_kbase_csf_firmware_cfg.h
@@ -32,12 +32,12 @@
* kbase_csf_firmware_cfg_init - Create the sysfs directory for configuration
* options present in firmware image.
*
+ * @kbdev: Pointer to the Kbase device
+ *
* This function would create a sysfs directory and populate it with a
* sub-directory, that would contain a file per attribute, for every
* configuration option parsed from firmware image.
*
- * @kbdev: Pointer to the Kbase device
- *
* Return: The initialization error code.
*/
int kbase_csf_firmware_cfg_init(struct kbase_device *kbdev);
@@ -55,16 +55,16 @@ void kbase_csf_firmware_cfg_term(struct kbase_device *kbdev);
* kbase_csf_firmware_cfg_option_entry_parse() - Process a
* "configuration option" section.
*
- * Read a "configuration option" section adding it to the
- * kbase_device:csf.firmware_config list.
- *
- * Return: 0 if successful, negative error code on failure
- *
* @kbdev: Kbase device structure
* @fw: Firmware image containing the section
* @entry: Pointer to the section
* @size: Size (in bytes) of the section
* @updatable: Indicates if entry can be updated with FIRMWARE_CONFIG_UPDATE
+ *
+ * Read a "configuration option" section adding it to the
+ * kbase_device:csf.firmware_config list.
+ *
+ * Return: 0 if successful, negative error code on failure
*/
int kbase_csf_firmware_cfg_option_entry_parse(struct kbase_device *kbdev,
const struct firmware *fw,
diff --git a/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c b/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c
index e99c968..6f61631 100644
--- a/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c
+++ b/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c
@@ -136,13 +136,13 @@ static inline void output_page_write(u32 *const output, const u32 offset,
/**
* invent_memory_setup_entry() - Invent an "interface memory setup" section
*
+ * @kbdev: Kbase device structure
+ *
* Invent an "interface memory setup" section similar to one from a firmware
* image. If successful the interface will be added to the
* kbase_device:csf.firmware_interfaces list.
*
* Return: 0 if successful, negative error code on failure
- *
- * @kbdev: Kbase device structure
*/
static int invent_memory_setup_entry(struct kbase_device *kbdev)
{
@@ -371,6 +371,7 @@ u32 kbase_csf_firmware_csg_output(
dev_dbg(kbdev->dev, "csg output r: reg %08x val %08x\n", offset, val);
return val;
}
+KBASE_EXPORT_TEST_API(kbase_csf_firmware_csg_output);
static void
csf_firmware_prfcnt_process(const struct kbase_csf_global_iface *const iface,
@@ -418,6 +419,7 @@ void kbase_csf_firmware_global_input(
output_page_write(iface->output, GLB_ACK, value);
}
}
+KBASE_EXPORT_TEST_API(kbase_csf_firmware_global_input);
void kbase_csf_firmware_global_input_mask(
const struct kbase_csf_global_iface *const iface, const u32 offset,
@@ -431,6 +433,7 @@ void kbase_csf_firmware_global_input_mask(
/* NO_MALI: Go through kbase_csf_firmware_global_input to capture writes */
kbase_csf_firmware_global_input(iface, offset, (input_page_read(iface->input, offset) & ~mask) | (value & mask));
}
+KBASE_EXPORT_TEST_API(kbase_csf_firmware_global_input_mask);
u32 kbase_csf_firmware_global_input_read(
const struct kbase_csf_global_iface *const iface, const u32 offset)
@@ -451,6 +454,7 @@ u32 kbase_csf_firmware_global_output(
dev_dbg(kbdev->dev, "glob output r: reg %08x val %08x\n", offset, val);
return val;
}
+KBASE_EXPORT_TEST_API(kbase_csf_firmware_global_output);
/**
* handle_internal_firmware_fatal - Handler for CS internal firmware fault.
@@ -1020,10 +1024,6 @@ void kbase_csf_firmware_term(struct kbase_device *kbdev)
/* NO_MALI: No trace buffers to terminate */
-#ifndef MALI_KBASE_BUILD
- mali_kutf_fw_utf_entry_cleanup(kbdev);
-#endif
-
mutex_destroy(&kbdev->csf.reg_lock);
/* This will also free up the region allocated for the shared interface
@@ -1154,6 +1154,15 @@ void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev)
kbase_csf_scheduler_spin_unlock(kbdev, flags);
}
+void kbase_csf_firmware_enable_mcu(struct kbase_device *kbdev)
+{
+ /* Trigger the boot of MCU firmware, Use the AUTO mode as
+ * otherwise on fast reset, to exit protected mode, MCU will
+ * not reboot by itself to enter normal mode.
+ */
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(MCU_CONTROL), MCU_CNTRL_AUTO);
+}
+
#ifdef KBASE_PM_RUNTIME
void kbase_csf_firmware_trigger_mcu_sleep(struct kbase_device *kbdev)
{
@@ -1290,6 +1299,11 @@ const char *kbase_csf_firmware_get_timeline_metadata(
return NULL;
}
+void kbase_csf_firmware_disable_mcu(struct kbase_device *kbdev)
+{
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(MCU_CONTROL), MCU_CNTRL_DISABLE);
+}
+
void kbase_csf_firmware_disable_mcu_wait(struct kbase_device *kbdev)
{
/* NO_MALI: Nothing to do here */
diff --git a/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.h b/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.h
index 993db63..9aab7ab 100644
--- a/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.h
+++ b/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.h
@@ -47,11 +47,11 @@ void kbase_csf_heap_context_allocator_term(
/**
* kbase_csf_heap_context_allocator_alloc - Allocate a heap context structure
*
+ * @ctx_alloc: Pointer to the heap context allocator.
+ *
* If this function is successful then it returns the address of a
* zero-initialized heap context structure for use by the firmware.
*
- * @ctx_alloc: Pointer to the heap context allocator.
- *
* Return: GPU virtual address of the allocated heap context or 0 on failure.
*/
u64 kbase_csf_heap_context_allocator_alloc(
@@ -60,13 +60,13 @@ u64 kbase_csf_heap_context_allocator_alloc(
/**
* kbase_csf_heap_context_allocator_free - Free a heap context structure
*
- * This function returns a heap context structure to the free pool of unused
- * contexts for possible reuse by a future call to
- * @kbase_csf_heap_context_allocator_alloc.
- *
* @ctx_alloc: Pointer to the heap context allocator.
* @heap_gpu_va: The GPU virtual address of a heap context structure that
* was allocated for the firmware.
+ *
+ * This function returns a heap context structure to the free pool of unused
+ * contexts for possible reuse by a future call to
+ * @kbase_csf_heap_context_allocator_alloc.
*/
void kbase_csf_heap_context_allocator_free(
struct kbase_csf_heap_context_allocator *const ctx_alloc,
diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.c b/mali_kbase/csf/mali_kbase_csf_kcpu.c
index 8729307..05a4fa0 100644
--- a/mali_kbase/csf/mali_kbase_csf_kcpu.c
+++ b/mali_kbase/csf/mali_kbase_csf_kcpu.c
@@ -190,6 +190,12 @@ static void kbase_jit_add_to_pending_alloc_list(
*
* @queue: The queue containing this JIT allocation
* @cmd: The JIT allocation command
+ *
+ * Return:
+ * * 0 - allocation OK
+ * * -EINVAL - missing info or JIT ID still in use
+ * * -EAGAIN - Retry
+ * * -ENOMEM - out of memory, unable to allocate
*/
static int kbase_kcpu_jit_allocate_process(
struct kbase_kcpu_command_queue *queue,
@@ -289,8 +295,8 @@ static int kbase_kcpu_jit_allocate_process(
* Write the address of the JIT allocation to the user provided
* GPU allocation.
*/
- ptr = kbase_vmap(kctx, info->gpu_alloc_addr, sizeof(*ptr),
- &mapping);
+ ptr = kbase_vmap_prot(kctx, info->gpu_alloc_addr, sizeof(*ptr),
+ KBASE_REG_CPU_WR, &mapping);
if (!ptr) {
ret = -ENOMEM;
goto fail;
@@ -570,9 +576,11 @@ static int kbase_csf_queue_group_suspend_prepare(
{
struct kbase_context *const kctx = kcpu_queue->kctx;
struct kbase_suspend_copy_buffer *sus_buf = NULL;
+ const u32 csg_suspend_buf_size =
+ kctx->kbdev->csf.global_iface.groups[0].suspend_size;
u64 addr = suspend_buf->buffer;
u64 page_addr = addr & PAGE_MASK;
- u64 end_addr = addr + suspend_buf->size - 1;
+ u64 end_addr = addr + csg_suspend_buf_size - 1;
u64 last_page_addr = end_addr & PAGE_MASK;
int nr_pages = (last_page_addr - page_addr) / PAGE_SIZE + 1;
int pinned_pages = 0, ret = 0;
@@ -580,8 +588,7 @@ static int kbase_csf_queue_group_suspend_prepare(
lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
- if (suspend_buf->size <
- kctx->kbdev->csf.global_iface.groups[0].suspend_size)
+ if (suspend_buf->size < csg_suspend_buf_size)
return -EINVAL;
ret = kbase_csf_queue_group_handle_is_valid(kctx,
@@ -593,7 +600,7 @@ static int kbase_csf_queue_group_suspend_prepare(
if (!sus_buf)
return -ENOMEM;
- sus_buf->size = suspend_buf->size;
+ sus_buf->size = csg_suspend_buf_size;
sus_buf->nr_pages = nr_pages;
sus_buf->offset = addr & ~PAGE_MASK;
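The suspend-buffer hunk above validates the user buffer against the firmware-reported suspend size and then pins pages for exactly that size, including partial first and last pages. The page-count arithmetic can be checked in isolation; a small sketch with hypothetical names:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/*
 * Same arithmetic as the suspend-buffer preparation above: given a user
 * virtual address and the firmware-defined suspend buffer size, work out
 * how many pages must be pinned, including partial first/last pages.
 */
static unsigned int pages_to_pin(uint64_t addr, uint64_t size)
{
    uint64_t page_addr = addr & PAGE_MASK;
    uint64_t end_addr = addr + size - 1;
    uint64_t last_page_addr = end_addr & PAGE_MASK;

    return (unsigned int)((last_page_addr - page_addr) / PAGE_SIZE + 1);
}

int main(void)
{
    /* An 8200-byte buffer starting 100 bytes into a page spans 3 pages. */
    printf("%u\n", pages_to_pin(0x1000 + 100, 8200));
    return 0;
}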
diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.h b/mali_kbase/csf/mali_kbase_csf_kcpu.h
index 6300569..3edb4de 100644
--- a/mali_kbase/csf/mali_kbase_csf_kcpu.h
+++ b/mali_kbase/csf/mali_kbase_csf_kcpu.h
@@ -294,6 +294,8 @@ struct kbase_kcpu_command_queue {
* queue will be created.
* @newq: Pointer to the structure which contains information about
* the new KCPU command queue to be created.
+ *
+ * Return: 0 if successful or a negative error code on failure.
*/
int kbase_csf_kcpu_queue_new(struct kbase_context *kctx,
struct kbase_ioctl_kcpu_queue_new *newq);
@@ -307,6 +309,8 @@ int kbase_csf_kcpu_queue_new(struct kbase_context *kctx,
* queue is to be deleted.
* @del: Pointer to the structure which specifies the KCPU command
* queue to be deleted.
+ *
+ * Return: 0 if successful or a negative error code on failure.
*/
int kbase_csf_kcpu_queue_delete(struct kbase_context *kctx,
struct kbase_ioctl_kcpu_queue_delete *del);
@@ -320,6 +324,8 @@ int kbase_csf_kcpu_queue_delete(struct kbase_context *kctx,
* @enq: Pointer to the structure which specifies the KCPU command
* as well as the KCPU command queue into which the command
* is to be enqueued.
+ *
+ * Return: 0 if successful or a negative error code on failure.
*/
int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx,
struct kbase_ioctl_kcpu_queue_enqueue *enq);
@@ -337,11 +343,11 @@ int kbase_csf_kcpu_queue_context_init(struct kbase_context *kctx);
/**
* kbase_csf_kcpu_queue_context_term - Terminate the kernel CPU queues context
* for a GPU address space
+ * @kctx: Pointer to the kbase context being terminated.
*
* This function deletes any kernel CPU queues that weren't deleted before
* context termination.
*
- * @kctx: Pointer to the kbase context being terminated.
*/
void kbase_csf_kcpu_queue_context_term(struct kbase_context *kctx);
diff --git a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c
index 7b63132..d5d8318 100644
--- a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c
+++ b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c
@@ -576,6 +576,7 @@ int kbase_reset_gpu_silent(struct kbase_device *kbdev)
return 0;
}
+KBASE_EXPORT_TEST_API(kbase_reset_gpu_silent);
bool kbase_reset_gpu_is_active(struct kbase_device *kbdev)
{
diff --git a/mali_kbase/csf/mali_kbase_csf_scheduler.c b/mali_kbase/csf/mali_kbase_csf_scheduler.c
index f22a5d7..cd87027 100644
--- a/mali_kbase/csf/mali_kbase_csf_scheduler.c
+++ b/mali_kbase/csf/mali_kbase_csf_scheduler.c
@@ -164,12 +164,14 @@ static int wait_for_scheduler_to_exit_sleep(struct kbase_device *kbdev)
* This function will force the Scheduler to exit the sleep state by doing the
* wake up of MCU and suspension of on-slot groups. It is called at the time of
* system suspend.
+ *
+ * Return: 0 on success.
*/
-static void force_scheduler_to_exit_sleep(struct kbase_device *kbdev)
+static int force_scheduler_to_exit_sleep(struct kbase_device *kbdev)
{
struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler;
unsigned long flags;
- int ret;
+ int ret = 0;
lockdep_assert_held(&scheduler->lock);
WARN_ON(scheduler->state != SCHED_SLEEPING);
@@ -177,12 +179,16 @@ static void force_scheduler_to_exit_sleep(struct kbase_device *kbdev)
kbase_pm_lock(kbdev);
ret = kbase_pm_force_mcu_wakeup_after_sleep(kbdev);
- if (ret)
- dev_warn(kbdev->dev, "[%llu] Wait for MCU wake up failed on forced scheduler suspend",
- kbase_backend_get_cycle_cnt(kbdev));
kbase_pm_unlock(kbdev);
+ if (ret) {
+ dev_warn(kbdev->dev,
+ "[%llu] Wait for MCU wake up failed on forced scheduler suspend",
+ kbase_backend_get_cycle_cnt(kbdev));
+ goto out;
+ }
- suspend_active_groups_on_powerdown(kbdev, true);
+ if (suspend_active_groups_on_powerdown(kbdev, true))
+ goto out;
kbase_pm_lock(kbdev);
spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
@@ -191,12 +197,26 @@ static void force_scheduler_to_exit_sleep(struct kbase_device *kbdev)
kbase_pm_update_state(kbdev);
spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
ret = kbase_pm_wait_for_desired_state(kbdev);
- if (ret)
- dev_warn(kbdev->dev, "[%llu] Wait for pm state change failed on forced scheduler suspend",
- kbase_backend_get_cycle_cnt(kbdev));
kbase_pm_unlock(kbdev);
+ if (ret) {
+ dev_warn(kbdev->dev,
+ "[%llu] Wait for pm state change failed on forced scheduler suspend",
+ kbase_backend_get_cycle_cnt(kbdev));
+ goto out;
+ }
scheduler->state = SCHED_SUSPENDED;
+
+ return 0;
+
+out:
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ kbdev->pm.backend.exit_gpu_sleep_mode = true;
+ kbdev->pm.backend.gpu_wakeup_override = false;
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ kbase_csf_scheduler_invoke_tick(kbdev);
+
+ return ret;
}
#endif
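The reworked force_scheduler_to_exit_sleep() above now reports failure and funnels every error through a single exit label that re-arms the scheduler tick before propagating the error. A generic user-space sketch of that single-exit error-handling shape follows; the step names are invented and the recovery work is reduced to a print.

#include <stdio.h>

/* Hypothetical steps standing in for the MCU wake-up / suspend calls. */
static int wake_mcu(void)       { return 0; }
static int suspend_groups(void) { return -1; }  /* pretend this fails */
static void rearm_tick(void)    { puts("re-armed scheduler tick"); }

/*
 * Single-exit error handling: every failure jumps to one label that
 * performs the common recovery (here, re-arming the tick) and then
 * propagates the error code to the caller.
 */
static int exit_sleep(void)
{
    int ret;

    ret = wake_mcu();
    if (ret)
        goto out;

    ret = suspend_groups();
    if (ret)
        goto out;

    puts("scheduler suspended");
    return 0;

out:
    rearm_tick();
    return ret;
}

int main(void)
{
    printf("exit_sleep() = %d\n", exit_sleep());
    return 0;
}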
@@ -445,6 +465,13 @@ static bool queue_group_idle_locked(struct kbase_queue_group *group)
group->run_state == KBASE_CSF_GROUP_SUSPENDED_ON_IDLE);
}
+static bool on_slot_group_idle_locked(struct kbase_queue_group *group)
+{
+ lockdep_assert_held(&group->kctx->kbdev->csf.scheduler.lock);
+
+ return (group->run_state == KBASE_CSF_GROUP_IDLE);
+}
+
static bool queue_group_scheduled(struct kbase_queue_group *group)
{
return (group->run_state != KBASE_CSF_GROUP_INACTIVE &&
@@ -582,6 +609,8 @@ static void disable_gpu_idle_fw_timer(struct kbase_device *kbdev)
* This function is usually called when Scheduler needs to be activated.
* The PM reference count is acquired for the Scheduler and the power on
* of GPU is initiated.
+ *
+ * Return: 0 if successful or a negative error code on failure.
*/
static int scheduler_pm_active_handle_suspend(struct kbase_device *kbdev,
enum kbase_pm_suspend_handler suspend_handler)
@@ -1243,8 +1272,16 @@ int kbase_csf_scheduler_queue_stop(struct kbase_queue *queue)
static void update_hw_active(struct kbase_queue *queue, bool active)
{
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ if (queue && queue->enabled) {
+ u32 *output_addr = (u32 *)(queue->user_io_addr + PAGE_SIZE);
+
+ output_addr[CS_ACTIVE / sizeof(u32)] = active;
+ }
+#else
CSTD_UNUSED(queue);
CSTD_UNUSED(active);
+#endif
}
static void program_cs_extract_init(struct kbase_queue *queue)
@@ -2099,6 +2136,10 @@ static void save_csg_slot(struct kbase_queue_group *group)
bool sync_wait = false;
bool idle = kbase_csf_firmware_csg_output(ginfo, CSG_STATUS_STATE) &
CSG_STATUS_STATE_IDLE_MASK;
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ for (i = 0; i < max_streams; i++)
+ update_hw_active(group->bound_queues[i], false);
+#endif /* CONFIG_MALI_NO_MALI */
for (i = 0; idle && i < max_streams; i++) {
struct kbase_queue *const queue =
group->bound_queues[i];
@@ -2385,6 +2426,7 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot,
protm_suspend_buf >> 32);
}
+
/* Enable all interrupts for now */
kbase_csf_firmware_csg_input(ginfo, CSG_ACK_IRQ_MASK, ~((u32)0));
@@ -2414,7 +2456,7 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot,
/* Trace the programming of the CSG on the slot */
KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG(
kbdev, kbdev->gpu_props.props.raw_props.gpu_id, group->kctx->id,
- group->handle, slot);
+ group->handle, slot, (state == CSG_REQ_STATE_RESUME) ? 1 : 0);
dev_dbg(kbdev->dev, "Starting group %d of context %d_%d on slot %d with priority %u\n",
group->handle, kctx->tgid, kctx->id, slot, prio);
@@ -3166,15 +3208,15 @@ static void wait_csg_slots_start(struct kbase_device *kbdev)
* flagged after the completion of a CSG status
* update command
*
+ * @kbdev: Pointer to the GPU device.
+ * @slot: The given slot for checking an occupying resident group's idle
+ * state.
+ *
* This function is called at the start of scheduling tick to check the
* idle status of a queue group resident on a CSG slot.
* The caller must make sure the corresponding status update command has
* been called and completed before checking this status.
*
- * @kbdev: Pointer to the GPU device.
- * @slot: The given slot for checking an occupying resident group's idle
- * state.
- *
* Return: true if the group resident on slot is idle, otherwise false.
*/
static bool group_on_slot_is_idle(struct kbase_device *kbdev,
@@ -3194,16 +3236,16 @@ static bool group_on_slot_is_idle(struct kbase_device *kbdev,
* slots_update_state_changed() - Check the handshake state of a subset of
* command group slots.
*
- * Checks the state of a subset of slots selected through the slots_mask
- * bit_map. Records which slots' handshake completed and send it back in the
- * slots_done bit_map.
- *
* @kbdev: The GPU device.
* @field_mask: The field mask for checking the state in the csg_req/ack.
* @slots_mask: A bit_map specifying the slots to check.
* @slots_done: A cleared bit_map for returning the slots that
* have finished update.
*
+ * Checks the state of a subset of slots selected through the slots_mask
+ * bit_map. Records which slots' handshake completed and sends it back in the
+ * slots_done bit_map.
+ *
* Return: true if the slots_done is set for at least one slot.
* Otherwise false.
*/
@@ -3237,10 +3279,6 @@ bool slots_update_state_changed(struct kbase_device *kbdev, u32 field_mask,
* wait_csg_slots_handshake_ack - Wait the req/ack handshakes to complete on
* the specified groups.
*
- * This function waits for the acknowledgement of the request that have
- * already been placed for the CSG slots by the caller. Currently used for
- * the CSG priority update and status update requests.
- *
* @kbdev: Pointer to the GPU device.
* @field_mask: The field mask for checking the state in the csg_req/ack.
* @slot_mask: Bitmap reflecting the slots, the function will modify
@@ -3248,6 +3286,10 @@ bool slots_update_state_changed(struct kbase_device *kbdev, u32 field_mask,
* bits.
* @wait_in_jiffies: Wait duration in jiffies, controlling the time-out.
*
+ * This function waits for the acknowledgment of the requests that have
+ * already been placed for the CSG slots by the caller. Currently used for
+ * the CSG priority update and status update requests.
+ *
* Return: 0 on all specified slots acknowledged; otherwise -ETIMEDOUT. For
* timed out condition with unacknowledged slots, their bits remain
* set in the slot_mask.
@@ -3349,14 +3391,14 @@ void kbase_csf_scheduler_evict_ctx_slots(struct kbase_device *kbdev,
* scheduler_slot_protm_ack - Acknowledging the protected region requests
* from the resident group on a given slot.
*
- * The function assumes that the given slot is in stable running state and
- * has already been judged by the caller on that any pending protected region
- * requests of the resident group should be acknowledged.
- *
* @kbdev: Pointer to the GPU device.
* @group: Pointer to the resident group on the given slot.
* @slot: The slot that the given group is actively operating on.
*
+ * The function assumes that the given slot is in a stable running state and
+ * that the caller has already determined that any pending protected region
+ * requests of the resident group should be acknowledged.
+ *
* Return: true if the group has pending protm request(s) and is acknowledged.
* The caller should arrange to enter the protected mode for servicing
* it. Otherwise return false, indicating the group has no pending protm
@@ -3426,15 +3468,15 @@ static bool scheduler_slot_protm_ack(struct kbase_device *const kbdev,
* scheduler_group_check_protm_enter - Request the given group to be evaluated
* for triggering the protected mode.
*
+ * @kbdev: Pointer to the GPU device.
+ * @input_grp: Pointer to the GPU queue group.
+ *
* The function assumes the given group is either an active running group or
* the scheduler internally maintained field scheduler->top_grp.
*
* If the GPU is not already running in protected mode and the input group
* has protected region requests from its bound queues, the requests are
* acknowledged and the GPU is instructed to enter the protected mode.
- *
- * @kbdev: Pointer to the GPU device.
- * @input_grp: Pointer to the GPU queue group.
*/
static void scheduler_group_check_protm_enter(struct kbase_device *const kbdev,
struct kbase_queue_group *const input_grp)
@@ -3538,7 +3580,7 @@ static void scheduler_apply(struct kbase_device *kbdev)
}
}
- /* Initialize the remaining avialable csg slots for the tick/tock */
+ /* Initialize the remaining available csg slots for the tick/tock */
scheduler->remaining_tick_slots = available_csg_slots;
/* If there are spare slots, apply heads in the list */
@@ -3615,8 +3657,9 @@ static void scheduler_ctx_scan_groups(struct kbase_device *kbdev,
group->scan_seq_num = scheduler->csg_scan_count_for_tick++;
if (queue_group_idle_locked(group)) {
- list_add_tail(&group->link_to_schedule,
- &scheduler->idle_groups_to_schedule);
+ if (on_slot_group_idle_locked(group))
+ list_add_tail(&group->link_to_schedule,
+ &scheduler->idle_groups_to_schedule);
continue;
}
@@ -3640,6 +3683,8 @@ static void scheduler_ctx_scan_groups(struct kbase_device *kbdev,
* fairness of scheduling within a single
* kbase_context.
*
+ * @kbdev: Pointer to the GPU device.
+ *
* Since only kbase_csf_scheduler's top_grp (i.e. the queue group assigned
* the highest slot priority) is guaranteed to get the resources that it
* needs we only rotate the kbase_context corresponding to it -
@@ -3678,8 +3723,6 @@ static void scheduler_ctx_scan_groups(struct kbase_device *kbdev,
* the kbase_csf_scheduler's groups_to_schedule list. In this example, it will
* be for a group in the next lowest priority level or in absence of those the
* next kbase_context's queue groups.
- *
- * @kbdev: Pointer to the GPU device.
*/
static void scheduler_rotate_groups(struct kbase_device *kbdev)
{
@@ -3750,17 +3793,17 @@ static void scheduler_rotate_ctxs(struct kbase_device *kbdev)
* slots for which the IDLE notification was received
* previously.
*
- * This function sends a CSG status update request for all the CSG slots
- * present in the bitmap scheduler->csg_slots_idle_mask and wait for the
- * request to complete.
- * The bits set in the scheduler->csg_slots_idle_mask bitmap are cleared by
- * this function.
- *
* @kbdev: Pointer to the GPU device.
* @csg_bitmap: Bitmap of the CSG slots for which
* the status update request completed successfully.
* @failed_csg_bitmap: Bitmap of the CSG slots for which
+ * the status update request timed out.
+ *
+ * This function sends a CSG status update request for all the CSG slots
+ * present in the bitmap scheduler->csg_slots_idle_mask and waits for the
+ * request to complete.
+ * The bits set in the scheduler->csg_slots_idle_mask bitmap are cleared by
+ * this function.
*/
static void scheduler_update_idle_slots_status(struct kbase_device *kbdev,
unsigned long *csg_bitmap, unsigned long *failed_csg_bitmap)
@@ -3832,6 +3875,8 @@ static void scheduler_update_idle_slots_status(struct kbase_device *kbdev,
* resident on CSG slots for which the
* IDLE notification was received previously.
*
+ * @kbdev: Pointer to the GPU device.
+ *
* This function is called at the start of scheduling tick/tock to reconfirm
* the idle status of queue groups resident on CSG slots for
* which idle notification was received previously, i.e. all the CSG slots
@@ -3845,8 +3890,6 @@ static void scheduler_update_idle_slots_status(struct kbase_device *kbdev,
* updated accordingly.
 * The bits corresponding to slots for which the status update request timed out
* remain set in scheduler->csg_slots_idle_mask.
- *
- * @kbdev: Pointer to the GPU device.
*/
static void scheduler_handle_idle_slots(struct kbase_device *kbdev)
{
@@ -3901,7 +3944,7 @@ static void scheduler_scan_idle_groups(struct kbase_device *kbdev)
list_for_each_entry_safe(group, n, &scheduler->idle_groups_to_schedule,
link_to_schedule) {
- WARN_ON(!queue_group_idle_locked(group));
+ WARN_ON(!on_slot_group_idle_locked(group));
if (!scheduler->ngrp_to_schedule) {
/* keep the top csg's origin */
@@ -3955,6 +3998,18 @@ static struct kbase_queue_group *get_tock_top_group(
return NULL;
}
+/**
+ * suspend_active_groups_on_powerdown() - Suspend active CSG groups upon
+ * suspend or GPU IDLE.
+ *
+ * @kbdev: Pointer to the device
+ * @system_suspend: Flag to indicate it's for system suspend.
+ *
+ * This function will suspend all active CSG groups upon either
+ * system suspend, runtime suspend or GPU IDLE.
+ *
+ * Return: 0 on success, -1 otherwise.
+ */
static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev,
bool system_suspend)
{
@@ -3964,8 +4019,8 @@ static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev,
int ret = suspend_active_queue_groups(kbdev, slot_mask);
if (ret) {
- /* The suspend of CSGs failed, trigger the GPU reset and wait
- * for it to complete to be in a deterministic state.
+ /* The suspend of CSGs failed; trigger the GPU reset to return
+ * to a deterministic state.
*/
dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for CSG slots to suspend on power down, slot_mask: 0x%*pb\n",
kbase_backend_get_cycle_cnt(kbdev),
@@ -3975,13 +4030,6 @@ static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev,
if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE))
kbase_reset_gpu(kbdev);
- if (system_suspend) {
- mutex_unlock(&scheduler->lock);
- kbase_reset_gpu_allow(kbdev);
- kbase_reset_gpu_wait(kbdev);
- kbase_reset_gpu_prevent_and_wait(kbdev);
- mutex_lock(&scheduler->lock);
- }
return -1;
}
@@ -4059,6 +4107,8 @@ static void scheduler_sleep_on_idle(struct kbase_device *kbdev)
* This function is called on GPU idle notification to trigger the power down of
* GPU. Scheduler's state is changed to suspended and all the active queue
* groups are suspended before halting the MCU firmware.
+ *
+ * Return: true if scheduler will be suspended or false if suspend is aborted.
*/
static bool scheduler_suspend_on_idle(struct kbase_device *kbdev)
{
@@ -4104,6 +4154,8 @@ static void gpu_idle_worker(struct work_struct *work)
disable_gpu_idle_fw_timer(kbdev);
scheduler_is_idle_suspendable = scheduler_idle_suspendable(kbdev);
if (scheduler_is_idle_suspendable) {
+ KBASE_KTRACE_ADD(kbdev, GPU_IDLE_HANDLING_START, NULL,
+ kbase_csf_ktrace_gpu_cycle_cnt(kbdev));
#ifdef KBASE_PM_RUNTIME
if (kbase_pm_gpu_sleep_allowed(kbdev) &&
scheduler->total_runnable_grps)
@@ -4174,8 +4226,7 @@ static int scheduler_prepare(struct kbase_device *kbdev)
/* Adds those idle but runnable groups to the scanout list */
scheduler_scan_idle_groups(kbdev);
- /* After adding the idle CSGs, the two counts should be the same */
- WARN_ON(scheduler->csg_scan_count_for_tick != scheduler->ngrp_to_schedule);
+ WARN_ON(scheduler->csg_scan_count_for_tick < scheduler->ngrp_to_schedule);
KBASE_KTRACE_ADD_CSF_GRP(kbdev, SCHEDULER_TOP_GRP, scheduler->top_grp,
scheduler->num_active_address_spaces |
@@ -4705,8 +4756,11 @@ static int suspend_active_queue_groups_on_reset(struct kbase_device *kbdev)
* due to the extra context ref-count, which prevents the
* L2 powering down cache clean operation in the non racing
* case.
+ * The LSC is also flushed to cover the bus logging use case,
+ * where a GPU reset is done regularly to avoid log buffer
+ * overflow.
*/
- kbase_gpu_start_cache_clean(kbdev);
+ kbase_gpu_start_cache_clean(kbdev, GPU_COMMAND_CACHE_CLN_INV_L2_LSC);
ret2 = kbase_gpu_wait_cache_clean_timeout(kbdev,
kbdev->reset_timeout_ms);
if (ret2) {
@@ -5055,13 +5109,18 @@ int kbase_csf_scheduler_group_copy_suspend_buf(struct kbase_queue_group *group,
unsigned int target_page_nr = 0, i = 0;
u64 offset = sus_buf->offset;
size_t to_copy = sus_buf->size;
+ const u32 csg_suspend_buf_nr_pages =
+ PFN_UP(kbdev->csf.global_iface.groups[0].suspend_size);
if (scheduler->state != SCHED_SUSPENDED) {
/* Similar to the case of HW counters, need to flush
- * the GPU cache before reading from the suspend buffer
+ * the GPU L2 cache before reading from the suspend buffer
* pages as they are mapped and cached on GPU side.
+ * The LSC is not flushed here, since only the CSG suspend
+ * buffer contents need to be flushed from the L2 cache.
*/
- kbase_gpu_start_cache_clean(kbdev);
+ kbase_gpu_start_cache_clean(
+ kbdev, GPU_COMMAND_CACHE_CLN_INV_L2);
kbase_gpu_wait_cache_clean(kbdev);
} else {
/* Make sure power down transitions have completed,
@@ -5073,7 +5132,7 @@ int kbase_csf_scheduler_group_copy_suspend_buf(struct kbase_queue_group *group,
kbase_pm_wait_for_desired_state(kbdev);
}
- for (i = 0; i < PFN_UP(sus_buf->size) &&
+ for (i = 0; i < csg_suspend_buf_nr_pages &&
target_page_nr < sus_buf->nr_pages; i++) {
struct page *pg =
as_page(group->normal_suspend_buf.phy[i]);
@@ -5252,7 +5311,7 @@ void kbase_csf_scheduler_group_protm_enter(struct kbase_queue_group *group)
* This function will evaluate the sync condition, if any, of all the queues
* bound to the given group.
*
- * Return true if the sync condition of at least one queue has been satisfied.
+ * Return: true if the sync condition of at least one queue has been satisfied.
*/
static bool check_sync_update_for_on_slot_group(
struct kbase_queue_group *group)
@@ -5341,7 +5400,7 @@ static bool check_sync_update_for_on_slot_group(
* protected mode that has a higher priority than the active protected mode
* group.
*
- * Return true if the sync condition of at least one queue in a group has been
+ * Return: true if the sync condition of at least one queue in a group has been
* satisfied.
*/
static bool check_sync_update_for_idle_groups_protm(struct kbase_device *kbdev)
@@ -5604,8 +5663,14 @@ void kbase_csf_scheduler_term(struct kbase_device *kbdev)
flush_work(&kbdev->csf.scheduler.gpu_idle_work);
mutex_lock(&kbdev->csf.scheduler.lock);
- if (WARN_ON(kbdev->csf.scheduler.state != SCHED_SUSPENDED))
+ if (kbdev->csf.scheduler.state != SCHED_SUSPENDED) {
+ /* The power policy could prevent the Scheduler from
+ * getting suspended when GPU becomes idle.
+ */
+ WARN_ON(kbase_pm_idle_groups_sched_suspendable(kbdev));
scheduler_suspend(kbdev);
+ }
+
mutex_unlock(&kbdev->csf.scheduler.lock);
cancel_delayed_work_sync(&kbdev->csf.scheduler.ping_work);
cancel_tick_timer(kbdev);
@@ -5692,12 +5757,16 @@ void kbase_csf_scheduler_timer_set_enabled(struct kbase_device *kbdev,
* available, so need to drop the lock before cancellation.
*/
cancel_work_sync(&scheduler->tick_work);
- } else if (!currently_enabled && enable) {
+ return;
+ }
+
+ if (!currently_enabled && enable) {
scheduler->timer_enabled = true;
scheduler_enable_tick_timer_nolock(kbdev);
- mutex_unlock(&scheduler->lock);
}
+
+ mutex_unlock(&scheduler->lock);
}
void kbase_csf_scheduler_kick(struct kbase_device *kbdev)
@@ -5718,18 +5787,20 @@ out:
mutex_unlock(&scheduler->lock);
}
-void kbase_csf_scheduler_pm_suspend(struct kbase_device *kbdev)
+int kbase_csf_scheduler_pm_suspend(struct kbase_device *kbdev)
{
+ int result = 0;
struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler;
/* Cancel any potential queued delayed work(s) */
cancel_work_sync(&scheduler->tick_work);
cancel_tock_work(scheduler);
- if (kbase_reset_gpu_prevent_and_wait(kbdev)) {
+ result = kbase_reset_gpu_prevent_and_wait(kbdev);
+ if (result) {
dev_warn(kbdev->dev,
"Stop PM suspending for failing to prevent gpu reset.\n");
- return;
+ return result;
}
mutex_lock(&scheduler->lock);
@@ -5742,18 +5813,31 @@ void kbase_csf_scheduler_pm_suspend(struct kbase_device *kbdev)
*/
if (scheduler->state == SCHED_SLEEPING) {
dev_info(kbdev->dev, "Activating MCU out of sleep on system suspend");
- force_scheduler_to_exit_sleep(kbdev);
+ result = force_scheduler_to_exit_sleep(kbdev);
+ if (result) {
+ dev_warn(kbdev->dev, "Scheduler failed to exit from sleep");
+ goto exit;
+ }
}
#endif
if (scheduler->state != SCHED_SUSPENDED) {
- suspend_active_groups_on_powerdown(kbdev, true);
- dev_info(kbdev->dev, "Scheduler PM suspend");
- scheduler_suspend(kbdev);
- cancel_tick_timer(kbdev);
+ result = suspend_active_groups_on_powerdown(kbdev, true);
+ if (result) {
+ dev_warn(kbdev->dev, "failed to suspend active groups");
+ goto exit;
+ } else {
+ dev_info(kbdev->dev, "Scheduler PM suspend");
+ scheduler_suspend(kbdev);
+ cancel_tick_timer(kbdev);
+ }
}
+
+exit:
mutex_unlock(&scheduler->lock);
kbase_reset_gpu_allow(kbdev);
+
+ return result;
}
KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_suspend);
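Since kbase_csf_scheduler_pm_suspend() now returns an error code instead of void, callers on the suspend path are expected to propagate the failure rather than continuing with the power-down. A minimal caller sketch follows; the wrapper and the surrounding PM plumbing are assumptions, only kbase_csf_scheduler_pm_suspend() itself comes from this patch.

/* Hedged sketch of a PM-suspend caller; only the call to
 * kbase_csf_scheduler_pm_suspend() is taken from this patch, the
 * wrapper itself is hypothetical.
 */
static int example_backend_pm_suspend(struct kbase_device *kbdev)
{
	int err = kbase_csf_scheduler_pm_suspend(kbdev);

	if (err) {
		/* Abort the suspend so the system does not power down the
		 * GPU with CSGs still active.
		 */
		dev_warn(kbdev->dev, "Scheduler PM suspend failed: %d", err);
		return err;
	}

	return 0;
}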
diff --git a/mali_kbase/csf/mali_kbase_csf_scheduler.h b/mali_kbase/csf/mali_kbase_csf_scheduler.h
index 73ebb66..068a45b 100644
--- a/mali_kbase/csf/mali_kbase_csf_scheduler.h
+++ b/mali_kbase/csf/mali_kbase_csf_scheduler.h
@@ -23,6 +23,7 @@
#define _KBASE_CSF_SCHEDULER_H_
#include "mali_kbase_csf.h"
+#include "mali_kbase_csf_event.h"
/**
* kbase_csf_scheduler_queue_start() - Enable the running of GPU command queue
@@ -250,14 +251,14 @@ void kbase_csf_scheduler_enable_tick_timer(struct kbase_device *kbdev);
* kbase_csf_scheduler_group_copy_suspend_buf - Suspend a queue
* group and copy suspend buffer.
*
- * This function is called to suspend a queue group and copy the suspend_buffer
- * contents to the input buffer provided.
- *
* @group: Pointer to the queue group to be suspended.
* @sus_buf: Pointer to the structure which contains details of the
* user buffer and its kernel pinned pages to which we need to copy
* the group suspend buffer.
*
+ * This function is called to suspend a queue group and copy the suspend_buffer
+ * contents to the input buffer provided.
+ *
* Return: 0 on success, or negative on failure.
*/
int kbase_csf_scheduler_group_copy_suspend_buf(struct kbase_queue_group *group,
@@ -425,8 +426,10 @@ void kbase_csf_scheduler_pm_resume(struct kbase_device *kbdev);
*
* This function will make the scheduler suspend all the running queue groups
 * and drop its power management reference.
+ *
+ * Return: 0 on success.
*/
-void kbase_csf_scheduler_pm_suspend(struct kbase_device *kbdev);
+int kbase_csf_scheduler_pm_suspend(struct kbase_device *kbdev);
/**
* kbase_csf_scheduler_all_csgs_idle() - Check if the scheduler internal
diff --git a/mali_kbase/csf/mali_kbase_csf_tiler_heap.c b/mali_kbase/csf/mali_kbase_csf_tiler_heap.c
index 06a7824..62fb241 100644
--- a/mali_kbase/csf/mali_kbase_csf_tiler_heap.c
+++ b/mali_kbase/csf/mali_kbase_csf_tiler_heap.c
@@ -28,13 +28,13 @@
/**
* encode_chunk_ptr - Encode the address and size of a chunk as an integer.
*
+ * @chunk_size: Size of a tiler heap chunk, in bytes.
+ * @chunk_addr: GPU virtual address of the same tiler heap chunk.
+ *
* The size and address of the next chunk in a list are packed into a single
* 64-bit value for storage in a chunk's header. This function returns that
* value.
*
- * @chunk_size: Size of a tiler heap chunk, in bytes.
- * @chunk_addr: GPU virtual address of the same tiler heap chunk.
- *
* Return: Next chunk pointer suitable for writing into a chunk header.
*/
static u64 encode_chunk_ptr(u32 const chunk_size, u64 const chunk_addr)
@@ -76,14 +76,14 @@ static struct kbase_csf_tiler_heap_chunk *get_last_chunk(
/**
* link_chunk - Link a chunk into a tiler heap
*
+ * @heap: Pointer to the tiler heap.
+ * @chunk: Pointer to the heap chunk to be linked.
+ *
* Unless the @chunk is the first in the kernel's list of chunks belonging to
* a given tiler heap, this function stores the size and address of the @chunk
* in the header of the preceding chunk. This requires the GPU memory region
 * containing the header to be mapped temporarily, which can fail.
*
- * @heap: Pointer to the tiler heap.
- * @chunk: Pointer to the heap chunk to be linked.
- *
* Return: 0 if successful or a negative error code on failure.
*/
static int link_chunk(struct kbase_csf_tiler_heap *const heap,
@@ -118,15 +118,15 @@ static int link_chunk(struct kbase_csf_tiler_heap *const heap,
/**
* init_chunk - Initialize and link a tiler heap chunk
*
- * Zero-initialize a new chunk's header (including its pointer to the next
- * chunk, which doesn't exist yet) and then update the previous chunk's
- * header to link the new chunk into the chunk list.
- *
* @heap: Pointer to the tiler heap.
* @chunk: Pointer to the heap chunk to be initialized and linked.
* @link_with_prev: Flag to indicate if the new chunk needs to be linked with
* the previously allocated chunk.
*
+ * Zero-initialize a new chunk's header (including its pointer to the next
+ * chunk, which doesn't exist yet) and then update the previous chunk's
+ * header to link the new chunk into the chunk list.
+ *
* Return: 0 if successful or a negative error code on failure.
*/
static int init_chunk(struct kbase_csf_tiler_heap *const heap,
@@ -163,14 +163,14 @@ static int init_chunk(struct kbase_csf_tiler_heap *const heap,
/**
* create_chunk - Create a tiler heap chunk
*
- * This function allocates a chunk of memory for a tiler heap and adds it to
- * the end of the list of chunks associated with that heap. The size of the
- * chunk is not a parameter because it is configured per-heap not per-chunk.
- *
* @heap: Pointer to the tiler heap for which to allocate memory.
* @link_with_prev: Flag to indicate if the chunk to be allocated needs to be
* linked with the previously allocated chunk.
*
+ * This function allocates a chunk of memory for a tiler heap and adds it to
+ * the end of the list of chunks associated with that heap. The size of the
+ * chunk is not a parameter because it is configured per-heap not per-chunk.
+ *
* Return: 0 if successful or a negative error code on failure.
*/
static int create_chunk(struct kbase_csf_tiler_heap *const heap,
@@ -237,15 +237,15 @@ static int create_chunk(struct kbase_csf_tiler_heap *const heap,
/**
* delete_chunk - Delete a tiler heap chunk
*
+ * @heap: Pointer to the tiler heap for which @chunk was allocated.
+ * @chunk: Pointer to a chunk to be deleted.
+ *
* This function frees a tiler heap chunk previously allocated by @create_chunk
* and removes it from the list of chunks associated with the heap.
*
* WARNING: The deleted chunk is not unlinked from the list of chunks used by
* the GPU, therefore it is only safe to use this function when
* deleting a heap.
- *
- * @heap: Pointer to the tiler heap for which @chunk was allocated.
- * @chunk: Pointer to a chunk to be deleted.
*/
static void delete_chunk(struct kbase_csf_tiler_heap *const heap,
struct kbase_csf_tiler_heap_chunk *const chunk)
@@ -264,10 +264,10 @@ static void delete_chunk(struct kbase_csf_tiler_heap *const heap,
/**
* delete_all_chunks - Delete all chunks belonging to a tiler heap
*
+ * @heap: Pointer to a tiler heap.
+ *
* This function empties the list of chunks associated with a tiler heap by
* freeing all chunks previously allocated by @create_chunk.
- *
- * @heap: Pointer to a tiler heap.
*/
static void delete_all_chunks(struct kbase_csf_tiler_heap *heap)
{
@@ -284,12 +284,12 @@ static void delete_all_chunks(struct kbase_csf_tiler_heap *heap)
/**
* create_initial_chunks - Create the initial list of chunks for a tiler heap
*
- * This function allocates a given number of chunks for a tiler heap and
- * adds them to the list of chunks associated with that heap.
- *
* @heap: Pointer to the tiler heap for which to allocate memory.
* @nchunks: Number of chunks to create.
*
+ * This function allocates a given number of chunks for a tiler heap and
+ * adds them to the list of chunks associated with that heap.
+ *
* Return: 0 if successful or a negative error code on failure.
*/
static int create_initial_chunks(struct kbase_csf_tiler_heap *const heap,
@@ -310,12 +310,12 @@ static int create_initial_chunks(struct kbase_csf_tiler_heap *const heap,
/**
* delete_heap - Delete a tiler heap
*
+ * @heap: Pointer to a tiler heap to be deleted.
+ *
* This function frees any chunks allocated for a tiler heap previously
* initialized by @kbase_csf_tiler_heap_init and removes it from the list of
* heaps associated with the kbase context. The heap context structure used by
* the firmware is also freed.
- *
- * @heap: Pointer to a tiler heap to be deleted.
*/
static void delete_heap(struct kbase_csf_tiler_heap *heap)
{
@@ -346,15 +346,15 @@ static void delete_heap(struct kbase_csf_tiler_heap *heap)
/**
* find_tiler_heap - Find a tiler heap from the address of its heap context
*
+ * @kctx: Pointer to the kbase context to search for a tiler heap.
+ * @heap_gpu_va: GPU virtual address of a heap context structure.
+ *
* Each tiler heap managed by the kernel has an associated heap context
* structure used by the firmware. This function finds a tiler heap object from
* the GPU virtual address of its associated heap context. The heap context
* should have been allocated by @kbase_csf_heap_context_allocator_alloc in the
* same @kctx.
*
- * @kctx: Pointer to the kbase context to search for a tiler heap.
- * @heap_gpu_va: GPU virtual address of a heap context structure.
- *
* Return: pointer to the tiler heap object, or NULL if not found.
*/
static struct kbase_csf_tiler_heap *find_tiler_heap(
@@ -495,8 +495,11 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx,
dev_dbg(kctx->kbdev->dev, "Created tiler heap 0x%llX\n",
heap->gpu_va);
mutex_unlock(&kctx->csf.tiler_heaps.lock);
+ kctx->running_total_tiler_heap_nr_chunks += heap->chunk_count;
+ kctx->running_total_tiler_heap_memory += heap->chunk_size * heap->chunk_count;
+ if (kctx->running_total_tiler_heap_memory > kctx->peak_total_tiler_heap_memory)
+ kctx->peak_total_tiler_heap_memory = kctx->running_total_tiler_heap_memory;
}
-
return err;
}
@@ -505,27 +508,36 @@ int kbase_csf_tiler_heap_term(struct kbase_context *const kctx,
{
int err = 0;
struct kbase_csf_tiler_heap *heap = NULL;
+ u32 chunk_count = 0;
+ u64 heap_size = 0;
mutex_lock(&kctx->csf.tiler_heaps.lock);
heap = find_tiler_heap(kctx, heap_gpu_va);
- if (likely(heap))
+ if (likely(heap)) {
+ chunk_count = heap->chunk_count;
+ heap_size = heap->chunk_size * chunk_count;
delete_heap(heap);
- else
+ } else
err = -EINVAL;
mutex_unlock(&kctx->csf.tiler_heaps.lock);
-
+ if (likely(kctx->running_total_tiler_heap_memory >= heap_size))
+ kctx->running_total_tiler_heap_memory -= heap_size;
+ else
+ dev_warn(kctx->kbdev->dev,
+ "Running total tiler heap memory lower than expected!");
+ if (likely(kctx->running_total_tiler_heap_nr_chunks >= chunk_count))
+ kctx->running_total_tiler_heap_nr_chunks -= chunk_count;
+ else
+ dev_warn(kctx->kbdev->dev,
+ "Running total tiler chunk count lower than expected!");
return err;
}
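The new per-context counters track tiler heap usage as heaps are created and terminated: each heap contributes chunk_size * chunk_count bytes while it exists, and the peak value only ever grows. Below is a small sketch of the allocation-side accounting mirroring the code added above; the helper wrapper is hypothetical, while the kctx fields are the ones introduced by this patch.

/* Hypothetical helper mirroring the accounting performed in
 * kbase_csf_tiler_heap_init() above; the termination path subtracts
 * the same quantities again.
 */
static void example_account_tiler_heap(struct kbase_context *kctx,
				       u64 chunk_size, u32 chunk_count)
{
	kctx->running_total_tiler_heap_nr_chunks += chunk_count;
	kctx->running_total_tiler_heap_memory += chunk_size * chunk_count;

	/* The peak is monotonic: it records the high-water mark only. */
	if (kctx->running_total_tiler_heap_memory >
	    kctx->peak_total_tiler_heap_memory)
		kctx->peak_total_tiler_heap_memory =
			kctx->running_total_tiler_heap_memory;
}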
/**
* alloc_new_chunk - Allocate a new chunk for the tiler heap.
*
- * This function will allocate a new chunk for the chunked tiler heap depending
- * on the settings provided by userspace when the heap was created and the
- * heap's statistics (like number of render passes in-flight).
- *
* @heap: Pointer to the tiler heap.
* @nr_in_flight: Number of render passes that are in-flight, must not be zero.
* @pending_frag_count: Number of render passes in-flight with completed vertex/tiler stage.
@@ -534,6 +546,10 @@ int kbase_csf_tiler_heap_term(struct kbase_context *const kctx,
* @new_chunk_ptr: Where to store the GPU virtual address & size of the new
* chunk allocated for the heap.
*
+ * This function will allocate a new chunk for the chunked tiler heap depending
+ * on the settings provided by userspace when the heap was created and the
+ * heap's statistics (like number of render passes in-flight).
+ *
* Return: 0 if a new chunk was allocated otherwise an appropriate negative
* error code.
*/
diff --git a/mali_kbase/csf/mali_kbase_csf_tiler_heap.h b/mali_kbase/csf/mali_kbase_csf_tiler_heap.h
index 04c27f7..4031ad4 100644
--- a/mali_kbase/csf/mali_kbase_csf_tiler_heap.h
+++ b/mali_kbase/csf/mali_kbase_csf_tiler_heap.h
@@ -38,10 +38,10 @@ int kbase_csf_tiler_heap_context_init(struct kbase_context *kctx);
* kbase_csf_tiler_heap_context_term - Terminate the tiler heaps context for a
* GPU address space
*
+ * @kctx: Pointer to the kbase context being terminated.
+ *
* This function deletes any chunked tiler heaps that weren't deleted before
* context termination.
- *
- * @kctx: Pointer to the kbase context being terminated.
*/
void kbase_csf_tiler_heap_context_term(struct kbase_context *kctx);
@@ -74,15 +74,15 @@ int kbase_csf_tiler_heap_init(struct kbase_context *kctx,
/**
* kbasep_cs_tiler_heap_term - Terminate a chunked tiler memory heap.
*
+ * @kctx: Pointer to the kbase context in which the tiler heap was initialized.
+ * @gpu_heap_va: The GPU virtual address of the context that was set up for the
+ * tiler heap.
+ *
* This function will terminate a chunked tiler heap and cause all the chunks
* (initial and those added during out-of-memory processing) to be freed.
* It is the caller's responsibility to ensure no further operations on this
* heap will happen before calling this function.
*
- * @kctx: Pointer to the kbase context in which the tiler heap was initialized.
- * @gpu_heap_va: The GPU virtual address of the context that was set up for the
- * tiler heap.
- *
* Return: 0 if successful or a negative error code on failure.
*/
int kbase_csf_tiler_heap_term(struct kbase_context *kctx, u64 gpu_heap_va);
@@ -90,12 +90,6 @@ int kbase_csf_tiler_heap_term(struct kbase_context *kctx, u64 gpu_heap_va);
/**
* kbase_csf_tiler_heap_alloc_new_chunk - Allocate a new chunk for tiler heap.
*
- * This function will allocate a new chunk for the chunked tiler heap depending
- * on the settings provided by userspace when the heap was created and the
- * heap's statistics (like number of render passes in-flight).
- * It would return an appropriate error code if a new chunk couldn't be
- * allocated.
- *
* @kctx: Pointer to the kbase context in which the tiler heap was initialized.
* @gpu_heap_va: GPU virtual address of the heap context.
* @nr_in_flight: Number of render passes that are in-flight, must not be zero.
@@ -105,6 +99,12 @@ int kbase_csf_tiler_heap_term(struct kbase_context *kctx, u64 gpu_heap_va);
* @new_chunk_ptr: Where to store the GPU virtual address & size of the new
* chunk allocated for the heap.
*
+ * This function will allocate a new chunk for the chunked tiler heap depending
+ * on the settings provided by userspace when the heap was created and the
+ * heap's statistics (like number of render passes in-flight).
+ * It returns an appropriate error code if a new chunk couldn't be
+ * allocated.
+ *
* Return: 0 if a new chunk was allocated otherwise an appropriate negative
* error code (like -EBUSY when a free chunk is expected to be
* available upon completion of a render pass and -EINVAL when
diff --git a/mali_kbase/csf/mali_kbase_csf_tiler_heap_debugfs.c b/mali_kbase/csf/mali_kbase_csf_tiler_heap_debugfs.c
index f46beed..96e0f28 100644
--- a/mali_kbase/csf/mali_kbase_csf_tiler_heap_debugfs.c
+++ b/mali_kbase/csf/mali_kbase_csf_tiler_heap_debugfs.c
@@ -32,7 +32,7 @@
* @file: The seq_file for printing to
* @data: The debugfs dentry private data, a pointer to kbase_context
*
- * Return: Negative error code or 0 on success.
+ * Return: 0 in any case.
*/
static int kbasep_csf_tiler_heap_debugfs_show(struct seq_file *file, void *data)
{
@@ -65,11 +65,41 @@ static int kbasep_csf_tiler_heap_debugfs_show(struct seq_file *file, void *data)
return 0;
}
+/**
+ * kbasep_csf_tiler_heap_total_debugfs_show() - Print the total memory allocated
+ * for all tiler heaps in a context.
+ *
+ * @file: The seq_file for printing to
+ * @data: The debugfs dentry private data, a pointer to kbase_context
+ *
+ * Return: 0 in any case.
+ */
+static int kbasep_csf_tiler_heap_total_debugfs_show(struct seq_file *file, void *data)
+{
+ struct kbase_context *kctx = file->private;
+
+ seq_printf(file, "MALI_CSF_TILER_HEAP_DEBUGFS_VERSION: v%u\n",
+ MALI_CSF_TILER_HEAP_DEBUGFS_VERSION);
+ seq_printf(file, "Total number of chunks of all heaps in the context: %lu\n",
+ (unsigned long)kctx->running_total_tiler_heap_nr_chunks);
+ seq_printf(file, "Total allocated memory of all heaps in the context: %llu\n",
+ (unsigned long long)kctx->running_total_tiler_heap_memory);
+ seq_printf(file, "Peak allocated tiler heap memory in the context: %llu\n",
+ (unsigned long long)kctx->peak_total_tiler_heap_memory);
+
+ return 0;
+}
+
static int kbasep_csf_tiler_heap_debugfs_open(struct inode *in, struct file *file)
{
return single_open(file, kbasep_csf_tiler_heap_debugfs_show, in->i_private);
}
+static int kbasep_csf_tiler_heap_total_debugfs_open(struct inode *in, struct file *file)
+{
+ return single_open(file, kbasep_csf_tiler_heap_total_debugfs_show, in->i_private);
+}
+
static const struct file_operations kbasep_csf_tiler_heap_debugfs_fops = {
.open = kbasep_csf_tiler_heap_debugfs_open,
.read = seq_read,
@@ -77,6 +107,13 @@ static const struct file_operations kbasep_csf_tiler_heap_debugfs_fops = {
.release = single_release,
};
+static const struct file_operations kbasep_csf_tiler_heap_total_debugfs_fops = {
+ .open = kbasep_csf_tiler_heap_total_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
void kbase_csf_tiler_heap_debugfs_init(struct kbase_context *kctx)
{
struct dentry *file;
@@ -93,6 +130,21 @@ void kbase_csf_tiler_heap_debugfs_init(struct kbase_context *kctx)
}
}
+void kbase_csf_tiler_heap_total_debugfs_init(struct kbase_context *kctx)
+{
+ struct dentry *file;
+
+ if (WARN_ON(!kctx || IS_ERR_OR_NULL(kctx->kctx_dentry)))
+ return;
+
+ file = debugfs_create_file("tiler_heaps_total", 0444, kctx->kctx_dentry,
+ kctx, &kbasep_csf_tiler_heap_total_debugfs_fops);
+
+ if (IS_ERR_OR_NULL(file)) {
+ dev_warn(kctx->kbdev->dev,
+ "Unable to create total tiler heap allocated memory debugfs entry");
+ }
+}
#else
/*
@@ -102,5 +154,9 @@ void kbase_csf_tiler_heap_debugfs_init(struct kbase_context *kctx)
{
}
+void kbase_csf_tiler_heap_total_debugfs_init(struct kbase_context *kctx)
+{
+}
+
#endif /* CONFIG_DEBUG_FS */
diff --git a/mali_kbase/csf/mali_kbase_csf_tiler_heap_debugfs.h b/mali_kbase/csf/mali_kbase_csf_tiler_heap_debugfs.h
index 92ae91a..27a9074 100644
--- a/mali_kbase/csf/mali_kbase_csf_tiler_heap_debugfs.h
+++ b/mali_kbase/csf/mali_kbase_csf_tiler_heap_debugfs.h
@@ -34,4 +34,11 @@ struct kbase_context;
*/
void kbase_csf_tiler_heap_debugfs_init(struct kbase_context *kctx);
+/**
+ * kbase_csf_tiler_heap_total_debugfs_init() - Create a debugfs entry for per context tiler heap
+ *
+ * @kctx: The kbase_context for which to create the debugfs entry
+ */
+void kbase_csf_tiler_heap_total_debugfs_init(struct kbase_context *kctx);
+
#endif /* _KBASE_CSF_TILER_HEAP_DEBUGFS_H_ */
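The call site for the new entry is not visible in this excerpt; presumably it is invoked next to the existing per-context tiler heap debugfs entry during context debugfs setup, roughly as sketched below. The wrapper name is hypothetical.

/* Assumed call site, shown for illustration only. */
static void example_ctx_csf_debugfs_init(struct kbase_context *kctx)
{
	kbase_csf_tiler_heap_debugfs_init(kctx);
	kbase_csf_tiler_heap_total_debugfs_init(kctx);
}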
diff --git a/mali_kbase/csf/mali_kbase_csf_tl_reader.c b/mali_kbase/csf/mali_kbase_csf_tl_reader.c
index 563faec..b01ac29 100644
--- a/mali_kbase/csf/mali_kbase_csf_tl_reader.c
+++ b/mali_kbase/csf/mali_kbase_csf_tl_reader.c
@@ -171,8 +171,8 @@ static int kbase_ts_converter_init(
*
* Return: The CPU timestamp.
*/
-void kbase_ts_converter_convert(const struct kbase_ts_converter *self,
- u64 *gpu_ts)
+static void __maybe_unused
+kbase_ts_converter_convert(const struct kbase_ts_converter *self, u64 *gpu_ts)
{
u64 old_gpu_ts = *gpu_ts;
*gpu_ts = div64_u64(old_gpu_ts * self->multiplier, self->divisor) +
@@ -477,7 +477,14 @@ int kbase_csf_tl_reader_start(struct kbase_csf_tl_reader *self,
return 0;
if (tl_reader_init_late(self, kbdev)) {
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ dev_warn(
+ kbdev->dev,
+ "CSFFW timeline is not available for MALI_NO_MALI builds!");
+ return 0;
+#else
return -EINVAL;
+#endif
}
tl_reader_reset(self);
@@ -521,14 +528,5 @@ void kbase_csf_tl_reader_stop(struct kbase_csf_tl_reader *self)
void kbase_csf_tl_reader_reset(struct kbase_csf_tl_reader *self)
{
- u64 gpu_cycle = 0;
- struct kbase_device *kbdev = self->kbdev;
-
- if (!kbdev)
- return;
-
kbase_csf_tl_reader_flush_buffer(self);
-
- get_cpu_gpu_time(kbdev, NULL, NULL, &gpu_cycle);
- KBASE_TLSTREAM_TL_KBASE_CSFFW_RESET(kbdev, gpu_cycle);
}
diff --git a/mali_kbase/csf/mali_kbase_csf_tl_reader.h b/mali_kbase/csf/mali_kbase_csf_tl_reader.h
index 891a8f3..4523ba2 100644
--- a/mali_kbase/csf/mali_kbase_csf_tl_reader.h
+++ b/mali_kbase/csf/mali_kbase_csf_tl_reader.h
@@ -40,8 +40,7 @@ struct kbase_tlstream;
struct kbase_device;
/**
- * struct kbase_ts_converter -
- * System timestamp to CPU timestamp converter state.
+ * struct kbase_ts_converter - System timestamp to CPU timestamp converter state.
*
* @multiplier: Numerator of the converter's fraction.
* @divisor: Denominator of the converter's fraction.
@@ -145,8 +144,7 @@ void kbase_csf_tl_reader_term(struct kbase_csf_tl_reader *self);
int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self);
/**
- * kbase_csf_tl_reader_start() -
- * Start asynchronous copying of CSFFW timeline stream.
+ * kbase_csf_tl_reader_start() - Start asynchronous copying of CSFFW timeline stream.
*
* @self: CSFFW TL Reader instance.
* @kbdev: Kbase device.
@@ -157,8 +155,7 @@ int kbase_csf_tl_reader_start(struct kbase_csf_tl_reader *self,
struct kbase_device *kbdev);
/**
- * kbase_csf_tl_reader_stop() -
- * Stop asynchronous copying of CSFFW timeline stream.
+ * kbase_csf_tl_reader_stop() - Stop asynchronous copying of CSFFW timeline stream.
*
* @self: CSFFW TL Reader instance.
*/
@@ -166,8 +163,7 @@ void kbase_csf_tl_reader_stop(struct kbase_csf_tl_reader *self);
#if IS_ENABLED(CONFIG_DEBUG_FS)
/**
- * kbase_csf_tl_reader_debugfs_init() -
- * Initialize debugfs for CSFFW Timelime Stream Reader.
+ * kbase_csf_tl_reader_debugfs_init() - Initialize debugfs for CSFFW Timeline Stream Reader.
*
* @kbdev: Kbase device.
*/
@@ -175,8 +171,7 @@ void kbase_csf_tl_reader_debugfs_init(struct kbase_device *kbdev);
#endif
/**
- * kbase_csf_tl_reader_reset() -
- * Reset CSFFW timeline reader, it should be called before reset CSFFW.
+ * kbase_csf_tl_reader_reset() - Reset the CSFFW timeline reader; it should be called before resetting the CSFFW.
*
* @self: CSFFW TL Reader instance.
*/
diff --git a/mali_kbase/csf/mali_kbase_csf_trace_buffer.c b/mali_kbase/csf/mali_kbase_csf_trace_buffer.c
index a6343c8..0c72f00 100644
--- a/mali_kbase/csf/mali_kbase_csf_trace_buffer.c
+++ b/mali_kbase/csf/mali_kbase_csf_trace_buffer.c
@@ -38,12 +38,6 @@
/**
* struct firmware_trace_buffer - Trace Buffer within the MCU firmware
*
- * The firmware relays information to the host by writing on memory buffers
- * which are allocated and partially configured by the host. These buffers
- * are called Trace Buffers: each of them has a specific purpose and is
- * identified by a name and a set of memory addresses where the host can
- * set pointers to host-allocated structures.
- *
* @kbdev: Pointer to the Kbase device.
* @node: List head linking all trace buffers to
* kbase_device:csf.firmware_trace_buffers
@@ -73,6 +67,12 @@
* @num_pages: Size of the data buffer, in pages.
* @trace_enable_init_mask: Initial value for the trace enable bit mask.
* @name: NULL terminated string which contains the name of the trace buffer.
+ *
+ * The firmware relays information to the host by writing on memory buffers
+ * which are allocated and partially configured by the host. These buffers
+ * are called Trace Buffers: each of them has a specific purpose and is
+ * identified by a name and a set of memory addresses where the host can
+ * set pointers to host-allocated structures.
*/
struct firmware_trace_buffer {
struct kbase_device *kbdev;
@@ -100,14 +100,14 @@ struct firmware_trace_buffer {
/**
* struct firmware_trace_buffer_data - Configuration data for trace buffers
*
- * Describe how to set up a trace buffer interface.
- * Trace buffers are identified by name and they require a data buffer and
- * an initial mask of values for the trace enable bits.
- *
* @name: Name identifier of the trace buffer
* @trace_enable_init_mask: Initial value to assign to the trace enable bits
* @size: Size of the data buffer to allocate for the trace buffer, in pages.
* The size of a data buffer must always be a power of 2.
+ *
+ * Describes how to set up a trace buffer interface.
+ * Trace buffers are identified by name and they require a data buffer and
+ * an initial mask of values for the trace enable bits.
*/
struct firmware_trace_buffer_data {
char name[64];
@@ -121,14 +121,13 @@ struct firmware_trace_buffer_data {
* This table contains the configuration data for the trace buffers that are
* expected to be parsed from the firmware.
*/
-static const struct firmware_trace_buffer_data
-trace_buffer_data[] = {
-#ifndef MALI_KBASE_BUILD
- { "fwutf", {0}, 1 },
+static const struct firmware_trace_buffer_data trace_buffer_data[] = {
+#if MALI_UNIT_TEST
+ { "fwutf", { 0 }, 1 },
#endif
- { FW_TRACE_BUF_NAME, {0}, 4 },
- { "benchmark", {0}, 2 },
- { "timeline", {0}, KBASE_CSF_TL_BUFFER_NR_PAGES },
+ { FW_TRACE_BUF_NAME, { 0 }, 4 },
+ { "benchmark", { 0 }, 2 },
+ { "timeline", { 0 }, KBASE_CSF_TL_BUFFER_NR_PAGES },
};
int kbase_csf_firmware_trace_buffers_init(struct kbase_device *kbdev)
diff --git a/mali_kbase/csf/mali_kbase_csf_trace_buffer.h b/mali_kbase/csf/mali_kbase_csf_trace_buffer.h
index b9f481d..823ace7 100644
--- a/mali_kbase/csf/mali_kbase_csf_trace_buffer.h
+++ b/mali_kbase/csf/mali_kbase_csf_trace_buffer.h
@@ -34,6 +34,8 @@ struct kbase_device;
/**
* kbase_csf_firmware_trace_buffers_init - Initialize trace buffers
*
+ * @kbdev: Device pointer
+ *
* Allocate resources for trace buffers. In particular:
* - One memory page of GPU-readable, CPU-writable memory is used for
* the Extract variables of all trace buffers.
@@ -52,8 +54,6 @@ struct kbase_device;
* populated with data from the firmware image parsing.
*
* Return: 0 if success, or an error code on failure.
- *
- * @kbdev: Device pointer
*/
int kbase_csf_firmware_trace_buffers_init(struct kbase_device *kbdev);
@@ -67,6 +67,11 @@ void kbase_csf_firmware_trace_buffers_term(struct kbase_device *kbdev);
/**
* kbase_csf_firmware_parse_trace_buffer_entry - Process a "trace buffer" section
*
+ * @kbdev: Kbase device structure
+ * @entry: Pointer to the section
+ * @size: Size (in bytes) of the section
+ * @updatable: Indicates whether config items can be updated with FIRMWARE_CONFIG_UPDATE
+ *
* Read a "trace buffer" section adding metadata for the related trace buffer
* to the kbase_device:csf.firmware_trace_buffers list.
*
@@ -74,11 +79,6 @@ void kbase_csf_firmware_trace_buffers_term(struct kbase_device *kbdev);
* will not be initialized.
*
* Return: 0 if successful, negative error code on failure.
- *
- * @kbdev: Kbase device structure
- * @entry: Pointer to the section
- * @size: Size (in bytes) of the section
- * @updatable: Indicates whether config items can be updated with FIRMWARE_CONFIG_UPDATE
*/
int kbase_csf_firmware_parse_trace_buffer_entry(struct kbase_device *kbdev,
const u32 *entry,
@@ -86,8 +86,9 @@ int kbase_csf_firmware_parse_trace_buffer_entry(struct kbase_device *kbdev,
bool updatable);
/**
- * kbase_csf_firmware_reload_trace_buffers_data -
- * Reload trace buffers data for firmware reboot
+ * kbase_csf_firmware_reload_trace_buffers_data - Reload trace buffers data for firmware reboot
+ *
+ * @kbdev: Device pointer
*
* Helper function used when rebooting the firmware to reload the initial setup
* for all the trace buffers which have been previously parsed and initialized.
@@ -99,44 +100,40 @@ int kbase_csf_firmware_parse_trace_buffer_entry(struct kbase_device *kbdev,
*
* In other words, the re-initialization done by this function will be
* equivalent but not necessarily identical to the original initialization.
- *
- * @kbdev: Device pointer
*/
void kbase_csf_firmware_reload_trace_buffers_data(struct kbase_device *kbdev);
/**
* kbase_csf_firmware_get_trace_buffer - Get a trace buffer
*
- * Return: handle to a trace buffer, given the name, or NULL if a trace buffer
- * with that name couldn't be found.
- *
* @kbdev: Device pointer
* @name: Name of the trace buffer to find
+ *
+ * Return: handle to a trace buffer, given the name, or NULL if a trace buffer
+ * with that name couldn't be found.
*/
struct firmware_trace_buffer *kbase_csf_firmware_get_trace_buffer(
struct kbase_device *kbdev, const char *name);
/**
- * kbase_csf_firmware_trace_buffer_get_trace_enable_bits_count -
- * Get number of trace enable bits for a trace buffer
- *
- * Return: Number of trace enable bits in a trace buffer.
+ * kbase_csf_firmware_trace_buffer_get_trace_enable_bits_count - Get number of trace enable bits for a trace buffer
*
* @trace_buffer: Trace buffer handle
+ *
+ * Return: Number of trace enable bits in a trace buffer.
*/
unsigned int kbase_csf_firmware_trace_buffer_get_trace_enable_bits_count(
const struct firmware_trace_buffer *trace_buffer);
/**
- * kbase_csf_firmware_trace_buffer_update_trace_enable_bit -
- * Update a trace enable bit
- *
- * Update the value of a given trace enable bit.
+ * kbase_csf_firmware_trace_buffer_update_trace_enable_bit - Update a trace enable bit
*
* @trace_buffer: Trace buffer handle
* @bit: Bit to update
* @value: New value for the given bit
*
+ * Update the value of a given trace enable bit.
+ *
* Return: 0 if successful, negative error code on failure.
*/
int kbase_csf_firmware_trace_buffer_update_trace_enable_bit(
@@ -146,9 +143,9 @@ int kbase_csf_firmware_trace_buffer_update_trace_enable_bit(
/**
* kbase_csf_firmware_trace_buffer_is_empty - Empty trace buffer predicate
*
- * Return: True if the trace buffer is empty, or false otherwise.
- *
* @trace_buffer: Trace buffer handle
+ *
+ * Return: True if the trace buffer is empty, or false otherwise.
*/
bool kbase_csf_firmware_trace_buffer_is_empty(
const struct firmware_trace_buffer *trace_buffer);
@@ -156,14 +153,14 @@ bool kbase_csf_firmware_trace_buffer_is_empty(
/**
* kbase_csf_firmware_trace_buffer_read_data - Read data from a trace buffer
*
+ * @trace_buffer: Trace buffer handle
+ * @data: Pointer to a client-allocated buffer where data shall be written.
+ * @num_bytes: Maximum number of bytes to read from the trace buffer.
+ *
* Read available data from a trace buffer. The client provides a data buffer
* of a given size and the maximum number of bytes to read.
*
* Return: Number of bytes read from the trace buffer.
- *
- * @trace_buffer: Trace buffer handle
- * @data: Pointer to a client-allocated where data shall be written.
- * @num_bytes: Maximum number of bytes to read from the trace buffer.
*/
unsigned int kbase_csf_firmware_trace_buffer_read_data(
struct firmware_trace_buffer *trace_buffer, u8 *data, unsigned int num_bytes);
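As a hedged illustration of how the trace buffer accessors above fit together, a consumer would typically look a buffer up by name, check whether it holds data, and then drain it in chunks. The three accessors and FW_TRACE_BUF_NAME appear elsewhere in this patch; the draining loop, buffer sizing and the consume step are assumptions.

/* Illustrative (hypothetical) consumer of the trace buffer API. */
static void example_drain_trace_buffer(struct kbase_device *kbdev,
				       u8 *buf, unsigned int buf_size)
{
	struct firmware_trace_buffer *tb =
		kbase_csf_firmware_get_trace_buffer(kbdev, FW_TRACE_BUF_NAME);

	if (!tb)
		return;

	while (!kbase_csf_firmware_trace_buffer_is_empty(tb)) {
		unsigned int nread =
			kbase_csf_firmware_trace_buffer_read_data(tb, buf,
								  buf_size);
		if (!nread)
			break;

		/* Hand the nread bytes in buf to the consumer here. */
	}
}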
diff --git a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_csf.h b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_csf.h
index d05f802..2506ce1 100644
--- a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_csf.h
+++ b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_csf.h
@@ -97,6 +97,13 @@ int dummy_array[] = {
/* info_val = bitmask of slots that gave an ACK for STATUS_UPDATE */
KBASE_KTRACE_CODE_MAKE_CODE(SLOTS_STATUS_UPDATE_ACK),
+ /* info_val[63:0] = GPU cycle counter, used mainly for benchmarking
+ * purpose.
+ */
+ KBASE_KTRACE_CODE_MAKE_CODE(GPU_IDLE_HANDLING_START),
+ KBASE_KTRACE_CODE_MAKE_CODE(MCU_HALTED),
+ KBASE_KTRACE_CODE_MAKE_CODE(MCU_IN_SLEEP),
+
/*
* Group events
*/
diff --git a/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_csf.h b/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_csf.h
index 4b23fc9..9ee7f81 100644
--- a/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_csf.h
+++ b/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_csf.h
@@ -58,6 +58,9 @@ DEFINE_MALI_ADD_EVENT(IDLE_WORKER_END);
DEFINE_MALI_ADD_EVENT(GROUP_SYNC_UPDATE_WORKER_BEGIN);
DEFINE_MALI_ADD_EVENT(GROUP_SYNC_UPDATE_WORKER_END);
DEFINE_MALI_ADD_EVENT(SLOTS_STATUS_UPDATE_ACK);
+DEFINE_MALI_ADD_EVENT(GPU_IDLE_HANDLING_START);
+DEFINE_MALI_ADD_EVENT(MCU_HALTED);
+DEFINE_MALI_ADD_EVENT(MCU_IN_SLEEP);
DECLARE_EVENT_CLASS(mali_csf_grp_q_template,
TP_PROTO(struct kbase_device *kbdev, struct kbase_queue_group *group,
diff --git a/mali_kbase/debug/mali_kbase_debug_ktrace.h b/mali_kbase/debug/mali_kbase_debug_ktrace.h
index f943696..f1e6d3d 100644
--- a/mali_kbase/debug/mali_kbase_debug_ktrace.h
+++ b/mali_kbase/debug/mali_kbase_debug_ktrace.h
@@ -49,6 +49,7 @@
/**
* kbase_ktrace_init - initialize kbase ktrace.
* @kbdev: kbase device
+ * Return: 0 if successful or a negative error code on failure.
*/
int kbase_ktrace_init(struct kbase_device *kbdev);
diff --git a/mali_kbase/debug/mali_kbase_debug_ktrace_internal.h b/mali_kbase/debug/mali_kbase_debug_ktrace_internal.h
index d9bd351..ba93f29 100644
--- a/mali_kbase/debug/mali_kbase_debug_ktrace_internal.h
+++ b/mali_kbase/debug/mali_kbase_debug_ktrace_internal.h
@@ -63,6 +63,8 @@ void kbasep_ktrace_backend_format_msg(struct kbase_ktrace_msg *trace_msg,
* @ktrace: kbase device's ktrace
*
* This may also empty the oldest entry in the ringbuffer to make space.
+ *
+ * Return: ktrace message
*/
struct kbase_ktrace_msg *kbasep_ktrace_reserve(struct kbase_ktrace *ktrace);
diff --git a/mali_kbase/device/backend/mali_kbase_device_csf.c b/mali_kbase/device/backend/mali_kbase_device_csf.c
index 7b37a96..8a4d2e2 100644
--- a/mali_kbase/device/backend/mali_kbase_device_csf.c
+++ b/mali_kbase/device/backend/mali_kbase_device_csf.c
@@ -24,11 +24,15 @@
#include <mali_kbase_hwaccess_backend.h>
#include <mali_kbase_hwcnt_backend_csf_if_fw.h>
+#include <mali_kbase_hwcnt_watchdog_if_timer.h>
#include <mali_kbase_ctx_sched.h>
#include <mali_kbase_reset_gpu.h>
#include <csf/mali_kbase_csf.h>
#include <csf/ipa_control/mali_kbase_csf_ipa_control.h>
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+#include <backend/gpu/mali_kbase_model_linux.h>
+#endif
#include <mali_kbase.h>
#include <backend/gpu/mali_kbase_irq_internal.h>
@@ -196,9 +200,31 @@ static void kbase_csf_early_term(struct kbase_device *kbdev)
}
/**
+ * kbase_device_hwcnt_watchdog_if_init - Create hardware counter watchdog
+ * interface.
+ * @kbdev: Device pointer
+ */
+static int kbase_device_hwcnt_watchdog_if_init(struct kbase_device *kbdev)
+{
+ return kbase_hwcnt_watchdog_if_timer_create(
+ &kbdev->hwcnt_watchdog_timer);
+}
+
+/**
+ * kbase_device_hwcnt_watchdog_if_term - Terminate hardware counter watchdog
+ * interface.
+ * @kbdev: Device pointer
+ */
+static void kbase_device_hwcnt_watchdog_if_term(struct kbase_device *kbdev)
+{
+ kbase_hwcnt_watchdog_if_timer_destroy(&kbdev->hwcnt_watchdog_timer);
+}
+
+/**
* kbase_device_hwcnt_backend_csf_if_init - Create hardware counter backend
* firmware interface.
* @kbdev: Device pointer
+ * Return: 0 if successful or a negative error code on failure.
*/
static int kbase_device_hwcnt_backend_csf_if_init(struct kbase_device *kbdev)
{
@@ -226,7 +252,7 @@ static int kbase_device_hwcnt_backend_csf_init(struct kbase_device *kbdev)
return kbase_hwcnt_backend_csf_create(
&kbdev->hwcnt_backend_csf_if_fw,
KBASE_HWCNT_BACKEND_CSF_RING_BUFFER_COUNT,
- &kbdev->hwcnt_gpu_iface);
+ &kbdev->hwcnt_watchdog_timer, &kbdev->hwcnt_gpu_iface);
}
/**
@@ -239,8 +265,13 @@ static void kbase_device_hwcnt_backend_csf_term(struct kbase_device *kbdev)
}
static const struct kbase_device_init dev_init[] = {
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ { kbase_gpu_device_create, kbase_gpu_device_destroy,
+ "Dummy model initialization failed" },
+#else
{ assign_irqs, NULL, "IRQ search failed" },
{ registers_map, registers_unmap, "Register map failed" },
+#endif
{ power_control_init, power_control_term,
"Power control initialization failed" },
{ kbase_device_io_history_init, kbase_device_io_history_term,
@@ -270,6 +301,9 @@ static const struct kbase_device_init dev_init[] = {
"Clock rate trace manager initialization failed" },
{ kbase_lowest_gpu_freq_init, NULL,
"Lowest freq initialization failed" },
+ { kbase_device_hwcnt_watchdog_if_init,
+ kbase_device_hwcnt_watchdog_if_term,
+ "GPU hwcnt backend watchdog interface creation failed" },
{ kbase_device_hwcnt_backend_csf_if_init,
kbase_device_hwcnt_backend_csf_if_term,
"GPU hwcnt backend CSF interface creation failed" },
@@ -283,7 +317,6 @@ static const struct kbase_device_init dev_init[] = {
{ kbase_csf_early_init, kbase_csf_early_term,
"Early CSF initialization failed" },
{ NULL, kbase_device_firmware_hwcnt_term, NULL },
-#ifdef MALI_KBASE_BUILD
{ kbase_device_debugfs_init, kbase_device_debugfs_term,
"DebugFS initialization failed" },
/* Sysfs init needs to happen before registering the device with
@@ -305,7 +338,6 @@ static const struct kbase_device_init dev_init[] = {
"GPU property population failed" },
{ kbase_device_late_init, kbase_device_late_term,
"Late device initialization failed" },
-#endif
};
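The dev_init[] table above pairs each init step with its matching term step and an error message. A generic, hedged sketch of how such a table is typically driven is given below: run the inits in declaration order and unwind the already-completed steps in reverse on failure. The loop and the exact field names of struct kbase_device_init are assumptions, not taken from this patch.

/* Generic sketch only; field names (init, term, err_mes) are assumed. */
static int example_run_device_init_table(struct kbase_device *kbdev,
					 const struct kbase_device_init *tbl,
					 unsigned int count)
{
	unsigned int i;

	for (i = 0; i < count; i++) {
		if (tbl[i].init && tbl[i].init(kbdev)) {
			if (tbl[i].err_mes)
				dev_err(kbdev->dev, "%s", tbl[i].err_mes);
			/* Unwind the steps that already succeeded. */
			while (i--)
				if (tbl[i].term)
					tbl[i].term(kbdev);
			return -EINVAL;
		}
	}

	return 0;
}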
static void kbase_device_term_partial(struct kbase_device *kbdev,
@@ -476,3 +508,4 @@ out:
return ret;
}
+KBASE_EXPORT_TEST_API(kbase_device_firmware_init_once);
diff --git a/mali_kbase/device/backend/mali_kbase_device_jm.c b/mali_kbase/device/backend/mali_kbase_device_jm.c
index 7288e8e..2e022eb 100644
--- a/mali_kbase/device/backend/mali_kbase_device_jm.c
+++ b/mali_kbase/device/backend/mali_kbase_device_jm.c
@@ -28,6 +28,9 @@
#include <mali_kbase_ctx_sched.h>
#include <mali_kbase_reset_gpu.h>
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+#include <backend/gpu/mali_kbase_model_linux.h>
+#endif /* CONFIG_MALI_NO_MALI */
#ifdef CONFIG_MALI_ARBITER_SUPPORT
#include <arbiter/mali_kbase_arbiter_pm.h>
@@ -156,8 +159,13 @@ static void kbase_device_hwcnt_backend_jm_term(struct kbase_device *kbdev)
}
static const struct kbase_device_init dev_init[] = {
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ { kbase_gpu_device_create, kbase_gpu_device_destroy,
+ "Dummy model initialization failed" },
+#else
{ assign_irqs, NULL, "IRQ search failed" },
{ registers_map, registers_unmap, "Register map failed" },
+#endif
{ kbase_device_io_history_init, kbase_device_io_history_term,
"Register access history initialization failed" },
{ kbase_device_pm_init, kbase_device_pm_term,
@@ -203,7 +211,6 @@ static const struct kbase_device_init dev_init[] = {
"Performance counter instrumentation initialization failed" },
{ kbase_backend_late_init, kbase_backend_late_term,
"Late backend initialization failed" },
-#ifdef MALI_KBASE_BUILD
{ kbase_debug_job_fault_dev_init, kbase_debug_job_fault_dev_term,
"Job fault debug initialization failed" },
{ kbase_device_debugfs_init, kbase_device_debugfs_term,
@@ -225,7 +232,6 @@ static const struct kbase_device_init dev_init[] = {
"Misc device registration failed" },
{ kbase_gpuprops_populate_user_buffer, kbase_gpuprops_free_user_buffer,
"GPU property population failed" },
-#endif
{ NULL, kbase_dummy_job_wa_cleanup, NULL },
{ kbase_device_late_init, kbase_device_late_term,
"Late device initialization failed" },
diff --git a/mali_kbase/device/mali_kbase_device.c b/mali_kbase/device/mali_kbase_device.c
index 518aaf9..dc53c43 100644
--- a/mali_kbase/device/mali_kbase_device.c
+++ b/mali_kbase/device/mali_kbase_device.c
@@ -275,6 +275,7 @@ int kbase_device_misc_init(struct kbase_device * const kbdev)
if (err)
goto dma_set_mask_failed;
+
/* There is no limit for Mali, so set to max. We only do this if dma_parms
* is already allocated by the platform.
*/
@@ -345,6 +346,7 @@ void kbase_device_misc_term(struct kbase_device *kbdev)
kbase_device_all_as_term(kbdev);
+
if (kbdev->oom_notifier_block.notifier_call)
unregister_oom_notifier(&kbdev->oom_notifier_block);
}
diff --git a/mali_kbase/device/mali_kbase_device.h b/mali_kbase/device/mali_kbase_device.h
index 517c16b..22ceca0 100644
--- a/mali_kbase/device/mali_kbase_device.h
+++ b/mali_kbase/device/mali_kbase_device.h
@@ -118,22 +118,42 @@ u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset);
bool kbase_is_gpu_removed(struct kbase_device *kbdev);
/**
+ * kbase_gpu_cache_flush_and_busy_wait - Start a cache flush and busy wait
+ * @kbdev: Kbase device
+ * @flush_op: Flush command register value to be sent to HW
+ *
+ * Issue a cache flush command to the hardware, then busy-wait for it to
+ * complete by polling the IRQ status.
+ * This function will clear the CLEAN_CACHES_COMPLETED irq mask bit set by other
+ * threads through kbase_gpu_start_cache_clean(), and wake them up manually
+ * after the busy-wait is done. Any pending cache flush commands raised by
+ * other threads are handled in this function.
+ * hwaccess_lock must be held by the caller.
+ *
+ * Return: 0 if successful or a negative error code on failure.
+ */
+int kbase_gpu_cache_flush_and_busy_wait(struct kbase_device *kbdev,
+ u32 flush_op);
+
+/**
* kbase_gpu_start_cache_clean - Start a cache clean
* @kbdev: Kbase device
+ * @flush_op: Flush command register value to be sent to HW
*
- * Issue a cache clean and invalidate command to hardware. This function will
- * take hwaccess_lock.
+ * Issue a given cache flush command to hardware.
+ * This function will take hwaccess_lock.
*/
-void kbase_gpu_start_cache_clean(struct kbase_device *kbdev);
+void kbase_gpu_start_cache_clean(struct kbase_device *kbdev, u32 flush_op);
/**
* kbase_gpu_start_cache_clean_nolock - Start a cache clean
* @kbdev: Kbase device
+ * @flush_op: Flush command register value to be sent to HW
*
- * Issue a cache clean and invalidate command to hardware. hwaccess_lock
- * must be held by the caller.
+ * Issue a given cache flush command to hardware.
+ * hwaccess_lock must be held by the caller.
*/
-void kbase_gpu_start_cache_clean_nolock(struct kbase_device *kbdev);
+void kbase_gpu_start_cache_clean_nolock(struct kbase_device *kbdev,
+ u32 flush_op);
/**
* kbase_gpu_wait_cache_clean - Wait for cache cleaning to finish
diff --git a/mali_kbase/device/mali_kbase_device_hw.c b/mali_kbase/device/mali_kbase_device_hw.c
index 4c98ae1..beacc7c 100644
--- a/mali_kbase/device/mali_kbase_device_hw.c
+++ b/mali_kbase/device/mali_kbase_device_hw.c
@@ -38,7 +38,98 @@ bool kbase_is_gpu_removed(struct kbase_device *kbdev)
}
#endif /* !IS_ENABLED(CONFIG_MALI_NO_MALI) */
-void kbase_gpu_start_cache_clean_nolock(struct kbase_device *kbdev)
+static int busy_wait_cache_clean_irq(struct kbase_device *kbdev)
+{
+ /* Previously the MMU-AS command was used for the L2 cache flush on
+ * page-table updates. The same max-loops count is used for the GPU
+ * command, because the L2 cache flush overhead is similar in both cases.
+ */
+ unsigned int max_loops = KBASE_AS_INACTIVE_MAX_LOOPS;
+
+ /* Wait for the GPU cache clean operation to complete */
+ while (--max_loops &&
+ !(kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_RAWSTAT)) &
+ CLEAN_CACHES_COMPLETED)) {
+ ;
+ }
+
+ /* reset gpu if time-out occurred */
+ if (max_loops == 0) {
+ dev_err(kbdev->dev,
+ "CLEAN_CACHES_COMPLETED bit stuck, might be caused by slow/unstable GPU clock or possible faulty FPGA connector\n");
+ if (kbase_prepare_to_reset_gpu_locked(kbdev, RESET_FLAGS_NONE))
+ kbase_reset_gpu_locked(kbdev);
+ return -EBUSY;
+ }
+
+ /* Clear the interrupt CLEAN_CACHES_COMPLETED bit. */
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_CLEAR),
+ CLEAN_CACHES_COMPLETED);
+
+ return 0;
+}
+
+int kbase_gpu_cache_flush_and_busy_wait(struct kbase_device *kbdev,
+ u32 flush_op)
+{
+ u32 irq_mask;
+ int need_to_wake_up = 0;
+ int ret = 0;
+
+ /* hwaccess_lock must be held to avoid any sync issue with
+ * kbase_gpu_start_cache_clean() / kbase_clean_caches_done()
+ */
+ lockdep_assert_held(&kbdev->hwaccess_lock);
+
+ /* 1. Check if CLEAN_CACHES_COMPLETED irq mask bit is set.
+ * If it is set, it means there are threads waiting for
+ * CLEAN_CACHES_COMPLETED irq to be raised.
+ * We'll clear the irq mask bit and busy-wait for the cache
+ * clean operation to complete before submitting the cache
+ * clean command required after the GPU page table update.
+ * Pending flush commands will be merged into the requested command.
+ */
+ irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK));
+ if (irq_mask & CLEAN_CACHES_COMPLETED) {
+ /* Disable the IRQ first */
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK),
+ irq_mask & ~CLEAN_CACHES_COMPLETED);
+
+ /* Busy-wait for the IRQ status bit to be raised */
+ ret = busy_wait_cache_clean_irq(kbdev);
+ if (ret)
+ return ret;
+
+ /* Merge any pending flush commands */
+ flush_op = GPU_COMMAND_FLUSH_CACHE_MERGE(
+ kbdev->cache_clean_queued, flush_op);
+
+ /* Set the wake-up notification flag */
+ need_to_wake_up = 1;
+ } else {
+ /* Clear the interrupt CLEAN_CACHES_COMPLETED bit. */
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_CLEAR),
+ CLEAN_CACHES_COMPLETED);
+ }
+
+ /* 2. Issue GPU_CONTROL.COMMAND.FLUSH_CACHE operation. */
+ KBASE_KTRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, flush_op);
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), flush_op);
+
+ /* 3. Busy-wait for the IRQ status bit to be raised. */
+ ret = busy_wait_cache_clean_irq(kbdev);
+ if (ret)
+ return ret;
+
+ /* 4. Wake up any blocked threads. */
+ if (need_to_wake_up)
+ kbase_gpu_cache_clean_wait_complete(kbdev);
+
+ return ret;
+}
+
+void kbase_gpu_start_cache_clean_nolock(struct kbase_device *kbdev,
+ u32 flush_op)
{
u32 irq_mask;
@@ -47,10 +138,11 @@ void kbase_gpu_start_cache_clean_nolock(struct kbase_device *kbdev)
if (kbdev->cache_clean_in_progress) {
/* If this is called while another clean is in progress, we
* can't rely on the current one to flush any new changes in
- * the cache. Instead, trigger another cache clean immediately
- * after this one finishes.
+ * the cache. Instead, accumulate all cache clean operations
+ * and trigger them immediately after this one finishes.
*/
- kbdev->cache_clean_queued = true;
+ kbdev->cache_clean_queued = GPU_COMMAND_FLUSH_CACHE_MERGE(
+ kbdev->cache_clean_queued, flush_op);
return;
}
@@ -59,19 +151,18 @@ void kbase_gpu_start_cache_clean_nolock(struct kbase_device *kbdev)
kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK),
irq_mask | CLEAN_CACHES_COMPLETED);
- KBASE_KTRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, 0);
- kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND),
- GPU_COMMAND_CACHE_CLN_INV_L2);
+ KBASE_KTRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, flush_op);
+ kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), flush_op);
kbdev->cache_clean_in_progress = true;
}
-void kbase_gpu_start_cache_clean(struct kbase_device *kbdev)
+void kbase_gpu_start_cache_clean(struct kbase_device *kbdev, u32 flush_op)
{
unsigned long flags;
spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
- kbase_gpu_start_cache_clean_nolock(kbdev);
+ kbase_gpu_start_cache_clean_nolock(kbdev, flush_op);
spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
}
@@ -79,7 +170,7 @@ void kbase_gpu_cache_clean_wait_complete(struct kbase_device *kbdev)
{
lockdep_assert_held(&kbdev->hwaccess_lock);
- kbdev->cache_clean_queued = false;
+ kbdev->cache_clean_queued = 0;
kbdev->cache_clean_in_progress = false;
wake_up(&kbdev->cache_clean_wait);
}
@@ -92,11 +183,14 @@ void kbase_clean_caches_done(struct kbase_device *kbdev)
spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
if (kbdev->cache_clean_queued) {
- kbdev->cache_clean_queued = false;
+ u32 pended_flush_op = kbdev->cache_clean_queued;
+
+ kbdev->cache_clean_queued = 0;
- KBASE_KTRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, 0);
+ KBASE_KTRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL,
+ pended_flush_op);
kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND),
- GPU_COMMAND_CACHE_CLN_INV_L2);
+ pended_flush_op);
} else {
/* Disable interrupt */
irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK));
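The key idea behind turning cache_clean_queued into a u32 is that flush requests arriving while a flush is already in flight are accumulated and later issued as a single merged command. GPU_COMMAND_FLUSH_CACHE_MERGE() itself is not shown in this patch; the standalone C sketch below assumes "keep the stronger operation" merge semantics purely to illustrate the accumulate-then-issue flow, not the driver's actual command encoding.

#include <stdio.h>

/* Hypothetical flush command encodings, ordered by strength. */
enum flush_op {
	FLUSH_NONE      = 0,
	FLUSH_CLEAN     = 1,
	FLUSH_CLEAN_INV = 2,
};

/* Assumed semantics of GPU_COMMAND_FLUSH_CACHE_MERGE(): the merged command
 * must be at least as strong as both pending operations.
 */
static enum flush_op flush_cache_merge(enum flush_op a, enum flush_op b)
{
	return a > b ? a : b;
}

int main(void)
{
	enum flush_op queued = FLUSH_NONE;

	/* Two flushes requested while another one is still in progress:
	 * they are accumulated rather than issued immediately ...
	 */
	queued = flush_cache_merge(queued, FLUSH_CLEAN);
	queued = flush_cache_merge(queued, FLUSH_CLEAN_INV);

	/* ... and a single merged command is issued once the current flush
	 * completes, as kbase_clean_caches_done() does in the patch.
	 */
	printf("merged flush op: %d\n", queued);
	return 0;
}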
diff --git a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_jm.c b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_jm.c
index 2f4c9d9..e095986 100644
--- a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_jm.c
+++ b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_jm.c
@@ -24,6 +24,9 @@
#include "mali_kbase_ipa_counter_common_jm.h"
#include "mali_kbase.h"
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+#include <backend/gpu/mali_kbase_model_dummy.h>
+#endif /* CONFIG_MALI_NO_MALI */
/* Performance counter blocks base offsets */
#define JM_BASE (0 * KBASE_IPA_NR_BYTES_PER_BLOCK)
@@ -94,9 +97,15 @@ static u32 kbase_g7x_power_model_get_memsys_counter(struct kbase_ipa_model_vinst
static u32 kbase_g7x_power_model_get_sc_counter(struct kbase_ipa_model_vinstr_data *model_data,
u32 counter_block_offset)
{
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ const u32 sc_base = MEMSYS_BASE +
+ (KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS *
+ KBASE_IPA_NR_BYTES_PER_BLOCK);
+#else
const u32 sc_base = MEMSYS_BASE +
(model_data->kbdev->gpu_props.props.l2_props.num_l2_slices *
KBASE_IPA_NR_BYTES_PER_BLOCK);
+#endif
return sc_base + counter_block_offset;
}
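The hunk above changes how the shader-core counter block base is computed under CONFIG_MALI_NO_MALI: the dummy model exposes a fixed number of memsys blocks instead of the real num_l2_slices. The standalone sketch below only illustrates that offset arithmetic; the per-block byte count, the MEMSYS_BASE placement and the block counts are made-up placeholders, not the values from the real headers.

#include <stdint.h>
#include <stdio.h>

/* Placeholder sizes - the real values come from the IPA counter headers. */
#define NR_CNT_PER_BLOCK   64u
#define NR_BYTES_PER_CNT    4u
#define NR_BYTES_PER_BLOCK (NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT)

/* Assume memsys blocks start after two front-end blocks. */
#define MEMSYS_BASE (2u * NR_BYTES_PER_BLOCK)

/* Shader-core blocks follow the memsys blocks, so their base depends on
 * how many memsys blocks the dump layout contains.
 */
static uint32_t sc_counter_offset(uint32_t nr_memsys_blocks,
				  uint32_t counter_block_offset)
{
	const uint32_t sc_base =
		MEMSYS_BASE + nr_memsys_blocks * NR_BYTES_PER_BLOCK;

	return sc_base + counter_block_offset;
}

int main(void)
{
	/* Real HW with 2 L2 slices vs. a dummy model with a fixed maximum of,
	 * say, 8 memsys blocks: the shader-core base lands at different offsets.
	 */
	printf("real HW offset: %u\n", sc_counter_offset(2, 0));
	printf("no-mali offset: %u\n", sc_counter_offset(8, 0));
	return 0;
}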
diff --git a/mali_kbase/ipa/mali_kbase_ipa.c b/mali_kbase/ipa/mali_kbase_ipa.c
index 8b05e68..c0c0cbb 100644
--- a/mali_kbase/ipa/mali_kbase_ipa.c
+++ b/mali_kbase/ipa/mali_kbase_ipa.c
@@ -537,18 +537,34 @@ static void opp_translate_freq_voltage(struct kbase_device *kbdev,
unsigned long *freqs,
unsigned long *volts)
{
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ /* An arbitrary voltage and frequency value can be chosen for testing
+ * in no mali configuration which may not match with any OPP level.
+ */
+ freqs[KBASE_IPA_BLOCK_TYPE_TOP_LEVEL] = nominal_freq;
+ volts[KBASE_IPA_BLOCK_TYPE_TOP_LEVEL] = nominal_voltage;
+
+ freqs[KBASE_IPA_BLOCK_TYPE_SHADER_CORES] = nominal_freq;
+ volts[KBASE_IPA_BLOCK_TYPE_SHADER_CORES] = nominal_voltage;
+#else
u64 core_mask;
+ unsigned int i;
kbase_devfreq_opp_translate(kbdev, nominal_freq, &core_mask,
freqs, volts);
CSTD_UNUSED(core_mask);
+ /* Convert micro volts to milli volts */
+ for (i = 0; i < kbdev->nr_clocks; i++)
+ volts[i] /= 1000;
+
if (kbdev->nr_clocks == 1) {
freqs[KBASE_IPA_BLOCK_TYPE_SHADER_CORES] =
freqs[KBASE_IPA_BLOCK_TYPE_TOP_LEVEL];
volts[KBASE_IPA_BLOCK_TYPE_SHADER_CORES] =
volts[KBASE_IPA_BLOCK_TYPE_TOP_LEVEL];
}
+#endif
}
#if KERNEL_VERSION(5, 10, 0) > LINUX_VERSION_CODE
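A compact standalone sketch of the fix-up now performed by opp_translate_freq_voltage(): voltages returned by the devfreq OPP translation are in microvolts and get scaled to millivolts, and with a single clock domain the shader-core entry simply mirrors the top-level entry. The index macros are placeholders standing in for KBASE_IPA_BLOCK_TYPE_TOP_LEVEL / _SHADER_CORES.

#include <stdio.h>

#define IDX_TOP_LEVEL    0
#define IDX_SHADER_CORES 1

static void fixup_freq_volt(unsigned long *freqs, unsigned long *volts,
			    unsigned int nr_clocks)
{
	for (unsigned int i = 0; i < nr_clocks; i++)
		volts[i] /= 1000;	/* microvolts -> millivolts */

	if (nr_clocks == 1) {
		freqs[IDX_SHADER_CORES] = freqs[IDX_TOP_LEVEL];
		volts[IDX_SHADER_CORES] = volts[IDX_TOP_LEVEL];
	}
}

int main(void)
{
	unsigned long freqs[2] = { 800000000, 0 };
	unsigned long volts[2] = { 850000, 0 };	/* 850000 uV == 850 mV */

	fixup_freq_volt(freqs, volts, 1);
	printf("shaders: %lu Hz @ %lu mV\n",
	       freqs[IDX_SHADER_CORES], volts[IDX_SHADER_CORES]);
	return 0;
}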
diff --git a/mali_kbase/jm/mali_kbase_jm_defs.h b/mali_kbase/jm/mali_kbase_jm_defs.h
index cb1c276..ac8f89b 100644
--- a/mali_kbase/jm/mali_kbase_jm_defs.h
+++ b/mali_kbase/jm/mali_kbase_jm_defs.h
@@ -653,8 +653,8 @@ static inline bool kbase_jd_katom_is_protected(
/**
* kbase_atom_is_younger - query if one atom is younger by age than another
- * @katom_a the first atom
- * @katom_a the second atom
+ * @katom_a: the first atom
+ * @katom_b: the second atom
*
* Return: true if the first atom is strictly younger than the second, false
* otherwise.
diff --git a/mali_kbase/mali_base_hwconfig_features.h b/mali_kbase/mali_base_hwconfig_features.h
index 2e81cb1..0f2b106 100644
--- a/mali_kbase/mali_base_hwconfig_features.h
+++ b/mali_kbase/mali_base_hwconfig_features.h
@@ -37,41 +37,42 @@ enum base_hw_feature {
BASE_HW_FEATURE_L2_CONFIG,
BASE_HW_FEATURE_ASN_HASH,
BASE_HW_FEATURE_GPU_SLEEP,
+ BASE_HW_FEATURE_FLUSH_INV_SHADER_OTHER,
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_generic[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_generic[] = {
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tMIx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tMIx[] = {
BASE_HW_FEATURE_THREAD_GROUP_SPLIT,
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tHEx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tHEx[] = {
BASE_HW_FEATURE_THREAD_GROUP_SPLIT,
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tSIx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tSIx[] = {
BASE_HW_FEATURE_THREAD_GROUP_SPLIT,
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tDVx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tDVx[] = {
BASE_HW_FEATURE_THREAD_GROUP_SPLIT,
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tNOx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tNOx[] = {
BASE_HW_FEATURE_THREAD_GROUP_SPLIT,
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
@@ -80,7 +81,7 @@ static const enum base_hw_feature base_hw_features_tNOx[] = {
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tGOx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tGOx[] = {
BASE_HW_FEATURE_THREAD_GROUP_SPLIT,
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
@@ -89,50 +90,55 @@ static const enum base_hw_feature base_hw_features_tGOx[] = {
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tTRx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tTRx[] = {
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_IDVS_GROUP_SIZE,
BASE_HW_FEATURE_CLEAN_ONLY_SAFE,
+ BASE_HW_FEATURE_FLUSH_INV_SHADER_OTHER,
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tNAx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tNAx[] = {
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_IDVS_GROUP_SIZE,
BASE_HW_FEATURE_CLEAN_ONLY_SAFE,
+ BASE_HW_FEATURE_FLUSH_INV_SHADER_OTHER,
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tBEx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tBEx[] = {
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_IDVS_GROUP_SIZE,
BASE_HW_FEATURE_L2_CONFIG,
BASE_HW_FEATURE_CLEAN_ONLY_SAFE,
+ BASE_HW_FEATURE_FLUSH_INV_SHADER_OTHER,
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tBAx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tBAx[] = {
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_IDVS_GROUP_SIZE,
BASE_HW_FEATURE_L2_CONFIG,
BASE_HW_FEATURE_CLEAN_ONLY_SAFE,
+ BASE_HW_FEATURE_FLUSH_INV_SHADER_OTHER,
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tDUx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tDUx[] = {
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_IDVS_GROUP_SIZE,
BASE_HW_FEATURE_L2_CONFIG,
BASE_HW_FEATURE_CLEAN_ONLY_SAFE,
+ BASE_HW_FEATURE_FLUSH_INV_SHADER_OTHER,
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tODx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tODx[] = {
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_L2_CONFIG,
@@ -140,7 +146,7 @@ static const enum base_hw_feature base_hw_features_tODx[] = {
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tGRx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tGRx[] = {
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_L2_CONFIG,
@@ -148,7 +154,7 @@ static const enum base_hw_feature base_hw_features_tGRx[] = {
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tVAx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tVAx[] = {
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_L2_CONFIG,
@@ -156,7 +162,7 @@ static const enum base_hw_feature base_hw_features_tVAx[] = {
BASE_HW_FEATURE_END
};
-static const enum base_hw_feature base_hw_features_tTUx[] = {
+__attribute__((unused)) static const enum base_hw_feature base_hw_features_tTUx[] = {
BASE_HW_FEATURE_FLUSH_REDUCTION,
BASE_HW_FEATURE_PROTECTED_DEBUG_MODE,
BASE_HW_FEATURE_L2_CONFIG,
diff --git a/mali_kbase/mali_base_hwconfig_issues.h b/mali_kbase/mali_base_hwconfig_issues.h
index d188120..ad45325 100644
--- a/mali_kbase/mali_base_hwconfig_issues.h
+++ b/mali_kbase/mali_base_hwconfig_issues.h
@@ -63,11 +63,11 @@ enum base_hw_issue {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_generic[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_generic[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tMIx_r0p0_05dev0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tMIx_r0p0_05dev0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_10682,
BASE_HW_ISSUE_11054,
@@ -87,7 +87,7 @@ static const enum base_hw_issue base_hw_issues_tMIx_r0p0_05dev0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tMIx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tMIx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_10682,
BASE_HW_ISSUE_11054,
@@ -107,7 +107,7 @@ static const enum base_hw_issue base_hw_issues_tMIx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tMIx_r0p1[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tMIx_r0p1[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_10682,
BASE_HW_ISSUE_11054,
@@ -127,7 +127,7 @@ static const enum base_hw_issue base_hw_issues_tMIx_r0p1[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tMIx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tMIx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_7891,
@@ -142,7 +142,7 @@ static const enum base_hw_issue base_hw_issues_model_tMIx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tHEx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tHEx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_10682,
BASE_HW_ISSUE_11054,
@@ -155,7 +155,7 @@ static const enum base_hw_issue base_hw_issues_tHEx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tHEx_r0p1[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tHEx_r0p1[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_10682,
BASE_HW_ISSUE_11054,
@@ -168,7 +168,7 @@ static const enum base_hw_issue base_hw_issues_tHEx_r0p1[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tHEx_r0p2[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tHEx_r0p2[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_10682,
BASE_HW_ISSUE_11054,
@@ -181,7 +181,7 @@ static const enum base_hw_issue base_hw_issues_tHEx_r0p2[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tHEx_r0p3[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tHEx_r0p3[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_10682,
BASE_HW_ISSUE_TMIX_7891,
@@ -193,7 +193,7 @@ static const enum base_hw_issue base_hw_issues_tHEx_r0p3[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tHEx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tHEx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_7891,
@@ -203,7 +203,7 @@ static const enum base_hw_issue base_hw_issues_model_tHEx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tSIx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tSIx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_11054,
BASE_HW_ISSUE_TMIX_8133,
@@ -216,7 +216,7 @@ static const enum base_hw_issue base_hw_issues_tSIx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tSIx_r0p1[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tSIx_r0p1[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_11054,
BASE_HW_ISSUE_TMIX_8133,
@@ -229,7 +229,7 @@ static const enum base_hw_issue base_hw_issues_tSIx_r0p1[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tSIx_r1p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tSIx_r1p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_11054,
BASE_HW_ISSUE_TMIX_8133,
@@ -241,7 +241,7 @@ static const enum base_hw_issue base_hw_issues_tSIx_r1p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tSIx_r1p1[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tSIx_r1p1[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_8133,
BASE_HW_ISSUE_TSIX_1116,
@@ -252,7 +252,7 @@ static const enum base_hw_issue base_hw_issues_tSIx_r1p1[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tSIx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tSIx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_8133,
@@ -262,7 +262,7 @@ static const enum base_hw_issue base_hw_issues_model_tSIx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tDVx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tDVx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_8133,
BASE_HW_ISSUE_TSIX_1116,
@@ -273,7 +273,7 @@ static const enum base_hw_issue base_hw_issues_tDVx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tDVx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tDVx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_8133,
@@ -283,7 +283,7 @@ static const enum base_hw_issue base_hw_issues_model_tDVx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tNOx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tNOx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_8133,
BASE_HW_ISSUE_TSIX_1116,
@@ -295,7 +295,7 @@ static const enum base_hw_issue base_hw_issues_tNOx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tNOx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tNOx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_8133,
@@ -305,7 +305,7 @@ static const enum base_hw_issue base_hw_issues_model_tNOx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tGOx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tGOx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_8133,
BASE_HW_ISSUE_TSIX_1116,
@@ -317,7 +317,7 @@ static const enum base_hw_issue base_hw_issues_tGOx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tGOx_r1p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tGOx_r1p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_8133,
BASE_HW_ISSUE_TSIX_1116,
@@ -329,7 +329,7 @@ static const enum base_hw_issue base_hw_issues_tGOx_r1p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tGOx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tGOx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TMIX_8133,
@@ -339,7 +339,7 @@ static const enum base_hw_issue base_hw_issues_model_tGOx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tTRx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tTRx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -355,7 +355,7 @@ static const enum base_hw_issue base_hw_issues_tTRx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tTRx_r0p1[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tTRx_r0p1[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -371,7 +371,7 @@ static const enum base_hw_issue base_hw_issues_tTRx_r0p1[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tTRx_r0p2[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tTRx_r0p2[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -386,7 +386,7 @@ static const enum base_hw_issue base_hw_issues_tTRx_r0p2[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tTRx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tTRx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
@@ -398,7 +398,7 @@ static const enum base_hw_issue base_hw_issues_model_tTRx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tNAx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tNAx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -414,7 +414,7 @@ static const enum base_hw_issue base_hw_issues_tNAx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tNAx_r0p1[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tNAx_r0p1[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -429,7 +429,7 @@ static const enum base_hw_issue base_hw_issues_tNAx_r0p1[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tNAx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tNAx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
@@ -441,7 +441,7 @@ static const enum base_hw_issue base_hw_issues_model_tNAx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tBEx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tBEx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -455,7 +455,7 @@ static const enum base_hw_issue base_hw_issues_tBEx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tBEx_r0p1[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tBEx_r0p1[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -468,7 +468,7 @@ static const enum base_hw_issue base_hw_issues_tBEx_r0p1[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tBEx_r1p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tBEx_r1p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -481,7 +481,7 @@ static const enum base_hw_issue base_hw_issues_tBEx_r1p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tBEx_r1p1[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tBEx_r1p1[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -494,7 +494,7 @@ static const enum base_hw_issue base_hw_issues_tBEx_r1p1[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tBEx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tBEx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
@@ -506,7 +506,7 @@ static const enum base_hw_issue base_hw_issues_model_tBEx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_lBEx_r1p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_lBEx_r1p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -520,7 +520,7 @@ static const enum base_hw_issue base_hw_issues_lBEx_r1p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_lBEx_r1p1[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_lBEx_r1p1[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -533,7 +533,7 @@ static const enum base_hw_issue base_hw_issues_lBEx_r1p1[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tBAx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tBAx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -546,7 +546,7 @@ static const enum base_hw_issue base_hw_issues_tBAx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tBAx_r1p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tBAx_r1p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -559,7 +559,7 @@ static const enum base_hw_issue base_hw_issues_tBAx_r1p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tBAx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tBAx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
@@ -571,7 +571,7 @@ static const enum base_hw_issue base_hw_issues_model_tBAx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tDUx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tDUx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -581,7 +581,7 @@ static const enum base_hw_issue base_hw_issues_tDUx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tDUx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tDUx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
@@ -591,7 +591,7 @@ static const enum base_hw_issue base_hw_issues_model_tDUx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tODx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tODx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
@@ -599,7 +599,7 @@ static const enum base_hw_issue base_hw_issues_tODx_r0p0[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tODx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tODx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
@@ -608,14 +608,14 @@ static const enum base_hw_issue base_hw_issues_model_tODx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tGRx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tGRx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tGRx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tGRx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
@@ -623,14 +623,14 @@ static const enum base_hw_issue base_hw_issues_model_tGRx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tVAx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tVAx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tVAx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tVAx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
@@ -638,7 +638,7 @@ static const enum base_hw_issue base_hw_issues_model_tVAx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_model_tTUx[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tTUx[] = {
BASE_HW_ISSUE_5736,
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
@@ -646,7 +646,7 @@ static const enum base_hw_issue base_hw_issues_model_tTUx[] = {
BASE_HW_ISSUE_END
};
-static const enum base_hw_issue base_hw_issues_tTUx_r0p0[] = {
+__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tTUx_r0p0[] = {
BASE_HW_ISSUE_9435,
BASE_HW_ISSUE_TSIX_2033,
BASE_HW_ISSUE_TTRX_1337,
diff --git a/mali_kbase/mali_kbase_core_linux.c b/mali_kbase/mali_kbase_core_linux.c
index 2472c7c..0cbbf44 100644
--- a/mali_kbase/mali_kbase_core_linux.c
+++ b/mali_kbase/mali_kbase_core_linux.c
@@ -31,6 +31,10 @@
#include <ipa/mali_kbase_ipa_debugfs.h>
#endif /* CONFIG_DEVFREQ_THERMAL */
#endif /* CONFIG_MALI_DEVFREQ */
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+#include "backend/gpu/mali_kbase_model_linux.h"
+#include <backend/gpu/mali_kbase_model_dummy.h>
+#endif /* CONFIG_MALI_NO_MALI */
#include "mali_kbase_mem_profile_debugfs_buf_size.h"
#include "mali_kbase_mem.h"
#include "mali_kbase_mem_pool_debugfs.h"
@@ -52,7 +56,6 @@
#endif
#include "mali_kbase_hwcnt_context.h"
#include "mali_kbase_hwcnt_virtualizer.h"
-#include "mali_kbase_hwcnt_legacy.h"
#include "mali_kbase_kinstr_prfcnt.h"
#include "mali_kbase_vinstr.h"
#if MALI_USE_CSF
@@ -60,6 +63,7 @@
#include "csf/mali_kbase_csf_tiler_heap.h"
#include "csf/mali_kbase_csf_csg_debugfs.h"
#include "csf/mali_kbase_csf_cpu_queue_debugfs.h"
+#include "csf/mali_kbase_csf_event.h"
#endif
#ifdef CONFIG_MALI_ARBITER_SUPPORT
#include "arbiter/mali_kbase_arbiter_pm.h"
@@ -342,15 +346,6 @@ static void kbase_file_delete(struct kbase_file *const kfile)
#if IS_ENABLED(CONFIG_DEBUG_FS)
kbasep_mem_profile_debugfs_remove(kctx);
#endif
-
- mutex_lock(&kctx->legacy_hwcnt_lock);
- /* If this client was performing hardware counter dumping and
- * did not explicitly detach itself, destroy it now
- */
- kbase_hwcnt_legacy_client_destroy(kctx->legacy_hwcnt_cli);
- kctx->legacy_hwcnt_cli = NULL;
- mutex_unlock(&kctx->legacy_hwcnt_lock);
-
kbase_context_debugfs_term(kctx);
kbase_destroy_context(kctx);
@@ -905,62 +900,6 @@ static int kbase_api_hwcnt_reader_setup(struct kbase_context *kctx,
return kbase_vinstr_hwcnt_reader_setup(kctx->kbdev->vinstr_ctx, setup);
}
-static int kbase_api_hwcnt_enable(struct kbase_context *kctx,
- struct kbase_ioctl_hwcnt_enable *enable)
-{
- int ret;
-
- mutex_lock(&kctx->legacy_hwcnt_lock);
- if (enable->dump_buffer != 0) {
- /* Non-zero dump buffer, so user wants to create the client */
- if (kctx->legacy_hwcnt_cli == NULL) {
- ret = kbase_hwcnt_legacy_client_create(
- kctx->kbdev->hwcnt_gpu_virt,
- enable,
- &kctx->legacy_hwcnt_cli);
- } else {
- /* This context already has a client */
- ret = -EBUSY;
- }
- } else {
- /* Zero dump buffer, so user wants to destroy the client */
- if (kctx->legacy_hwcnt_cli != NULL) {
- kbase_hwcnt_legacy_client_destroy(
- kctx->legacy_hwcnt_cli);
- kctx->legacy_hwcnt_cli = NULL;
- ret = 0;
- } else {
- /* This context has no client to destroy */
- ret = -EINVAL;
- }
- }
- mutex_unlock(&kctx->legacy_hwcnt_lock);
-
- return ret;
-}
-
-static int kbase_api_hwcnt_dump(struct kbase_context *kctx)
-{
- int ret;
-
- mutex_lock(&kctx->legacy_hwcnt_lock);
- ret = kbase_hwcnt_legacy_client_dump(kctx->legacy_hwcnt_cli);
- mutex_unlock(&kctx->legacy_hwcnt_lock);
-
- return ret;
-}
-
-static int kbase_api_hwcnt_clear(struct kbase_context *kctx)
-{
- int ret;
-
- mutex_lock(&kctx->legacy_hwcnt_lock);
- ret = kbase_hwcnt_legacy_client_clear(kctx->legacy_hwcnt_cli);
- mutex_unlock(&kctx->legacy_hwcnt_lock);
-
- return ret;
-}
-
static int kbase_api_get_cpu_gpu_timeinfo(struct kbase_context *kctx,
union kbase_ioctl_get_cpu_gpu_timeinfo *timeinfo)
{
@@ -992,6 +931,17 @@ static int kbase_api_get_cpu_gpu_timeinfo(struct kbase_context *kctx,
return 0;
}
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+static int kbase_api_hwcnt_set(struct kbase_context *kctx,
+ struct kbase_ioctl_hwcnt_values *values)
+{
+ gpu_model_set_dummy_prfcnt_sample(
+ (u32 __user *)(uintptr_t)values->data,
+ values->size);
+
+ return 0;
+}
+#endif /* CONFIG_MALI_NO_MALI */
static int kbase_api_disjoint_query(struct kbase_context *kctx,
struct kbase_ioctl_disjoint_query *query)
@@ -1415,6 +1365,30 @@ static int kbasep_cs_queue_kick(struct kbase_context *kctx,
return kbase_csf_queue_kick(kctx, kick);
}
+static int kbasep_cs_queue_group_create_1_6(
+ struct kbase_context *kctx,
+ union kbase_ioctl_cs_queue_group_create_1_6 *create)
+{
+ union kbase_ioctl_cs_queue_group_create
+ new_create = { .in = {
+ .tiler_mask = create->in.tiler_mask,
+ .fragment_mask =
+ create->in.fragment_mask,
+ .compute_mask = create->in.compute_mask,
+ .cs_min = create->in.cs_min,
+ .priority = create->in.priority,
+ .tiler_max = create->in.tiler_max,
+ .fragment_max = create->in.fragment_max,
+ .compute_max = create->in.compute_max,
+ } };
+
+ int ret = kbase_csf_queue_group_create(kctx, &new_create);
+
+ create->out.group_handle = new_create.out.group_handle;
+ create->out.group_uid = new_create.out.group_uid;
+
+ return ret;
+}
static int kbasep_cs_queue_group_create(struct kbase_context *kctx,
union kbase_ioctl_cs_queue_group_create *create)
{
@@ -1873,28 +1847,20 @@ static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
struct kbase_ioctl_hwcnt_reader_setup,
kctx);
break;
- case KBASE_IOCTL_HWCNT_ENABLE:
- KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_HWCNT_ENABLE,
- kbase_api_hwcnt_enable,
- struct kbase_ioctl_hwcnt_enable,
- kctx);
- break;
- case KBASE_IOCTL_HWCNT_DUMP:
- KBASE_HANDLE_IOCTL(KBASE_IOCTL_HWCNT_DUMP,
- kbase_api_hwcnt_dump,
- kctx);
- break;
- case KBASE_IOCTL_HWCNT_CLEAR:
- KBASE_HANDLE_IOCTL(KBASE_IOCTL_HWCNT_CLEAR,
- kbase_api_hwcnt_clear,
- kctx);
- break;
case KBASE_IOCTL_GET_CPU_GPU_TIMEINFO:
KBASE_HANDLE_IOCTL_INOUT(KBASE_IOCTL_GET_CPU_GPU_TIMEINFO,
kbase_api_get_cpu_gpu_timeinfo,
union kbase_ioctl_get_cpu_gpu_timeinfo,
kctx);
break;
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ case KBASE_IOCTL_HWCNT_SET:
+ KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_HWCNT_SET,
+ kbase_api_hwcnt_set,
+ struct kbase_ioctl_hwcnt_values,
+ kctx);
+ break;
+#endif /* CONFIG_MALI_NO_MALI */
#ifdef CONFIG_MALI_CINSTR_GWT
case KBASE_IOCTL_CINSTR_GWT_START:
KBASE_HANDLE_IOCTL(KBASE_IOCTL_CINSTR_GWT_START,
@@ -1949,6 +1915,12 @@ static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
struct kbase_ioctl_cs_queue_kick,
kctx);
break;
+ case KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6:
+ KBASE_HANDLE_IOCTL_INOUT(
+ KBASE_IOCTL_CS_QUEUE_GROUP_CREATE_1_6,
+ kbasep_cs_queue_group_create_1_6,
+ union kbase_ioctl_cs_queue_group_create_1_6, kctx);
+ break;
case KBASE_IOCTL_CS_QUEUE_GROUP_CREATE:
KBASE_HANDLE_IOCTL_INOUT(KBASE_IOCTL_CS_QUEUE_GROUP_CREATE,
kbasep_cs_queue_group_create,
@@ -2048,7 +2020,7 @@ static ssize_t kbase_read(struct file *filp, char __user *buf, size_t count, lof
if (atomic_read(&kctx->event_count))
read_event = true;
else
- read_error = kbase_csf_read_error(kctx, &event_data);
+ read_error = kbase_csf_event_read_error(kctx, &event_data);
if (!read_event && !read_error) {
bool dump = kbase_csf_cpu_queue_read_dump_req(kctx,
@@ -2153,7 +2125,7 @@ int kbase_event_pending(struct kbase_context *ctx)
WARN_ON_ONCE(!ctx);
return (atomic_read(&ctx->event_count) != 0) ||
- kbase_csf_error_pending(ctx) ||
+ kbase_csf_event_error_pending(ctx) ||
kbase_csf_cpu_queue_dump_needed(ctx);
}
#else
@@ -3910,8 +3882,6 @@ static DEVICE_ATTR(js_ctx_scheduling_mode, S_IRUGO | S_IWUSR,
show_js_ctx_scheduling_mode,
set_js_ctx_scheduling_mode);
-#ifdef MALI_KBASE_BUILD
-
/* Number of entries in serialize_jobs_settings[] */
#define NR_SERIALIZE_JOBS_SETTINGS 5
/* Maximum string length in serialize_jobs_settings[].name */
@@ -4126,7 +4096,6 @@ static ssize_t store_serialize_jobs_sysfs(struct device *dev,
static DEVICE_ATTR(serialize_jobs, 0600, show_serialize_jobs_sysfs,
store_serialize_jobs_sysfs);
-#endif /* MALI_KBASE_BUILD */
#endif /* !MALI_USE_CSF */
static void kbasep_protected_mode_hwcnt_disable_worker(struct work_struct *data)
@@ -4222,6 +4191,15 @@ void kbase_protected_mode_term(struct kbase_device *kbdev)
kfree(kbdev->protected_dev);
}
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+static int kbase_common_reg_map(struct kbase_device *kbdev)
+{
+ return 0;
+}
+static void kbase_common_reg_unmap(struct kbase_device * const kbdev)
+{
+}
+#else /* CONFIG_MALI_NO_MALI */
static int kbase_common_reg_map(struct kbase_device *kbdev)
{
int err = 0;
@@ -4257,6 +4235,7 @@ static void kbase_common_reg_unmap(struct kbase_device * const kbdev)
kbdev->reg_size = 0;
}
}
+#endif /* CONFIG_MALI_NO_MALI */
int registers_map(struct kbase_device * const kbdev)
{
@@ -4574,7 +4553,6 @@ void power_control_term(struct kbase_device *kbdev)
#endif
}
-#ifdef MALI_KBASE_BUILD
#if IS_ENABLED(CONFIG_DEBUG_FS)
static void trigger_reset(struct kbase_device *kbdev)
@@ -4847,7 +4825,6 @@ void kbase_device_debugfs_term(struct kbase_device *kbdev)
debugfs_remove_recursive(kbdev->mali_debugfs_directory);
}
#endif /* CONFIG_DEBUG_FS */
-#endif /* MALI_KBASE_BUILD */
int kbase_device_coherency_init(struct kbase_device *kbdev)
{
@@ -5238,10 +5215,8 @@ static int kbase_platform_device_probe(struct platform_device *pdev)
dev_set_drvdata(kbdev->dev, NULL);
kbase_device_free(kbdev);
} else {
-#ifdef MALI_KBASE_BUILD
dev_info(kbdev->dev,
"Probed as %s\n", dev_name(kbdev->mdev.this_device));
-#endif /* MALI_KBASE_BUILD */
kbase_increment_device_id();
#ifdef CONFIG_MALI_ARBITER_SUPPORT
mutex_lock(&kbdev->pm.lock);
@@ -5262,7 +5237,7 @@ static int kbase_platform_device_probe(struct platform_device *pdev)
*
* @dev: The device to suspend
*
- * Return: A standard Linux error code
+ * Return: A standard Linux error code on failure, 0 otherwise.
*/
static int kbase_device_suspend(struct device *dev)
{
@@ -5271,7 +5246,10 @@ static int kbase_device_suspend(struct device *dev)
if (!kbdev)
return -ENODEV;
- kbase_pm_suspend(kbdev);
+ if (kbase_pm_suspend(kbdev)) {
+ dev_warn(kbdev->dev, "Abort suspend as GPU suspension failed");
+ return -EBUSY;
+ }
#ifdef CONFIG_MALI_MIDGARD_DVFS
kbase_pm_metrics_stop(kbdev);
@@ -5512,6 +5490,7 @@ MODULE_VERSION(MALI_RELEASE_NAME " (UK version " \
__stringify(BASE_UK_VERSION_MAJOR) "." \
__stringify(BASE_UK_VERSION_MINOR) ")");
MODULE_SOFTDEP("pre: memory_group_manager");
+MODULE_INFO(import_ns, "DMA_BUF");
#define CREATE_TRACE_POINTS
/* Create the trace points (otherwise we just get code to call a tracepoint) */
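The new KBASE_IOCTL_HWCNT_SET handler added above (NO_MALI builds only) lets user space inject a fabricated counter sample into the dummy GPU model. Below is a hedged user-space sketch of that call; the struct layout and ioctl encoding are assumptions that must be replaced by the kernel's own mali_kbase_ioctl.h, the device node path may differ, and a real client must first complete the usual version-check/set-flags handshake before other ioctls are accepted.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Assumed uapi definitions - in a real client these come verbatim from
 * mali_kbase_ioctl.h; do not trust the command number used here.
 */
struct kbase_ioctl_hwcnt_values {
	uint64_t data;    /* user pointer to the counter values */
	uint32_t size;    /* size of the buffer pointed to by data */
	uint32_t padding;
};
#define KBASE_IOCTL_TYPE 0x80
#define KBASE_IOCTL_HWCNT_SET \
	_IOW(KBASE_IOCTL_TYPE, 32, struct kbase_ioctl_hwcnt_values)

int main(void)
{
	uint32_t sample[64];
	struct kbase_ioctl_hwcnt_values values;
	int fd = open("/dev/mali0", O_RDWR);	/* device node may differ */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Arbitrary ramp pattern for the dummy counters. */
	for (unsigned int i = 0; i < 64; i++)
		sample[i] = i;

	memset(&values, 0, sizeof(values));
	values.data = (uint64_t)(uintptr_t)sample;
	values.size = sizeof(sample);	/* check units against the uapi header */

	if (ioctl(fd, KBASE_IOCTL_HWCNT_SET, &values))
		perror("KBASE_IOCTL_HWCNT_SET");

	close(fd);
	return 0;
}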
diff --git a/mali_kbase/mali_kbase_ctx_sched.c b/mali_kbase/mali_kbase_ctx_sched.c
index d06380d..8026e7f 100644
--- a/mali_kbase/mali_kbase_ctx_sched.c
+++ b/mali_kbase/mali_kbase_ctx_sched.c
@@ -23,6 +23,9 @@
#include <mali_kbase_defs.h>
#include "mali_kbase_ctx_sched.h"
#include "tl/mali_kbase_tracepoints.h"
+#if !MALI_USE_CSF
+#include <mali_kbase_hwaccess_jm.h>
+#endif
/* Helper for ktrace */
#if KBASE_KTRACE_ENABLE
@@ -124,7 +127,6 @@ int kbase_ctx_sched_retain_ctx(struct kbase_context *kctx)
kbdev, prev_kctx->id);
prev_kctx->as_nr = KBASEP_AS_NR_INVALID;
}
-
kctx->as_nr = free_as;
kbdev->as_to_kctx[free_as] = kctx;
KBASE_TLSTREAM_TL_KBASE_CTX_ASSIGN_AS(
@@ -173,6 +175,9 @@ void kbase_ctx_sched_release_ctx(struct kbase_context *kctx)
kbdev->as_to_kctx[kctx->as_nr] = NULL;
kctx->as_nr = KBASEP_AS_NR_INVALID;
kbase_ctx_flag_clear(kctx, KCTX_AS_DISABLED_ON_FAULT);
+#if !MALI_USE_CSF
+ kbase_backend_slot_kctx_purge_locked(kbdev, kctx);
+#endif
}
}
diff --git a/mali_kbase/mali_kbase_defs.h b/mali_kbase/mali_kbase_defs.h
index 5b1fdd3..86e4042 100644
--- a/mali_kbase/mali_kbase_defs.h
+++ b/mali_kbase/mali_kbase_defs.h
@@ -742,6 +742,7 @@ struct kbase_process {
* @hwcnt.addr: HW counter address
* @hwcnt.addr_bytes: HW counter size in bytes
* @hwcnt.backend: Kbase instrumentation backend
+ * @hwcnt_watchdog_timer: Hardware counter watchdog interface.
* @hwcnt_gpu_iface: Backend interface for GPU hardware counter access.
* @hwcnt_gpu_ctx: Context for GPU hardware counter access.
* @hwaccess_lock must be held when calling
@@ -770,8 +771,8 @@ struct kbase_process {
* @cache_clean_in_progress: Set when a cache clean has been started, and
* cleared when it has finished. This prevents multiple
* cache cleans being done simultaneously.
- * @cache_clean_queued: Set if a cache clean is invoked while another is in
- * progress. If this happens, another cache clean needs
+ * @cache_clean_queued: Pending cache clean operations invoked while another is
+ * in progress. If this is not 0, another cache clean needs
* to be triggered immediately after completion of the
* current one.
* @cache_clean_wait: Signalled when a cache clean has finished.
@@ -979,6 +980,15 @@ struct kbase_device {
char devname[DEVNAME_SIZE];
u32 id;
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ void *model;
+ struct kmem_cache *irq_slab;
+ struct workqueue_struct *irq_workq;
+ atomic_t serving_job_irq;
+ atomic_t serving_gpu_irq;
+ atomic_t serving_mmu_irq;
+ spinlock_t reg_op_lock;
+#endif /* CONFIG_MALI_NO_MALI */
struct kbase_pm_device_data pm;
struct kbase_mem_pool_group mem_pools;
@@ -1008,6 +1018,7 @@ struct kbase_device {
#if MALI_USE_CSF
struct kbase_hwcnt_backend_csf_if hwcnt_backend_csf_if_fw;
+ struct kbase_hwcnt_watchdog_interface hwcnt_watchdog_timer;
#else
struct kbase_hwcnt {
spinlock_t lock;
@@ -1037,7 +1048,7 @@ struct kbase_device {
u64 lowest_gpu_freq_khz;
bool cache_clean_in_progress;
- bool cache_clean_queued;
+ u32 cache_clean_queued;
wait_queue_head_t cache_clean_wait;
void *platform_context;
@@ -1205,6 +1216,7 @@ struct kbase_device {
struct priority_control_manager_device *pcm_dev;
struct notifier_block oom_notifier_block;
+
};
/**
@@ -1562,6 +1574,12 @@ struct kbase_sub_alloc {
* pages used for GPU allocations, done for the context,
* to the memory consumed by the process.
* @gpu_va_end: End address of the GPU va space (in 4KB page units)
+ * @running_total_tiler_heap_nr_chunks: Running total of number of chunks in all
+ * tiler heaps of the kbase context.
+ * @running_total_tiler_heap_memory: Running total of the tiler heap memory in the
+ * kbase context.
+ * @peak_total_tiler_heap_memory: Peak value of the total tiler heap memory in the
+ * kbase context.
* @jit_va: Indicates if a JIT_VA zone has been created.
* @mem_profile_data: Buffer containing the profiling information provided by
* Userspace, can be read through the mem_profile debugfs file.
@@ -1588,11 +1606,6 @@ struct kbase_sub_alloc {
* @slots_pullable: Bitmask of slots, indicating the slots for which the
* context has pullable atoms in the runnable tree.
* @work: Work structure used for deferred ASID assignment.
- * @legacy_hwcnt_cli: Pointer to the legacy userspace hardware counters
- * client, there can be only such client per kbase
- * context.
- * @legacy_hwcnt_lock: Lock used to prevent concurrent access to
- * @legacy_hwcnt_cli.
* @completed_jobs: List containing completed atoms for which base_jd_event is
* to be posted.
* @work_count: Number of work items, corresponding to atoms, currently
@@ -1775,6 +1788,11 @@ struct kbase_context {
spinlock_t mm_update_lock;
struct mm_struct __rcu *process_mm;
u64 gpu_va_end;
+#if MALI_USE_CSF
+ u32 running_total_tiler_heap_nr_chunks;
+ u64 running_total_tiler_heap_memory;
+ u64 peak_total_tiler_heap_memory;
+#endif
bool jit_va;
#if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -1788,10 +1806,6 @@ struct kbase_context {
struct list_head job_fault_resume_event_list;
#endif /* CONFIG_DEBUG_FS */
-
- struct kbase_hwcnt_legacy_client *legacy_hwcnt_cli;
- struct mutex legacy_hwcnt_lock;
-
struct kbase_va_region *jit_alloc[1 + BASE_JIT_ALLOC_COUNT];
u8 jit_max_allocations;
u8 jit_current_allocations;
diff --git a/mali_kbase/mali_kbase_gpuprops.c b/mali_kbase/mali_kbase_gpuprops.c
index 967c08e..b5ba642 100644
--- a/mali_kbase/mali_kbase_gpuprops.c
+++ b/mali_kbase/mali_kbase_gpuprops.c
@@ -371,6 +371,7 @@ static void kbase_gpuprops_calculate_props(
gpu_id = kbdev->gpu_props.props.raw_props.gpu_id;
#if MALI_USE_CSF
+ CSTD_UNUSED(gpu_id);
gpu_props->thread_props.max_registers =
KBASE_UBFX32(gpu_props->raw_props.thread_features,
0U, 22);
diff --git a/mali_kbase/mali_kbase_hwaccess_instr.h b/mali_kbase/mali_kbase_hwaccess_instr.h
index 819ca13..3766310 100644
--- a/mali_kbase/mali_kbase_hwaccess_instr.h
+++ b/mali_kbase/mali_kbase_hwaccess_instr.h
@@ -144,4 +144,27 @@ void kbase_instr_backend_term(struct kbase_device *kbdev);
void kbase_instr_backend_debugfs_init(struct kbase_device *kbdev);
#endif
+/**
+ * kbase_instr_hwcnt_on_unrecoverable_error() - JM HWC instr backend function
+ * called when unrecoverable errors
+ * are detected.
+ * @kbdev: Kbase device
+ *
+ * This should be called on encountering errors that can only be recovered from
+ * with a reset, or that may put the HWC logic in a state that could result in
+ * a hang, for example when the HW becomes unresponsive.
+ *
+ * Caller must hold kbdev->hwaccess_lock.
+ */
+void kbase_instr_hwcnt_on_unrecoverable_error(struct kbase_device *kbdev);
+
+/**
+ * kbase_instr_hwcnt_on_before_reset() - JM HWC instr backend function to be
+ * called immediately before a reset.
+ * Takes us out of the unrecoverable
+ * error state, if we were in it.
+ * @kbdev: Kbase device
+ */
+void kbase_instr_hwcnt_on_before_reset(struct kbase_device *kbdev);
+
#endif /* _KBASE_HWACCESS_INSTR_H_ */
diff --git a/mali_kbase/mali_kbase_hwaccess_jm.h b/mali_kbase/mali_kbase_hwaccess_jm.h
index 8689647..d0207f7 100644
--- a/mali_kbase/mali_kbase_hwaccess_jm.h
+++ b/mali_kbase/mali_kbase_hwaccess_jm.h
@@ -299,4 +299,21 @@ void kbase_job_slot_hardstop(struct kbase_context *kctx, int js,
*/
bool kbase_gpu_atoms_submitted_any(struct kbase_device *kbdev);
+/**
+ * kbase_backend_slot_kctx_purge_locked - Perform a purge on the slot_rb tracked
+ * kctx
+ *
+ * @kbdev: Device pointer
+ * @kctx: The kbase context that needs to be purged from slot_rb[]
+ *
+ * For JM GPUs, the L1 read only caches may need a start_flush invalidation,
+ * potentially on all slots (even if the kctx was only using a single slot),
+ * following a context termination or address-space ID recycle. This function
+ * performs a clean-up purge on the given kctx if it has been tracked by the
+ * slot_rb[] objects.
+ *
+ * Caller must hold kbase_device->hwaccess_lock.
+ */
+void kbase_backend_slot_kctx_purge_locked(struct kbase_device *kbdev, struct kbase_context *kctx);
+
#endif /* _KBASE_HWACCESS_JM_H_ */
diff --git a/mali_kbase/mali_kbase_hwaccess_pm.h b/mali_kbase/mali_kbase_hwaccess_pm.h
index 36bbe2d..a8e4b95 100644
--- a/mali_kbase/mali_kbase_hwaccess_pm.h
+++ b/mali_kbase/mali_kbase_hwaccess_pm.h
@@ -85,8 +85,10 @@ void kbase_hwaccess_pm_halt(struct kbase_device *kbdev);
* Perform any backend-specific actions to suspend the GPU
*
* @kbdev: The kbase device structure for the device (must be a valid pointer)
+ *
+ * Return: 0 if suspend was successful.
*/
-void kbase_hwaccess_pm_suspend(struct kbase_device *kbdev);
+int kbase_hwaccess_pm_suspend(struct kbase_device *kbdev);
/**
* Perform any backend-specific actions to resume the GPU from a suspend
diff --git a/mali_kbase/mali_kbase_hwcnt.c b/mali_kbase/mali_kbase_hwcnt.c
index ea4893d..1fa6640 100644
--- a/mali_kbase/mali_kbase_hwcnt.c
+++ b/mali_kbase/mali_kbase_hwcnt.c
@@ -158,7 +158,6 @@ int kbase_hwcnt_context_init(
return 0;
- destroy_workqueue(hctx->wq);
err_alloc_workqueue:
kfree(hctx);
err_alloc_hctx:
diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf.c b/mali_kbase/mali_kbase_hwcnt_backend_csf.c
index 7ba1671..4602138 100644
--- a/mali_kbase/mali_kbase_hwcnt_backend_csf.c
+++ b/mali_kbase/mali_kbase_hwcnt_backend_csf.c
@@ -36,16 +36,24 @@
#define BASE_MAX_NR_CLOCKS_REGULATORS 2
#endif
+/* Backend watchdog timer interval in milliseconds: 1 second. */
+#define HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS ((u32)1000)
+
/**
* enum kbase_hwcnt_backend_csf_dump_state - HWC CSF backend dumping states.
*
* @KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE: Initial state, or the state if there is
* an error.
*
- * @KBASE_HWCNT_BACKEND_CSF_DUMP_REQUESTED: A dump has been requested and we are
- * waiting for an ACK, this ACK could come from either PRFCNT_ACK,
+ * @KBASE_HWCNT_BACKEND_CSF_DUMP_REQUESTED: A user dump has been requested and
+ * we are waiting for an ACK, this ACK could come from either PRFCNT_ACK,
* PROTMODE_ENTER_ACK, or if an error occurs.
*
+ * @KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED: A watchdog dump has been
+ * requested and we are waiting for an ACK - this ACK could come from
+ * PRFCNT_ACK, or if an error occurs. PROTMODE_ENTER_ACK does not apply here
+ * since a watchdog request cannot be triggered in protected mode.
+ *
* @KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT: Checking the insert
* immediately after receiving the ACK, so we know which index corresponds to
* the buffer we requested.
@@ -60,18 +68,25 @@
* @KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED: The dump completed successfully.
*
* Valid state transitions:
- * IDLE -> REQUESTED (on dump request)
- * REQUESTED -> QUERYING_INSERT (on dump ack)
+ * IDLE -> REQUESTED (on user dump request)
+ * IDLE -> WATCHDOG_REQUESTED (on watchdog request)
+ * IDLE -> QUERYING_INSERT (on user dump request in protected mode)
+ * REQUESTED -> QUERYING_INSERT (on dump acknowledged from firmware)
+ * WATCHDOG_REQUESTED -> REQUESTED (on user dump request)
+ * WATCHDOG_REQUESTED -> COMPLETED (on dump acknowledged from firmware for watchdog request)
* QUERYING_INSERT -> WORKER_LAUNCHED (on worker submission)
* WORKER_LAUNCHED -> ACCUMULATING (while the worker is accumulating)
* ACCUMULATING -> COMPLETED (on accumulation completion)
- * COMPLETED -> REQUESTED (on dump request)
+ * COMPLETED -> QUERYING_INSERT (on user dump request in protected mode)
+ * COMPLETED -> REQUESTED (on user dump request)
+ * COMPLETED -> WATCHDOG_REQUESTED (on watchdog request)
* COMPLETED -> IDLE (on disable)
* ANY -> IDLE (on error)
*/
enum kbase_hwcnt_backend_csf_dump_state {
KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE,
KBASE_HWCNT_BACKEND_CSF_DUMP_REQUESTED,
+ KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED,
KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT,
KBASE_HWCNT_BACKEND_CSF_DUMP_WORKER_LAUNCHED,
KBASE_HWCNT_BACKEND_CSF_DUMP_ACCUMULATING,
@@ -136,6 +151,7 @@ enum kbase_hwcnt_backend_csf_enable_state {
* @counter_set: The performance counter set to use.
* @metadata: Hardware counter metadata.
* @prfcnt_info: Performance counter information.
+ * @watchdog_if: Watchdog interface object pointer.
*/
struct kbase_hwcnt_backend_csf_info {
struct kbase_hwcnt_backend_csf *backend;
@@ -146,6 +162,7 @@ struct kbase_hwcnt_backend_csf_info {
enum kbase_hwcnt_set counter_set;
const struct kbase_hwcnt_metadata *metadata;
struct kbase_hwcnt_backend_csf_if_prfcnt_info prfcnt_info;
+ struct kbase_hwcnt_watchdog_interface *watchdog_if;
};
/**
@@ -192,6 +209,10 @@ struct kbase_hwcnt_csf_physical_layout {
* @old_sample_buf: HWC sample buffer to save the previous values
* for delta calculation, size
* prfcnt_info.dump_bytes.
+ * @watchdog_last_seen_insert_idx: The insert index which the watchdog has last
+ * seen, to check any new firmware automatic
+ * samples generated during the watchdog
+ * period.
* @ring_buf: Opaque pointer for ring buffer object.
* @ring_buf_cpu_base: CPU base address of the allocated ring buffer.
* @clk_enable_map: The enable map specifying enabled clock domains.
@@ -204,6 +225,8 @@ struct kbase_hwcnt_csf_physical_layout {
* it is completed accumulating up to the
* insert_index_to_accumulate.
* Should be initialized to the "complete" state.
+ * @user_requested: Flag to indicate a dump_request called from
+ * user.
* @hwc_dump_workq: Single threaded work queue for HWC workers
* execution.
* @hwc_dump_work: Worker to accumulate samples.
@@ -219,6 +242,7 @@ struct kbase_hwcnt_backend_csf {
u64 *to_user_buf;
u64 *accum_buf;
u32 *old_sample_buf;
+ u32 watchdog_last_seen_insert_idx;
struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf;
void *ring_buf_cpu_base;
u64 clk_enable_map;
@@ -226,6 +250,7 @@ struct kbase_hwcnt_backend_csf {
u64 prev_cycle_count[BASE_MAX_NR_CLOCKS_REGULATORS];
struct kbase_hwcnt_csf_physical_layout phys_layout;
struct completion dump_completed;
+ bool user_requested;
struct workqueue_struct *hwc_dump_workq;
struct work_struct hwc_dump_work;
struct work_struct hwc_threshold_work;
@@ -594,6 +619,10 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples(
backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags);
backend_csf->info->csf_if->set_extract_index(
backend_csf->info->csf_if->ctx, insert_index_to_stop);
+ /* Update the watchdog last seen index to check any new FW auto samples
+ * in the next watchdog callback.
+ */
+ backend_csf->watchdog_last_seen_insert_idx = insert_index_to_stop;
backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx,
flags);
}
@@ -612,6 +641,67 @@ static void kbasep_hwcnt_backend_csf_change_es_and_wake_waiters(
}
}
+static void kbasep_hwcnt_backend_watchdog_timer_cb(void *info)
+{
+ struct kbase_hwcnt_backend_csf_info *csf_info = info;
+ struct kbase_hwcnt_backend_csf *backend_csf;
+ unsigned long flags;
+
+ csf_info->csf_if->lock(csf_info->csf_if->ctx, &flags);
+
+ if (WARN_ON(!kbasep_hwcnt_backend_csf_backend_exists(csf_info))) {
+ csf_info->csf_if->unlock(csf_info->csf_if->ctx, flags);
+ return;
+ }
+
+ backend_csf = csf_info->backend;
+
+ /* Only do watchdog request when all conditions are met: */
+ if (/* 1. Backend is enabled. */
+ (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_ENABLED) &&
+ /* 2. FW is not in protected mode. */
+ (!csf_info->fw_in_protected_mode) &&
+ /* 3. dump state indicates no other dumping is in progress. */
+ ((backend_csf->dump_state == KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE) ||
+ (backend_csf->dump_state ==
+ KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED))) {
+ u32 extract_index;
+ u32 insert_index;
+
+ /* Read the raw extract and insert indexes from the CSF interface. */
+ csf_info->csf_if->get_indexes(csf_info->csf_if->ctx,
+ &extract_index, &insert_index);
+
+ /* Do watchdog request if no new FW auto samples. */
+ if (insert_index ==
+ backend_csf->watchdog_last_seen_insert_idx) {
+ /* Trigger the watchdog request. */
+ csf_info->csf_if->dump_request(csf_info->csf_if->ctx);
+
+ /* A watchdog dump is required, change the state to
+ * start the request process.
+ */
+ backend_csf->dump_state =
+ KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED;
+ }
+ }
+
+ /* Must schedule another callback when in the transitional state because
+ * this function can be called for the first time before the
+ * performance-counter-enable interrupt has been received.
+ */
+ if ((backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_ENABLED) ||
+ (backend_csf->enable_state ==
+ KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED)) {
+ /* Reschedule the timer for next watchdog callback. */
+ csf_info->watchdog_if->modify(
+ csf_info->watchdog_if->timer,
+ HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS);
+ }
+
+ csf_info->csf_if->unlock(csf_info->csf_if->ctx, flags);
+}
+
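To summarise the gating done by the callback above, here is a standalone C sketch of the decision: a watchdog dump is only requested when the backend is enabled, the FW is not in protected mode, no other dump is mid-flight, and the firmware has not already produced a new automatic sample since the last check. The struct and enum are simplified stand-ins for the driver's state, not its real definitions.

#include <stdbool.h>
#include <stdio.h>

enum dump_state { DUMP_IDLE, DUMP_REQUESTED, DUMP_WATCHDOG_REQUESTED,
		  DUMP_QUERYING_INSERT, DUMP_WORKER_LAUNCHED,
		  DUMP_ACCUMULATING, DUMP_COMPLETED };

struct backend_model {
	bool enabled;
	bool fw_in_protected_mode;
	enum dump_state dump_state;
	unsigned int last_seen_insert_idx;
};

static bool watchdog_should_request(const struct backend_model *b,
				    unsigned int fw_insert_idx)
{
	if (!b->enabled || b->fw_in_protected_mode)
		return false;
	if (b->dump_state != DUMP_IDLE && b->dump_state != DUMP_COMPLETED)
		return false;
	/* If the FW already generated a new auto sample, no watchdog dump
	 * is needed this period.
	 */
	return fw_insert_idx == b->last_seen_insert_idx;
}

int main(void)
{
	struct backend_model b = { .enabled = true, .dump_state = DUMP_IDLE,
				   .last_seen_insert_idx = 4 };

	printf("request? %d\n", watchdog_should_request(&b, 4)); /* 1: no new sample */
	printf("request? %d\n", watchdog_should_request(&b, 5)); /* 0: FW auto sample seen */
	return 0;
}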
/**
* kbasep_hwcnt_backend_csf_dump_worker() - HWC dump worker.
* @work: Work structure.
@@ -826,6 +916,7 @@ static int kbasep_hwcnt_backend_csf_dump_enable_nolock(
struct kbase_hwcnt_backend_csf *backend_csf =
(struct kbase_hwcnt_backend_csf *)backend;
struct kbase_hwcnt_backend_csf_if_enable enable;
+ int err;
if (!backend_csf || !enable_map ||
(enable_map->metadata != backend_csf->info->metadata))
@@ -841,6 +932,13 @@ static int kbasep_hwcnt_backend_csf_dump_enable_nolock(
if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_DISABLED)
return -EIO;
+ err = backend_csf->info->watchdog_if->enable(
+ backend_csf->info->watchdog_if->timer,
+ HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS,
+ kbasep_hwcnt_backend_watchdog_timer_cb, backend_csf->info);
+ if (err)
+ return err;
+
backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE;
WARN_ON(!completion_done(&backend_csf->dump_completed));
kbasep_hwcnt_backend_csf_change_es_and_wake_waiters(
@@ -948,6 +1046,13 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend)
backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx,
flags);
+ /* Deregister the timer and block until any timer callback has completed.
+ * We've transitioned out of the ENABLED state so we can guarantee it
+ * won't reschedule itself.
+ */
+ backend_csf->info->watchdog_if->disable(
+ backend_csf->info->watchdog_if->timer);
+
/* Block until any async work has completed. We have transitioned out of
* the ENABLED state so we can guarantee no new work will concurrently
* be submitted.
@@ -978,6 +1083,9 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend)
break;
}
+ backend_csf->user_requested = false;
+ backend_csf->watchdog_last_seen_insert_idx = 0;
+
backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx,
flags);
@@ -1006,6 +1114,7 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend,
struct kbase_hwcnt_backend_csf *backend_csf =
(struct kbase_hwcnt_backend_csf *)backend;
bool do_request = false;
+ bool watchdog_dumping = false;
if (!backend_csf)
return -EINVAL;
@@ -1022,6 +1131,7 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend,
KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED;
*dump_time_ns = kbasep_hwcnt_backend_csf_timestamp_ns(backend);
kbasep_hwcnt_backend_csf_cc_update(backend_csf);
+ backend_csf->user_requested = true;
backend_csf->info->csf_if->unlock(
backend_csf->info->csf_if->ctx, flags);
return 0;
@@ -1035,11 +1145,21 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend,
}
/* Make sure that this is either the first request since enable or the
- * previous dump has completed, so we can avoid midway through a dump.
+ * previous user dump has completed or a watchdog dump is in progress,
+ * so that we never start midway through another user dump.
+ * If a user request arrives while a watchdog dump is in progress,
+ * the user request takes ownership of the watchdog sample by
+ * changing the dump_state, so the interrupt for the watchdog
+ * request is processed instead of being ignored.
*/
if ((backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE) &&
(backend_csf->dump_state !=
- KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED)) {
+ KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED) &&
+ (backend_csf->dump_state !=
+ KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED)) {
+ /* HWC is disabled or another user dump is ongoing,
+ * or we're on fault.
+ */
backend_csf->info->csf_if->unlock(
backend_csf->info->csf_if->ctx, flags);
/* HWC is disabled or another dump is ongoing, or we are on
@@ -1051,6 +1171,10 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend,
/* Reset the completion so dump_wait() has something to wait on. */
reinit_completion(&backend_csf->dump_completed);
+ if (backend_csf->dump_state ==
+ KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED)
+ watchdog_dumping = true;
+
if ((backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_ENABLED) &&
!backend_csf->info->fw_in_protected_mode) {
/* Only do the request if we are fully enabled and not in
@@ -1078,15 +1202,29 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend,
*dump_time_ns = kbasep_hwcnt_backend_csf_timestamp_ns(backend);
kbasep_hwcnt_backend_csf_cc_update(backend_csf);
+ backend_csf->user_requested = true;
- if (do_request)
- backend_csf->info->csf_if->dump_request(
- backend_csf->info->csf_if->ctx);
- else
+ if (do_request) {
+ /* If a watchdog dump is in progress, there is no need to issue
+ * another request; just update the dump_state and take
+ * ownership of the sample that the watchdog requested.
+ */
+ if (!watchdog_dumping)
+ backend_csf->info->csf_if->dump_request(
+ backend_csf->info->csf_if->ctx);
+ } else
kbase_hwcnt_backend_csf_submit_dump_worker(backend_csf->info);
backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx,
flags);
+
+ /* Push back the watchdog timer so the next regular check is
+ * delayed, since a dump has just been requested.
+ */
+ backend_csf->info->watchdog_if->modify(
+ backend_csf->info->watchdog_if->timer,
+ HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS);
+
return 0;
}
@@ -1105,11 +1243,18 @@ kbasep_hwcnt_backend_csf_dump_wait(struct kbase_hwcnt_backend *backend)
wait_for_completion(&backend_csf->dump_completed);
backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags);
- /* Make sure the last dump actually succeeded. */
- errcode = (backend_csf->dump_state ==
- KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED) ?
- 0 :
- -EIO;
+ /* When a user dump was requested, make sure the last dump actually
+ * succeeded.
+ */
+ if (backend_csf->user_requested &&
+ ((backend_csf->dump_state ==
+ KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED) ||
+ (backend_csf->dump_state ==
+ KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED)))
+ errcode = 0;
+ else
+ errcode = -EIO;
+
backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx,
flags);
@@ -1155,13 +1300,16 @@ static int kbasep_hwcnt_backend_csf_dump_get(
(dst_enable_map->metadata != dst->metadata))
return -EINVAL;
+ /* Extract elapsed cycle count for each clock domain if enabled. */
kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) {
if (!kbase_hwcnt_clk_enable_map_enabled(
dst_enable_map->clk_enable_map, clk))
continue;
- /* Extract elapsed cycle count for each clock domain. */
- dst->clk_cnt_buf[clk] = backend_csf->cycle_count_elapsed[clk];
+ /* Reset the counter to zero if accumulation is off. */
+ if (!accumulate)
+ dst->clk_cnt_buf[clk] = 0;
+ dst->clk_cnt_buf[clk] += backend_csf->cycle_count_elapsed[clk];
}
/* We just return the user buffer without checking the current state,
@@ -1279,6 +1427,8 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info,
backend_csf->enable_state = KBASE_HWCNT_BACKEND_CSF_DISABLED;
backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE;
complete_all(&backend_csf->dump_completed);
+ backend_csf->user_requested = false;
+ backend_csf->watchdog_last_seen_insert_idx = 0;
*out_backend = backend_csf;
return 0;
@@ -1401,38 +1551,41 @@ static void kbasep_hwcnt_backend_csf_info_destroy(
* used to create backend interface.
* @ring_buf_cnt: The buffer count of the CSF hwcnt backend ring buffer.
* MUST be power of 2.
+ * @watchdog_if: Non-NULL pointer to a hwcnt watchdog interface structure used to create
+ * backend interface.
* @out_info: Non-NULL pointer to where info is stored on success.
* @return 0 on success, else error code.
*/
static int kbasep_hwcnt_backend_csf_info_create(
struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt,
+ struct kbase_hwcnt_watchdog_interface *watchdog_if,
const struct kbase_hwcnt_backend_csf_info **out_info)
{
struct kbase_hwcnt_backend_csf_info *info = NULL;
- WARN_ON(!csf_if);
- WARN_ON(!out_info);
- WARN_ON(!is_power_of_2(ring_buf_cnt));
+ if (WARN_ON(!csf_if) || WARN_ON(!watchdog_if) || WARN_ON(!out_info) ||
+ WARN_ON(!is_power_of_2(ring_buf_cnt)))
+ return -EINVAL;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
+ info = kmalloc(sizeof(*info), GFP_KERNEL);
if (!info)
return -ENOMEM;
+ *info = (struct kbase_hwcnt_backend_csf_info)
+ {
#if defined(CONFIG_MALI_PRFCNT_SET_SECONDARY)
- info->counter_set = KBASE_HWCNT_SET_SECONDARY;
+ .counter_set = KBASE_HWCNT_SET_SECONDARY,
#elif defined(CONFIG_MALI_PRFCNT_SET_TERTIARY)
- info->counter_set = KBASE_HWCNT_SET_TERTIARY;
+ .counter_set = KBASE_HWCNT_SET_TERTIARY,
#else
- /* Default to primary */
- info->counter_set = KBASE_HWCNT_SET_PRIMARY;
+ /* Default to primary */
+ .counter_set = KBASE_HWCNT_SET_PRIMARY,
#endif
-
- info->backend = NULL;
- info->csf_if = csf_if;
- info->ring_buf_cnt = ring_buf_cnt;
- info->fw_in_protected_mode = false;
- info->unrecoverable_error_happened = false;
-
+ .backend = NULL, .csf_if = csf_if, .ring_buf_cnt = ring_buf_cnt,
+ .fw_in_protected_mode = false,
+ .unrecoverable_error_happened = false,
+ .watchdog_if = watchdog_if,
+ };
*out_info = info;
return 0;
@@ -1653,6 +1806,14 @@ void kbase_hwcnt_backend_csf_on_prfcnt_sample(
return;
backend_csf = csf_info->backend;
+ /* Skip the dump_work if it's a watchdog request. */
+ if (backend_csf->dump_state ==
+ KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED) {
+ backend_csf->dump_state =
+ KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED;
+ return;
+ }
+
/* If the current state is not REQUESTED, this HWC sample will be
* skipped and processed in next dump_request.
*/
@@ -1831,14 +1992,15 @@ void kbase_hwcnt_backend_csf_metadata_term(
}
}
-int kbase_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_if *csf_if,
- u32 ring_buf_cnt,
- struct kbase_hwcnt_backend_interface *iface)
+int kbase_hwcnt_backend_csf_create(
+ struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt,
+ struct kbase_hwcnt_watchdog_interface *watchdog_if,
+ struct kbase_hwcnt_backend_interface *iface)
{
int errcode;
const struct kbase_hwcnt_backend_csf_info *info = NULL;
- if (!iface || !csf_if)
+ if (!iface || !csf_if || !watchdog_if)
return -EINVAL;
/* The buffer count must be power of 2 */
@@ -1846,7 +2008,7 @@ int kbase_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_if *csf_if,
return -EINVAL;
errcode = kbasep_hwcnt_backend_csf_info_create(csf_if, ring_buf_cnt,
- &info);
+ watchdog_if, &info);
if (errcode)
return errcode;
diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf.h b/mali_kbase/mali_kbase_hwcnt_backend_csf.h
index bfdf140..e0cafbe 100644
--- a/mali_kbase/mali_kbase_hwcnt_backend_csf.h
+++ b/mali_kbase/mali_kbase_hwcnt_backend_csf.h
@@ -29,6 +29,7 @@
#include "mali_kbase_hwcnt_backend.h"
#include "mali_kbase_hwcnt_backend_csf_if.h"
+#include "mali_kbase_hwcnt_watchdog_if.h"
/**
* kbase_hwcnt_backend_csf_create() - Create a CSF hardware counter backend
@@ -37,6 +38,8 @@
* used to create backend interface.
* @ring_buf_cnt: The buffer count of CSF hwcnt backend, used when allocate ring
* buffer, MUST be power of 2.
+ * @watchdog_if: Non-NULL pointer to a hwcnt watchdog interface structure used
+ * to create backend interface.
* @iface: Non-NULL pointer to backend interface structure that is filled
* in on creation success.
*
@@ -44,9 +47,10 @@
*
* Return: 0 on success, else error code.
*/
-int kbase_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_if *csf_if,
- u32 ring_buf_cnt,
- struct kbase_hwcnt_backend_interface *iface);
+int kbase_hwcnt_backend_csf_create(
+ struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt,
+ struct kbase_hwcnt_watchdog_interface *watchdog_if,
+ struct kbase_hwcnt_backend_interface *iface);
/**
* kbase_hwcnt_backend_csf_metadata_init() - Initialize the metadata for a CSF
diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c b/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c
index 124224d..40cf6bb 100644
--- a/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c
+++ b/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c
@@ -38,6 +38,9 @@
#include <linux/log2.h>
#include "mali_kbase_ccswe.h"
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+#include <backend/gpu/mali_kbase_model_dummy.h>
+#endif /* CONFIG_MALI_NO_MALI */
/** The number of nanoseconds in a second. */
#define NSECS_IN_SEC 1000000000ull /* ns */
@@ -217,6 +220,26 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info(
struct kbase_hwcnt_backend_csf_if_ctx *ctx,
struct kbase_hwcnt_backend_csf_if_prfcnt_info *prfcnt_info)
{
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ size_t dummy_model_blk_count;
+ struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx =
+ (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx;
+
+ prfcnt_info->l2_count = KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS;
+ prfcnt_info->core_mask =
+ (1ull << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1;
+ /* 1 FE block + 1 Tiler block + l2_count blocks + shader_core blocks */
+ dummy_model_blk_count =
+ 2 + prfcnt_info->l2_count + fls64(prfcnt_info->core_mask);
+ prfcnt_info->dump_bytes =
+ dummy_model_blk_count * KBASE_DUMMY_MODEL_BLOCK_SIZE;
+ prfcnt_info->prfcnt_block_size =
+ KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK *
+ KBASE_HWCNT_VALUE_HW_BYTES;
+ prfcnt_info->clk_cnt = 1;
+ prfcnt_info->clearing_samples = true;
+ fw_ctx->buf_bytes = prfcnt_info->dump_bytes;
+#else
struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx;
struct kbase_device *kbdev;
u32 prfcnt_size;
@@ -261,6 +284,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info(
/* Total size must be multiple of block size. */
WARN_ON((prfcnt_info->dump_bytes % prfcnt_info->prfcnt_block_size) !=
0);
+#endif
}
static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc(
@@ -355,6 +379,11 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc(
*out_ring_buf =
(struct kbase_hwcnt_backend_csf_if_ring_buf *)fw_ring_buf;
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ /* The dummy model needs the CPU mapping. */
+ gpu_model_set_dummy_prfcnt_base_cpu(fw_ring_buf->cpu_dump_base, kbdev,
+ phys, num_pages);
+#endif /* CONFIG_MALI_NO_MALI */
return 0;
diff --git a/mali_kbase/mali_kbase_hwcnt_backend_jm.c b/mali_kbase/mali_kbase_hwcnt_backend_jm.c
index 56bb1b6..d041391 100644
--- a/mali_kbase/mali_kbase_hwcnt_backend_jm.c
+++ b/mali_kbase/mali_kbase_hwcnt_backend_jm.c
@@ -28,6 +28,9 @@
#include "mali_kbase_hwaccess_time.h"
#include "mali_kbase_ccswe.h"
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+#include "backend/gpu/mali_kbase_model_dummy.h"
+#endif /* CONFIG_MALI_NO_MALI */
#include "backend/gpu/mali_kbase_clk_rate_trace_mgr.h"
#include "backend/gpu/mali_kbase_pm_internal.h"
@@ -140,6 +143,11 @@ kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev,
if (!kbdev || !info)
return -EINVAL;
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ info->l2_count = KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS;
+ info->core_mask = (1ull << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1;
+ info->prfcnt_values_per_block = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK;
+#else /* CONFIG_MALI_NO_MALI */
{
const struct base_gpu_props *props = &kbdev->gpu_props.props;
const size_t l2_count = props->l2_props.num_l2_slices;
@@ -151,6 +159,7 @@ kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev,
info->prfcnt_values_per_block =
KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK;
}
+#endif /* CONFIG_MALI_NO_MALI */
/* Determine the number of available clock domains. */
for (clk = 0; clk < BASE_MAX_NR_CLOCKS_REGULATORS; clk++) {
@@ -569,6 +578,11 @@ static int kbasep_hwcnt_backend_jm_dump_get(
struct kbase_hwcnt_backend_jm *backend_jm =
(struct kbase_hwcnt_backend_jm *)backend;
size_t clk;
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ struct kbase_device *kbdev;
+ unsigned long flags;
+ int errcode;
+#endif /* CONFIG_MALI_NO_MALI */
if (!backend_jm || !dst || !dst_enable_map ||
(backend_jm->info->metadata != dst->metadata) ||
@@ -582,15 +596,32 @@ static int kbasep_hwcnt_backend_jm_dump_get(
/* Dump sample to the internal 64-bit user buffer. */
kbasep_hwcnt_backend_jm_dump_sample(backend_jm);
+ /* Extract elapsed cycle count for each clock domain if enabled. */
kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) {
if (!kbase_hwcnt_clk_enable_map_enabled(
dst_enable_map->clk_enable_map, clk))
continue;
- /* Extract elapsed cycle count for each clock domain. */
- dst->clk_cnt_buf[clk] = backend_jm->cycle_count_elapsed[clk];
+ /* Reset the counter to zero if accumulation is off. */
+ if (!accumulate)
+ dst->clk_cnt_buf[clk] = 0;
+ dst->clk_cnt_buf[clk] += backend_jm->cycle_count_elapsed[clk];
}
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ kbdev = backend_jm->kctx->kbdev;
+
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+
+ /* Update the current configuration information. */
+ errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev,
+ &backend_jm->curr_config);
+
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+
+ if (errcode)
+ return errcode;
+#endif /* CONFIG_MALI_NO_MALI */
return kbase_hwcnt_jm_dump_get(dst, backend_jm->to_user_buf,
dst_enable_map, backend_jm->pm_core_mask,
&backend_jm->curr_config, accumulate);
@@ -700,6 +731,9 @@ static int kbasep_hwcnt_backend_jm_create(
int errcode;
struct kbase_device *kbdev;
struct kbase_hwcnt_backend_jm *backend = NULL;
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ size_t page_count;
+#endif
WARN_ON(!info);
WARN_ON(!out_backend);
@@ -739,6 +773,13 @@ static int kbasep_hwcnt_backend_jm_create(
kbase_ccswe_init(&backend->ccswe_shader_cores);
backend->rate_listener.notify = kbasep_hwcnt_backend_jm_on_freq_change;
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ /* The dummy model needs the CPU mapping. */
+ page_count = PFN_UP(info->dump_bytes);
+ gpu_model_set_dummy_prfcnt_base_cpu(backend->cpu_dump_va, kbdev,
+ backend->vmap->cpu_pages,
+ page_count);
+#endif /* CONFIG_MALI_NO_MALI */
*out_backend = backend;
return 0;
diff --git a/mali_kbase/mali_kbase_hwcnt_legacy.c b/mali_kbase/mali_kbase_hwcnt_legacy.c
deleted file mode 100644
index 5ca4c51..0000000
--- a/mali_kbase/mali_kbase_hwcnt_legacy.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
-/*
- *
- * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved.
- *
- * This program is free software and is provided to you under the terms of the
- * GNU General Public License version 2 as published by the Free Software
- * Foundation, and any use by you of this program is subject to the terms
- * of such GNU license.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- */
-
-#include "mali_kbase_hwcnt_legacy.h"
-#include "mali_kbase_hwcnt_virtualizer.h"
-#include "mali_kbase_hwcnt_types.h"
-#include "mali_kbase_hwcnt_gpu.h"
-#include "mali_kbase_hwcnt_gpu_narrow.h"
-#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h>
-
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-/**
- * struct kbase_hwcnt_legacy_client - Legacy hardware counter client.
- * @user_dump_buf: Pointer to a non-NULL user buffer, where dumps are returned.
- * @enable_map: Counter enable map.
- * @dump_buf: Dump buffer used to manipulate dumps from virtualizer.
- * @hvcli: Hardware counter virtualizer client.
- * @dump_buf_user: Narrow dump buffer used to manipulate dumps before they are
- * copied to user.
- * @metadata_user: For compatibility with the user driver interface, this
- * contains a narrowed version of the hardware counter metadata
- * which is limited to 64 entries per block and 32-bit for each
- * entry.
- */
-struct kbase_hwcnt_legacy_client {
- void __user *user_dump_buf;
- struct kbase_hwcnt_enable_map enable_map;
- struct kbase_hwcnt_dump_buffer dump_buf;
- struct kbase_hwcnt_virtualizer_client *hvcli;
- struct kbase_hwcnt_dump_buffer_narrow dump_buf_user;
- const struct kbase_hwcnt_metadata_narrow *metadata_user;
-};
-
-int kbase_hwcnt_legacy_client_create(
- struct kbase_hwcnt_virtualizer *hvirt,
- struct kbase_ioctl_hwcnt_enable *enable,
- struct kbase_hwcnt_legacy_client **out_hlcli)
-{
- int errcode;
- struct kbase_hwcnt_legacy_client *hlcli;
- const struct kbase_hwcnt_metadata *metadata;
- struct kbase_hwcnt_physical_enable_map phys_em;
-
- if (!hvirt || !enable || !enable->dump_buffer || !out_hlcli)
- return -EINVAL;
-
- metadata = kbase_hwcnt_virtualizer_metadata(hvirt);
-
- hlcli = kzalloc(sizeof(*hlcli), GFP_KERNEL);
- if (!hlcli)
- return -ENOMEM;
-
- errcode = kbase_hwcnt_gpu_metadata_narrow_create(&hlcli->metadata_user,
- metadata);
- if (errcode)
- goto error;
-
- errcode = kbase_hwcnt_dump_buffer_narrow_alloc(hlcli->metadata_user,
- &hlcli->dump_buf_user);
- if (errcode)
- goto error;
-
- hlcli->user_dump_buf = (void __user *)(uintptr_t)enable->dump_buffer;
-
- errcode = kbase_hwcnt_enable_map_alloc(metadata, &hlcli->enable_map);
- if (errcode)
- goto error;
-
- /* Translate from the ioctl enable map to the internal one */
- phys_em.fe_bm = enable->fe_bm;
- phys_em.shader_bm = enable->shader_bm;
- phys_em.tiler_bm = enable->tiler_bm;
- phys_em.mmu_l2_bm = enable->mmu_l2_bm;
- kbase_hwcnt_gpu_enable_map_from_physical(&hlcli->enable_map, &phys_em);
-
- errcode = kbase_hwcnt_dump_buffer_alloc(metadata, &hlcli->dump_buf);
- if (errcode)
- goto error;
-
- errcode = kbase_hwcnt_virtualizer_client_create(
- hvirt, &hlcli->enable_map, &hlcli->hvcli);
- if (errcode)
- goto error;
-
- *out_hlcli = hlcli;
- return 0;
-
-error:
- kbase_hwcnt_legacy_client_destroy(hlcli);
- return errcode;
-}
-
-void kbase_hwcnt_legacy_client_destroy(struct kbase_hwcnt_legacy_client *hlcli)
-{
- if (!hlcli)
- return;
-
- kbase_hwcnt_virtualizer_client_destroy(hlcli->hvcli);
- kbase_hwcnt_dump_buffer_free(&hlcli->dump_buf);
- kbase_hwcnt_enable_map_free(&hlcli->enable_map);
- kbase_hwcnt_dump_buffer_narrow_free(&hlcli->dump_buf_user);
- kbase_hwcnt_gpu_metadata_narrow_destroy(hlcli->metadata_user);
- kfree(hlcli);
-}
-
-int kbase_hwcnt_legacy_client_dump(struct kbase_hwcnt_legacy_client *hlcli)
-{
- int errcode;
- u64 ts_start_ns;
- u64 ts_end_ns;
-
- if (!hlcli)
- return -EINVAL;
-
- /* Dump into the kernel buffer */
- errcode = kbase_hwcnt_virtualizer_client_dump(hlcli->hvcli,
- &ts_start_ns, &ts_end_ns, &hlcli->dump_buf);
- if (errcode)
- return errcode;
-
- /* Patch the dump buf headers, to hide the counters that other hwcnt
- * clients are using.
- */
- kbase_hwcnt_gpu_patch_dump_headers(
- &hlcli->dump_buf, &hlcli->enable_map);
-
- /* Copy the dump buffer to the userspace visible buffer. The strict
- * variant will explicitly zero any non-enabled counters to ensure
- * nothing except exactly what the user asked for is made visible.
- *
- * A narrow copy is required since virtualizer has a bigger buffer
- * but user only needs part of it.
- */
- kbase_hwcnt_dump_buffer_copy_strict_narrow(
- &hlcli->dump_buf_user, &hlcli->dump_buf, &hlcli->enable_map);
-
- /* Copy into the user's buffer */
- errcode = copy_to_user(hlcli->user_dump_buf,
- hlcli->dump_buf_user.dump_buf,
- hlcli->dump_buf_user.md_narrow->dump_buf_bytes);
- /* Non-zero errcode implies user buf was invalid or too small */
- if (errcode)
- return -EFAULT;
-
- return 0;
-}
-
-int kbase_hwcnt_legacy_client_clear(struct kbase_hwcnt_legacy_client *hlcli)
-{
- u64 ts_start_ns;
- u64 ts_end_ns;
-
- if (!hlcli)
- return -EINVAL;
-
- /* Dump with a NULL buffer to clear this client's counters */
- return kbase_hwcnt_virtualizer_client_dump(hlcli->hvcli,
- &ts_start_ns, &ts_end_ns, NULL);
-}
diff --git a/mali_kbase/mali_kbase_hwcnt_legacy.h b/mali_kbase/mali_kbase_hwcnt_legacy.h
deleted file mode 100644
index 163ae8d..0000000
--- a/mali_kbase/mali_kbase_hwcnt_legacy.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- *
- * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved.
- *
- * This program is free software and is provided to you under the terms of the
- * GNU General Public License version 2 as published by the Free Software
- * Foundation, and any use by you of this program is subject to the terms
- * of such GNU license.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- */
-
-/*
- * Legacy hardware counter interface, giving userspace clients simple,
- * synchronous access to hardware counters.
- *
- * Any functions operating on an single legacy hardware counter client instance
- * must be externally synchronised.
- * Different clients may safely be used concurrently.
- */
-
-#ifndef _KBASE_HWCNT_LEGACY_H_
-#define _KBASE_HWCNT_LEGACY_H_
-
-struct kbase_hwcnt_legacy_client;
-struct kbase_ioctl_hwcnt_enable;
-struct kbase_hwcnt_virtualizer;
-
-/**
- * kbase_hwcnt_legacy_client_create() - Create a legacy hardware counter client.
- * @hvirt: Non-NULL pointer to hardware counter virtualizer the client
- * should be attached to.
- * @enable: Non-NULL pointer to hwcnt_enable structure, containing a valid
- * pointer to a user dump buffer large enough to hold a dump, and
- * the counters that should be enabled.
- * @out_hlcli: Non-NULL pointer to where the pointer to the created client will
- * be stored on success.
- *
- * Return: 0 on success, else error code.
- */
-int kbase_hwcnt_legacy_client_create(
- struct kbase_hwcnt_virtualizer *hvirt,
- struct kbase_ioctl_hwcnt_enable *enable,
- struct kbase_hwcnt_legacy_client **out_hlcli);
-
-/**
- * kbase_hwcnt_legacy_client_destroy() - Destroy a legacy hardware counter
- * client.
- * @hlcli: Pointer to the legacy hardware counter client.
- *
- * Will safely destroy a client in any partial state of construction.
- */
-void kbase_hwcnt_legacy_client_destroy(struct kbase_hwcnt_legacy_client *hlcli);
-
-/**
- * kbase_hwcnt_legacy_client_dump() - Perform a hardware counter dump into the
- * client's user buffer.
- * @hlcli: Non-NULL pointer to the legacy hardware counter client.
- *
- * This function will synchronously dump hardware counters into the user buffer
- * specified on client creation, with the counters specified on client creation.
- *
- * The counters are automatically cleared after each dump, such that the next
- * dump performed will return the counter values accumulated between the time of
- * this function call and the next dump.
- *
- * Return: 0 on success, else error code.
- */
-int kbase_hwcnt_legacy_client_dump(struct kbase_hwcnt_legacy_client *hlcli);
-
-/**
- * kbase_hwcnt_legacy_client_clear() - Perform and discard a hardware counter
- * dump.
- * @hlcli: Non-NULL pointer to the legacy hardware counter client.
- *
- * This function will synchronously clear the hardware counters, such that the
- * next dump performed will return the counter values accumulated between the
- * time of this function call and the next dump.
- *
- * Return: 0 on success, else error code.
- */
-int kbase_hwcnt_legacy_client_clear(struct kbase_hwcnt_legacy_client *hlcli);
-
-#endif /* _KBASE_HWCNT_LEGACY_H_ */
diff --git a/mali_kbase/mali_kbase_hwcnt_watchdog_if.h b/mali_kbase/mali_kbase_hwcnt_watchdog_if.h
new file mode 100644
index 0000000..1873318
--- /dev/null
+++ b/mali_kbase/mali_kbase_hwcnt_watchdog_if.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+/*
+ * Virtual interface for hardware counter watchdog.
+ */
+
+#ifndef _KBASE_HWCNT_WATCHDOG_IF_H_
+#define _KBASE_HWCNT_WATCHDOG_IF_H_
+
+#include <linux/types.h>
+
+/*
+ * Opaque structure of information used to create a watchdog timer interface.
+ */
+struct kbase_hwcnt_watchdog_info;
+
+/**
+ * typedef kbase_hwcnt_watchdog_callback_fn - Callback function called when the watchdog timer expires
+ *
+ * @user_data: Pointer to the callback user data.
+ */
+typedef void kbase_hwcnt_watchdog_callback_fn(void *user_data);
+
+/**
+ * typedef kbase_hwcnt_watchdog_enable_fn - Enable watchdog timer
+ *
+ * @timer: Non-NULL pointer to a watchdog timer interface context
+ * @period_ms: Period in milliseconds of the watchdog timer
+ * @callback: Non-NULL pointer to a watchdog callback function
+ * @user_data: Pointer to the user data, used when watchdog timer callback is called
+ *
+ * Return: 0 if the watchdog timer was enabled successfully, error code otherwise.
+ */
+typedef int kbase_hwcnt_watchdog_enable_fn(
+ const struct kbase_hwcnt_watchdog_info *timer, u32 period_ms,
+ kbase_hwcnt_watchdog_callback_fn *callback, void *user_data);
+
+/**
+ * typedef kbase_hwcnt_watchdog_disable_fn - Disable watchdog timer
+ *
+ * @timer: Non-NULL pointer to a watchdog timer interface context
+ */
+typedef void
+kbase_hwcnt_watchdog_disable_fn(const struct kbase_hwcnt_watchdog_info *timer);
+
+/**
+ * typedef kbase_hwcnt_watchdog_modify_fn - Modify watchdog timer's timeout
+ *
+ * @timer: Non-NULL pointer to a watchdog timer interface context
+ * @delay_ms: Watchdog timer expiration in milliseconds
+ */
+typedef void
+kbase_hwcnt_watchdog_modify_fn(const struct kbase_hwcnt_watchdog_info *timer,
+ u32 delay_ms);
+
+/**
+ * struct kbase_hwcnt_watchdog_interface - Hardware counter watchdog virtual interface.
+ *
+ * @timer: Immutable watchdog timer info
+ * @enable: Function ptr to enable watchdog
+ * @disable: Function ptr to disable watchdog
+ * @modify: Function ptr to modify watchdog
+ */
+struct kbase_hwcnt_watchdog_interface {
+ const struct kbase_hwcnt_watchdog_info *timer;
+ kbase_hwcnt_watchdog_enable_fn *enable;
+ kbase_hwcnt_watchdog_disable_fn *disable;
+ kbase_hwcnt_watchdog_modify_fn *modify;
+};
+
+#endif /* _KBASE_HWCNT_WATCHDOG_IF_H_ */
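The interface above is consumed only through its function pointers, so callers never touch the underlying timer implementation directly. A minimal usage sketch, assuming an illustrative callback, backend pointer and 1000 ms period (none of these values are defined by the interface itself):

#include "mali_kbase_hwcnt_watchdog_if.h"

/* Illustrative callback; user_data is whatever was passed to enable(). */
static void example_watchdog_cb(void *user_data)
{
        /* e.g. request a watchdog dump on behalf of user_data */
}

static int example_arm_watchdog(struct kbase_hwcnt_watchdog_interface *wd,
                                void *backend)
{
        /* Arm the timer with an assumed 1000 ms period. */
        int err = wd->enable(wd->timer, 1000, example_watchdog_cb, backend);

        if (err)
                return err;

        /* Push the next expiry back, e.g. after an explicit dump request. */
        wd->modify(wd->timer, 1000);

        /* Disarm; the timer implementation waits for any in-flight callback. */
        wd->disable(wd->timer);

        return 0;
}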
diff --git a/mali_kbase/mali_kbase_hwcnt_watchdog_if_timer.c b/mali_kbase/mali_kbase_hwcnt_watchdog_if_timer.c
new file mode 100644
index 0000000..4a03080
--- /dev/null
+++ b/mali_kbase/mali_kbase_hwcnt_watchdog_if_timer.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/*
+ *
+ * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#include "mali_kbase.h"
+#include "mali_kbase_hwcnt_watchdog_if.h"
+#include "mali_kbase_hwcnt_watchdog_if_timer.h"
+
+#include <linux/timer.h>
+#include <linux/slab.h>
+
+/**
+ * struct kbase_hwcnt_watchdog_if_timer_info - Timer information for watchdog
+ * interface.
+ *
+ * @watchdog_timer: Watchdog timer
+ * @timer_enabled: True if watchdog timer enabled, otherwise false
+ * @callback: Watchdog callback function
+ * @user_data: Pointer to user data passed as argument to the callback
+ * function
+ */
+struct kbase_hwcnt_watchdog_if_timer_info {
+ struct timer_list watchdog_timer;
+ bool timer_enabled;
+ kbase_hwcnt_watchdog_callback_fn *callback;
+ void *user_data;
+};
+
+/**
+ * kbasep_hwcnt_watchdog_callback() - Watchdog timer callback
+ *
+ * @timer: Timer structure
+ *
+ * Function to be called when watchdog timer expires. Will call the callback
+ * function provided at enable().
+ */
+static void kbasep_hwcnt_watchdog_callback(struct timer_list *const timer)
+{
+ struct kbase_hwcnt_watchdog_if_timer_info *const info =
+ container_of(timer, struct kbase_hwcnt_watchdog_if_timer_info,
+ watchdog_timer);
+ if (info->callback)
+ info->callback(info->user_data);
+}
+
+static int kbasep_hwcnt_watchdog_if_timer_enable(
+ const struct kbase_hwcnt_watchdog_info *const timer,
+ u32 const period_ms, kbase_hwcnt_watchdog_callback_fn *const callback,
+ void *const user_data)
+{
+ struct kbase_hwcnt_watchdog_if_timer_info *const timer_info =
+ (void *)timer;
+
+ if (WARN_ON(!timer) || WARN_ON(!callback))
+ return -EINVAL;
+
+ timer_info->callback = callback;
+ timer_info->user_data = user_data;
+
+ mod_timer(&timer_info->watchdog_timer,
+ jiffies + msecs_to_jiffies(period_ms));
+ timer_info->timer_enabled = true;
+
+ return 0;
+}
+
+static void kbasep_hwcnt_watchdog_if_timer_disable(
+ const struct kbase_hwcnt_watchdog_info *const timer)
+{
+ struct kbase_hwcnt_watchdog_if_timer_info *const timer_info =
+ (void *)timer;
+
+ if (WARN_ON(!timer))
+ return;
+
+ if (!timer_info->timer_enabled)
+ return;
+
+ del_timer_sync(&timer_info->watchdog_timer);
+ timer_info->timer_enabled = false;
+}
+
+static void kbasep_hwcnt_watchdog_if_timer_modify(
+ const struct kbase_hwcnt_watchdog_info *const timer, u32 const delay_ms)
+{
+ struct kbase_hwcnt_watchdog_if_timer_info *const timer_info =
+ (void *)timer;
+
+ if (WARN_ON(!timer))
+ return;
+
+ mod_timer(&timer_info->watchdog_timer,
+ jiffies + msecs_to_jiffies(delay_ms));
+}
+
+void kbase_hwcnt_watchdog_if_timer_destroy(
+ struct kbase_hwcnt_watchdog_interface *const watchdog_if)
+{
+ struct kbase_hwcnt_watchdog_if_timer_info *timer_info;
+
+ if (WARN_ON(!watchdog_if))
+ return;
+
+ timer_info = (void *)watchdog_if->timer;
+
+ if (WARN_ON(!timer_info))
+ return;
+
+ del_timer_sync(&timer_info->watchdog_timer);
+ kfree(timer_info);
+
+ memset(watchdog_if, 0, sizeof(*watchdog_if));
+}
+
+int kbase_hwcnt_watchdog_if_timer_create(
+ struct kbase_hwcnt_watchdog_interface *const watchdog_if)
+{
+ struct kbase_hwcnt_watchdog_if_timer_info *timer_info;
+
+ if (WARN_ON(!watchdog_if))
+ return -EINVAL;
+
+ timer_info = kmalloc(sizeof(*timer_info), GFP_KERNEL);
+ if (!timer_info)
+ return -ENOMEM;
+
+ *timer_info =
+ (struct kbase_hwcnt_watchdog_if_timer_info){ .timer_enabled =
+ false };
+
+ kbase_timer_setup(&timer_info->watchdog_timer,
+ kbasep_hwcnt_watchdog_callback);
+
+ *watchdog_if = (struct kbase_hwcnt_watchdog_interface){
+ .timer = (void *)timer_info,
+ .enable = kbasep_hwcnt_watchdog_if_timer_enable,
+ .disable = kbasep_hwcnt_watchdog_if_timer_disable,
+ .modify = kbasep_hwcnt_watchdog_if_timer_modify,
+ };
+
+ return 0;
+}
diff --git a/mali_kbase/mali_kbase_hwcnt_watchdog_if_timer.h b/mali_kbase/mali_kbase_hwcnt_watchdog_if_timer.h
new file mode 100644
index 0000000..3bd69c3
--- /dev/null
+++ b/mali_kbase/mali_kbase_hwcnt_watchdog_if_timer.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2021 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+/*
+ * Concrete implementation of kbase_hwcnt_watchdog_interface for HWC backend
+ */
+
+#ifndef _KBASE_HWCNT_WATCHDOG_IF_TIMER_H_
+#define _KBASE_HWCNT_WATCHDOG_IF_TIMER_H_
+
+struct kbase_hwcnt_watchdog_interface;
+
+/**
+ * kbase_hwcnt_watchdog_if_timer_create() - Create a watchdog interface for the hardware counter backend.
+ *
+ * @watchdog_if: Non-NULL pointer to watchdog interface that is filled in on creation success
+ *
+ * Return: 0 on success, error otherwise.
+ */
+int kbase_hwcnt_watchdog_if_timer_create(
+ struct kbase_hwcnt_watchdog_interface *watchdog_if);
+
+/**
+ * kbase_hwcnt_watchdog_if_timer_destroy() - Destroy a watchdog interface for the hardware
+ * counter backend.
+ *
+ * @watchdog_if: Pointer to watchdog interface to destroy
+ */
+void kbase_hwcnt_watchdog_if_timer_destroy(
+ struct kbase_hwcnt_watchdog_interface *watchdog_if);
+
+#endif /* _KBASE_HWCNT_WATCHDOG_IF_TIMER_H_ */
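Together with the updated kbase_hwcnt_backend_csf_create() signature earlier in this change, the expected wiring is roughly the sketch below (error handling trimmed; csf_if, ring_buf_cnt and iface are assumed to come from the usual backend setup):

#include "mali_kbase_hwcnt_backend_csf.h"
#include "mali_kbase_hwcnt_watchdog_if_timer.h"

static int example_csf_backend_setup(struct kbase_hwcnt_backend_csf_if *csf_if,
                                     u32 ring_buf_cnt,
                                     struct kbase_hwcnt_watchdog_interface *wd_if,
                                     struct kbase_hwcnt_backend_interface *iface)
{
        /* Create the concrete timer-backed watchdog first... */
        int err = kbase_hwcnt_watchdog_if_timer_create(wd_if);

        if (err)
                return err;

        /* ...then hand it to the CSF backend, which arms it in dump_enable()
         * and disarms it in dump_disable().
         */
        err = kbase_hwcnt_backend_csf_create(csf_if, ring_buf_cnt, wd_if, iface);
        if (err)
                kbase_hwcnt_watchdog_if_timer_destroy(wd_if);

        return err;
}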
diff --git a/mali_kbase/mali_kbase_jd.c b/mali_kbase/mali_kbase_jd.c
index c892455..08824bd 100644
--- a/mali_kbase/mali_kbase_jd.c
+++ b/mali_kbase/mali_kbase_jd.c
@@ -619,8 +619,8 @@ static void jd_update_jit_usage(struct kbase_jd_atom *katom)
else if (reg->flags & KBASE_REG_TILER_ALIGN_TOP)
size_to_read = sizeof(u64[COUNT]);
- ptr = kbase_vmap(kctx, reg->heap_info_gpu_addr, size_to_read,
- &mapping);
+ ptr = kbase_vmap_prot(kctx, reg->heap_info_gpu_addr, size_to_read,
+ KBASE_REG_CPU_RD, &mapping);
if (!ptr) {
dev_warn(kctx->kbdev->dev,
diff --git a/mali_kbase/mali_kbase_kinstr_prfcnt.c b/mali_kbase/mali_kbase_kinstr_prfcnt.c
index ce996ca..27ff3bb 100644
--- a/mali_kbase/mali_kbase_kinstr_prfcnt.c
+++ b/mali_kbase/mali_kbase_kinstr_prfcnt.c
@@ -19,10 +19,10 @@
*
*/
+#include "mali_kbase.h"
#include "mali_kbase_kinstr_prfcnt.h"
#include "mali_kbase_hwcnt_virtualizer.h"
#include "mali_kbase_hwcnt_types.h"
-#include <uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h>
#include "mali_kbase_hwcnt_gpu.h"
#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h>
#include "mali_malisw.h"
@@ -44,14 +44,12 @@
*/
#define DUMP_INTERVAL_MIN_NS (100 * NSEC_PER_USEC)
-/* The minimum allowed interval between dumps, in microseconds
- * (equivalent to 10KHz)
- */
-#define DUMP_INTERVAL_MIN_US (DUMP_INTERVAL_MIN_NS / 1000)
-
/* The maximum allowed buffers per client */
#define MAX_BUFFER_COUNT 32
+/* The module printing prefix */
+#define KINSTR_PRFCNT_PREFIX "mali_kbase_kinstr_prfcnt: "
+
/**
* struct kbase_kinstr_prfcnt_context - IOCTL interface for userspace hardware
* counters.
@@ -80,11 +78,11 @@ struct kbase_kinstr_prfcnt_context {
/**
* struct kbase_kinstr_prfcnt_sample - Buffer and descriptor for sample data.
- * @sample_meta: Pointer to samle metadata.
+ * @sample_meta: Pointer to sample metadata.
* @dump_buf: Dump buffer containing sample data.
*/
struct kbase_kinstr_prfcnt_sample {
- u64 *sample_meta;
+ struct prfcnt_metadata *sample_meta;
struct kbase_hwcnt_dump_buffer dump_buf;
};
@@ -92,7 +90,8 @@ struct kbase_kinstr_prfcnt_sample {
* struct kbase_kinstr_prfcnt_sample_array - Array of sample data.
* @page_addr: Address of allocated pages. A single allocation is used
* for all Dump Buffers in the array.
- * @page_order: The allocation order of the pages.
+ * @page_order: The allocation order of the pages, i.e. the base-2
+ * logarithm of the number of pages allocated.
* @sample_count: Number of allocated samples.
* @samples: Non-NULL pointer to the array of Dump Buffers.
*/
@@ -107,59 +106,91 @@ struct kbase_kinstr_prfcnt_sample_array {
* struct kbase_kinstr_prfcnt_client_config - Client session configuration.
* @prfcnt_mode: Sampling mode: either manual or periodic.
* @counter_set: Set of performance counter blocks.
+ * @scope: Scope of performance counters to capture.
* @buffer_count: Number of buffers used to store samples.
- * @period_us: Sampling period, in microseconds, or 0 if manual mode.
+ * @period_ns: Sampling period, in nanoseconds, or 0 if manual mode.
* @phys_em: Enable map used by the GPU.
*/
struct kbase_kinstr_prfcnt_client_config {
u8 prfcnt_mode;
u8 counter_set;
+ u8 scope;
u16 buffer_count;
- u64 period_us;
+ u64 period_ns;
struct kbase_hwcnt_physical_enable_map phys_em;
};
/**
+ * struct kbase_kinstr_prfcnt_async - Asynchronous sampling operation to
+ * carry out for a kinstr_prfcnt_client.
+ * @dump_work: Worker for performing asynchronous counter dumps.
+ * @user_data: User data for asynchronous dump in progress.
+ * @ts_end_ns: End timestamp of most recent async dump.
+ */
+struct kbase_kinstr_prfcnt_async {
+ struct work_struct dump_work;
+ u64 user_data;
+ u64 ts_end_ns;
+};
+
+/**
* struct kbase_kinstr_prfcnt_client - A kinstr_prfcnt client attached
* to a kinstr_prfcnt context.
- * @kinstr_ctx: kinstr_prfcnt context client is attached to.
- * @hvcli: Hardware counter virtualizer client.
- * @node: Node used to attach this client to list in kinstr_prfcnt
- * context.
- * @next_dump_time_ns: Time in ns when this client's next periodic dump must
- * occur. If 0, not a periodic client.
- * @dump_interval_ns: Interval between periodic dumps. If 0, not a periodic
- * client.
- * @config: Configuration of the client session.
- * @enable_map: Counters enable map.
- * @tmp_buf: Temporary buffer to use before handing over dump to
- * client.
- * @sample_arr: Array of dump buffers allocated by this client.
- * @dump_bufs_meta: Metadata of dump buffers.
- * @meta_idx: Index of metadata being accessed by userspace.
- * @read_idx: Index of buffer read by userspace.
- * @write_idx: Index of buffer being written by dump worker.
- * @waitq: Client's notification queue.
- * @sample_size: Size of the data required for one sample, in bytes.
- * @sample_count: Number of samples the client is able to capture.
+ * @kinstr_ctx: kinstr_prfcnt context client is attached to.
+ * @hvcli: Hardware counter virtualizer client.
+ * @node: Node used to attach this client to list in
+ * kinstr_prfcnt context.
+ * @cmd_sync_lock: Lock coordinating the reader interface for commands
+ * that need to interact with the async sample dump
+ * worker thread.
+ * @next_dump_time_ns: Time in ns when this client's next periodic dump must
+ * occur. If 0, not a periodic client.
+ * @dump_interval_ns: Interval between periodic dumps. If 0, not a periodic
+ * client.
+ * @sample_flags: Flags for the current active dumping sample, marking
+ * the conditions/events during the dump duration.
+ * @active: True if the client has been started.
+ * @config: Configuration of the client session.
+ * @enable_map: Counters enable map.
+ * @tmp_buf: Temporary buffer to use before handing over dump to
+ * client.
+ * @sample_arr: Array of dump buffers allocated by this client.
+ * @read_idx: Index of buffer read by userspace.
+ * @write_idx: Index of buffer being written by dump worker.
+ * @waitq: Client's notification queue.
+ * @sample_size: Size of the data required for one sample, in bytes.
+ * @sample_count: Number of samples the client is able to capture.
+ * @sync_sample_count: Number of available spaces for synchronous samples.
+ * It can differ from sample_count if asynchronous
+ * sample requests are reserving space in the buffer.
+ * @user_data: User data associated with the session.
+ * This is set when the session is started and stopped.
+ * This value is ignored for control commands that
+ * provide another value.
+ * @async: Asynchronous sampling operations to carry out in this
+ * client's session.
*/
struct kbase_kinstr_prfcnt_client {
struct kbase_kinstr_prfcnt_context *kinstr_ctx;
struct kbase_hwcnt_virtualizer_client *hvcli;
struct list_head node;
+ struct mutex cmd_sync_lock;
u64 next_dump_time_ns;
u32 dump_interval_ns;
+ u32 sample_flags;
+ bool active;
struct kbase_kinstr_prfcnt_client_config config;
struct kbase_hwcnt_enable_map enable_map;
struct kbase_hwcnt_dump_buffer tmp_buf;
struct kbase_kinstr_prfcnt_sample_array sample_arr;
- struct kbase_hwcnt_reader_metadata *dump_bufs_meta;
- atomic_t meta_idx;
atomic_t read_idx;
atomic_t write_idx;
wait_queue_head_t waitq;
size_t sample_size;
size_t sample_count;
+ atomic_t sync_sample_count;
+ u64 user_data;
+ struct kbase_kinstr_prfcnt_async async;
};
static struct prfcnt_enum_item kinstr_prfcnt_supported_requests[] = {
@@ -188,21 +219,6 @@ static struct prfcnt_enum_item kinstr_prfcnt_supported_requests[] = {
};
/**
- * kbasep_kinstr_prfcnt_hwcnt_reader_buffer_ready() - Check if client has ready
- * buffers.
- * @cli: Non-NULL pointer to kinstr_prfcnt client.
- *
- * Return: Non-zero if client has at least one dumping buffer filled that was
- * not notified to user yet.
- */
-static int kbasep_kinstr_prfcnt_hwcnt_reader_buffer_ready(
- struct kbase_kinstr_prfcnt_client *cli)
-{
- WARN_ON(!cli);
- return atomic_read(&cli->write_idx) != atomic_read(&cli->meta_idx);
-}
-
-/**
* kbasep_kinstr_prfcnt_hwcnt_reader_poll() - hwcnt reader's poll.
* @filp: Non-NULL pointer to file structure.
* @wait: Non-NULL pointer to poll table.
@@ -210,8 +226,15 @@ static int kbasep_kinstr_prfcnt_hwcnt_reader_buffer_ready(
* Return: POLLIN if data can be read without blocking, 0 if data can not be
* read without blocking, else error code.
*/
-static unsigned int kbasep_kinstr_prfcnt_hwcnt_reader_poll(struct file *filp,
- poll_table *wait)
+#if KERNEL_VERSION(4, 16, 0) >= LINUX_VERSION_CODE
+static unsigned int
+kbasep_kinstr_prfcnt_hwcnt_reader_poll(struct file *filp,
+ struct poll_table_struct *wait)
+#else
+static __poll_t
+kbasep_kinstr_prfcnt_hwcnt_reader_poll(struct file *filp,
+ struct poll_table_struct *wait)
+#endif
{
struct kbase_kinstr_prfcnt_client *cli;
@@ -225,13 +248,776 @@ static unsigned int kbasep_kinstr_prfcnt_hwcnt_reader_poll(struct file *filp,
poll_wait(filp, &cli->waitq, wait);
- if (kbasep_kinstr_prfcnt_hwcnt_reader_buffer_ready(cli))
+ if (atomic_read(&cli->write_idx) != atomic_read(&cli->read_idx))
return POLLIN;
return 0;
}
/**
+ * kbasep_kinstr_prfcnt_next_dump_time_ns() - Calculate the next periodic
+ * dump time.
+ * @cur_ts_ns: Current time in nanoseconds.
+ * @interval: Interval between dumps in nanoseconds.
+ *
+ * Return: 0 if interval is 0 (i.e. a non-periodic client), or the next dump
+ * time that occurs after cur_ts_ns.
+ */
+static u64 kbasep_kinstr_prfcnt_next_dump_time_ns(u64 cur_ts_ns, u32 interval)
+{
+ /* Non-periodic client */
+ if (interval == 0)
+ return 0;
+
+ /*
+ * Return the next interval after the current time relative to t=0.
+ * This means multiple clients with the same period will synchronize,
+ * regardless of when they were started, allowing the worker to be
+ * scheduled less frequently.
+ */
+ do_div(cur_ts_ns, interval);
+
+ return (cur_ts_ns + 1) * interval;
+}
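As a worked example of the t=0 alignment above: with interval = 100 ms, a client whose cur_ts_ns corresponds to 250 ms gets a next dump time of 300 ms, and a second client with the same period asking at 270 ms also gets 300 ms, so one worker wake-up serves both.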
+
+/**
+ * kbasep_kinstr_prfcnt_timestamp_ns() - Get the current time in nanoseconds.
+ *
+ * Return: Current time in nanoseconds.
+ */
+static u64 kbasep_kinstr_prfcnt_timestamp_ns(void)
+{
+ return ktime_get_raw_ns();
+}
+
+/**
+ * kbasep_kinstr_prfcnt_reschedule_worker() - Update next dump times for all
+ * periodic kinstr_prfcnt clients,
+ * then reschedule the dump worker
+ * appropriately.
+ * @kinstr_ctx: Non-NULL pointer to the kinstr_prfcnt context.
+ *
+ * If there are no periodic clients, then the dump worker will not be
+ * rescheduled. Else, the dump worker will be rescheduled for the next
+ * periodic client dump.
+ */
+static void kbasep_kinstr_prfcnt_reschedule_worker(
+ struct kbase_kinstr_prfcnt_context *kinstr_ctx)
+{
+ u64 cur_ts_ns;
+ u64 shortest_period_ns = U64_MAX;
+ struct kbase_kinstr_prfcnt_client *pos;
+
+ WARN_ON(!kinstr_ctx);
+ lockdep_assert_held(&kinstr_ctx->lock);
+ cur_ts_ns = kbasep_kinstr_prfcnt_timestamp_ns();
+
+ /*
+ * This loop fulfills 2 separate tasks that don't affect each other:
+ *
+ * 1) Determine the shortest period.
+ * 2) Update the next dump time of clients that have already been
+ * dumped. It's important not to alter the next dump time of clients
+ * that haven't been dumped yet.
+ *
+ * For the sake of efficiency, the rescheduling decision ignores the time
+ * of the next dump and just uses the shortest period among all periodic
+ * clients. It is more efficient to serve multiple dump requests at once,
+ * rather than trying to reschedule the worker to serve each request
+ * individually.
+ */
+ list_for_each_entry(pos, &kinstr_ctx->clients, node) {
+ /* Ignore clients that are not periodic or not active. */
+ if (pos->active && pos->dump_interval_ns > 0) {
+ shortest_period_ns =
+ MIN(shortest_period_ns, pos->dump_interval_ns);
+
+ /* Next dump should happen exactly one period after the last dump.
+ * If last dump was overdue and scheduled to happen more than one
+ * period ago, compensate for that by scheduling next dump in the
+ * immediate future.
+ */
+ if (pos->next_dump_time_ns < cur_ts_ns)
+ pos->next_dump_time_ns =
+ MAX(cur_ts_ns + 1,
+ pos->next_dump_time_ns +
+ pos->dump_interval_ns);
+ }
+ }
+
+ /* Cancel the timer if it is already pending */
+ hrtimer_cancel(&kinstr_ctx->dump_timer);
+
+ /* Start the timer if there are periodic clients and kinstr_prfcnt is not
+ * suspended.
+ */
+ if ((shortest_period_ns != U64_MAX) &&
+ (kinstr_ctx->suspend_count == 0)) {
+ u64 next_schedule_time_ns =
+ kbasep_kinstr_prfcnt_next_dump_time_ns(
+ cur_ts_ns, shortest_period_ns);
+ hrtimer_start(&kinstr_ctx->dump_timer,
+ ns_to_ktime(next_schedule_time_ns - cur_ts_ns),
+ HRTIMER_MODE_REL);
+ }
+}
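As a concrete illustration of the policy above: if two active clients have periods of 10 ms and 25 ms, shortest_period_ns is 10 ms, so the hrtimer fires on 10 ms boundaries and each wake-up serves whichever clients are due according to their own next_dump_time_ns, rather than arming a separate timer per client.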
+
+static enum prfcnt_block_type
+kbase_hwcnt_metadata_block_type_to_prfcnt_block_type(u64 type)
+{
+ enum prfcnt_block_type block_type;
+
+ switch (type) {
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE:
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2:
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3:
+ block_type = PRFCNT_BLOCK_TYPE_FE;
+ break;
+
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER:
+ block_type = PRFCNT_BLOCK_TYPE_TILER;
+ break;
+
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC:
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2:
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3:
+ block_type = PRFCNT_BLOCK_TYPE_SHADER_CORE;
+ break;
+
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS:
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2:
+ block_type = PRFCNT_BLOCK_TYPE_MEMORY;
+ break;
+
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_UNDEFINED:
+ default:
+ block_type = PRFCNT_BLOCK_TYPE_RESERVED;
+ break;
+ }
+
+ return block_type;
+}
+
+/**
+ * kbasep_kinstr_prfcnt_set_block_meta_items() - Populate a sample's block meta
+ * item array.
+ * @dst: Non-NULL pointer to the sample's dump buffer object.
+ * @block_meta_base: Non-NULL double pointer to the start of the block meta
+ * data items.
+ * @base_addr: Address of allocated pages for array of samples. Used
+ * to calculate offset of block values.
+ * @counter_set: The SET which blocks represent.
+ */
+int kbasep_kinstr_prfcnt_set_block_meta_items(struct kbase_hwcnt_dump_buffer *dst,
+ struct prfcnt_metadata **block_meta_base,
+ u64 base_addr, u8 counter_set)
+{
+ size_t grp, blk, blk_inst;
+ struct prfcnt_metadata **ptr_md = block_meta_base;
+ const struct kbase_hwcnt_metadata *metadata;
+
+ if (!dst || !*block_meta_base)
+ return -EINVAL;
+
+ metadata = dst->metadata;
+ kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
+ u64 *dst_blk;
+
+ /* Skip unused blocks */
+ if (!kbase_hwcnt_metadata_block_instance_avail(metadata, grp, blk, blk_inst))
+ continue;
+
+ dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
+ (*ptr_md)->hdr.item_type = PRFCNT_SAMPLE_META_TYPE_BLOCK;
+ (*ptr_md)->hdr.item_version = PRFCNT_READER_API_VERSION;
+ (*ptr_md)->u.block_md.block_type =
+ kbase_hwcnt_metadata_block_type_to_prfcnt_block_type(
+ kbase_hwcnt_metadata_block_type(metadata, grp,
+ blk));
+ (*ptr_md)->u.block_md.block_idx = (u8)blk_inst;
+ (*ptr_md)->u.block_md.set = counter_set;
+ (*ptr_md)->u.block_md.block_state = BLOCK_STATE_UNKNOWN;
+ (*ptr_md)->u.block_md.values_offset = (u32)((u64)(uintptr_t)dst_blk - base_addr);
+
+ /* Advance the metadata block pointer to the next item. */
+ (*ptr_md)++;
+ }
+
+ return 0;
+}
+
+/**
+ * kbasep_kinstr_prfcnt_set_sample_metadata() - Set sample metadata for sample
+ * output.
+ * @cli: Non-NULL pointer to a kinstr_prfcnt client.
+ * @dump_buf: Non-NULL pointer to dump buffer where sample is stored.
+ * @ptr_md: Non-NULL pointer to sample metadata.
+ */
+static void kbasep_kinstr_prfcnt_set_sample_metadata(
+ struct kbase_kinstr_prfcnt_client *cli,
+ struct kbase_hwcnt_dump_buffer *dump_buf,
+ struct prfcnt_metadata *ptr_md)
+{
+ u8 clk_cnt, i;
+
+ clk_cnt = cli->kinstr_ctx->metadata->clk_cnt;
+
+ /* PRFCNT_SAMPLE_META_TYPE_SAMPLE must be the first item */
+ ptr_md->hdr.item_type = PRFCNT_SAMPLE_META_TYPE_SAMPLE;
+ ptr_md->hdr.item_version = PRFCNT_READER_API_VERSION;
+ ptr_md->u.sample_md.seq = atomic_read(&cli->write_idx);
+ ptr_md->u.sample_md.flags = cli->sample_flags;
+
+ /* Place the PRFCNT_SAMPLE_META_TYPE_CLOCK optionally as the 2nd */
+ ptr_md++;
+ if (clk_cnt > MAX_REPORTED_DOMAINS)
+ clk_cnt = MAX_REPORTED_DOMAINS;
+
+ /* Handle the prfcnt_clock_metadata meta item */
+ ptr_md->hdr.item_type = PRFCNT_SAMPLE_META_TYPE_CLOCK;
+ ptr_md->hdr.item_version = PRFCNT_READER_API_VERSION;
+ ptr_md->u.clock_md.num_domains = clk_cnt;
+ for (i = 0; i < clk_cnt; i++)
+ ptr_md->u.clock_md.cycles[i] = dump_buf->clk_cnt_buf[i];
+
+ /* Dealing with counter blocks */
+ ptr_md++;
+ if (WARN_ON(kbasep_kinstr_prfcnt_set_block_meta_items(
+ dump_buf, &ptr_md, cli->sample_arr.page_addr, cli->config.counter_set)))
+ return;
+
+ /* Handle the last sentinel item */
+ ptr_md->hdr.item_type = FLEX_LIST_TYPE_NONE;
+ ptr_md->hdr.item_version = 0;
+}
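For reference, the flexible list produced for each sample is laid out as: one PRFCNT_SAMPLE_META_TYPE_SAMPLE item, one PRFCNT_SAMPLE_META_TYPE_CLOCK item carrying up to MAX_REPORTED_DOMAINS cycle counts, one PRFCNT_SAMPLE_META_TYPE_BLOCK item per available block instance, and a terminating FLEX_LIST_TYPE_NONE sentinel.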
+
+/**
+ * kbasep_kinstr_prfcnt_client_output_empty_sample() - Assemble an empty sample
+ * for output.
+ * @cli: Non-NULL pointer to a kinstr_prfcnt client.
+ * @buf_idx: The index to the sample array for saving the sample.
+ */
+static void kbasep_kinstr_prfcnt_client_output_empty_sample(
+ struct kbase_kinstr_prfcnt_client *cli, unsigned int buf_idx)
+{
+ struct kbase_hwcnt_dump_buffer *dump_buf;
+ struct prfcnt_metadata *ptr_md;
+
+ if (WARN_ON(buf_idx >= cli->sample_arr.sample_count))
+ return;
+
+ dump_buf = &cli->sample_arr.samples[buf_idx].dump_buf;
+ ptr_md = cli->sample_arr.samples[buf_idx].sample_meta;
+
+ kbase_hwcnt_dump_buffer_zero(dump_buf, &cli->enable_map);
+
+ /* Use end timestamp from most recent async dump */
+ ptr_md->u.sample_md.timestamp_start = cli->async.ts_end_ns;
+ ptr_md->u.sample_md.timestamp_end = cli->async.ts_end_ns;
+
+ kbasep_kinstr_prfcnt_set_sample_metadata(cli, dump_buf, ptr_md);
+}
+
+/**
+ * kbasep_kinstr_prfcnt_client_output_sample() - Assemble a sample for output.
+ * @cli: Non-NULL pointer to a kinstr_prfcnt client.
+ * @buf_idx: The index to the sample array for saving the sample.
+ * @user_data: User data to return to the user.
+ * @ts_start_ns: Time stamp for the start point of the sample dump.
+ * @ts_end_ns: Time stamp for the end point of the sample dump.
+ */
+static void kbasep_kinstr_prfcnt_client_output_sample(
+ struct kbase_kinstr_prfcnt_client *cli, unsigned int buf_idx,
+ u64 user_data, u64 ts_start_ns, u64 ts_end_ns)
+{
+ struct kbase_hwcnt_dump_buffer *dump_buf;
+ struct kbase_hwcnt_dump_buffer *tmp_buf = &cli->tmp_buf;
+ struct prfcnt_metadata *ptr_md;
+
+ if (WARN_ON(buf_idx >= cli->sample_arr.sample_count))
+ return;
+
+ dump_buf = &cli->sample_arr.samples[buf_idx].dump_buf;
+ ptr_md = cli->sample_arr.samples[buf_idx].sample_meta;
+
+ /* Patch the dump buf headers, to hide the counters that other hwcnt
+ * clients are using.
+ */
+ kbase_hwcnt_gpu_patch_dump_headers(tmp_buf, &cli->enable_map);
+
+ /* Copy the temp buffer to the userspace visible buffer. The strict
+ * variant will explicitly zero any non-enabled counters to ensure
+ * nothing except exactly what the user asked for is made visible.
+ */
+ kbase_hwcnt_dump_buffer_copy_strict(dump_buf, tmp_buf,
+ &cli->enable_map);
+
+ /* PRFCNT_SAMPLE_META_TYPE_SAMPLE must be the first item.
+ * Set timestamp and user data for real dump.
+ */
+ ptr_md->u.sample_md.timestamp_start = ts_start_ns;
+ ptr_md->u.sample_md.timestamp_end = ts_end_ns;
+ ptr_md->u.sample_md.user_data = user_data;
+
+ kbasep_kinstr_prfcnt_set_sample_metadata(cli, dump_buf, ptr_md);
+}
+
+/**
+ * kbasep_kinstr_prfcnt_client_dump() - Perform a dump for a client.
+ * @cli: Non-NULL pointer to a kinstr_prfcnt client.
+ * @event_id: Event type that triggered the dump.
+ * @user_data: User data to return to the user.
+ * @async_dump: Whether this is an asynchronous dump or not.
+ * @empty_sample: Sample block data will be 0 if this is true.
+ *
+ * Return: 0 on success, else error code.
+ */
+static int
+kbasep_kinstr_prfcnt_client_dump(struct kbase_kinstr_prfcnt_client *cli,
+ enum base_hwcnt_reader_event event_id,
+ u64 user_data, bool async_dump,
+ bool empty_sample)
+{
+ int ret;
+ u64 ts_start_ns = 0;
+ u64 ts_end_ns = 0;
+ unsigned int write_idx;
+ unsigned int read_idx;
+ size_t available_samples_count;
+
+ WARN_ON(!cli);
+ lockdep_assert_held(&cli->kinstr_ctx->lock);
+
+ write_idx = atomic_read(&cli->write_idx);
+ read_idx = atomic_read(&cli->read_idx);
+
+ /* Check whether there is space to copy the HWC blocks into. Work out
+ * the number of available samples, taking into account the type of
+ * dump.
+ * Unlike synchronous dumps, asynchronous dumps can reserve space in
+ * the samples array for future dumps. Because of that, the sample
+ * count seen by synchronous dumps is tracked in sync_sample_count,
+ * which starts equal to the size of the whole array, is decremented
+ * whenever an asynchronous dump request becomes pending, and is
+ * incremented again whenever an asynchronous dump request completes.
+ */
+ available_samples_count = async_dump ?
+ cli->sample_arr.sample_count :
+ atomic_read(&cli->sync_sample_count);
+ if (write_idx - read_idx == available_samples_count) {
+ /* For periodic sampling, the current active dump
+ * will be accumulated in the next sample, when
+ * a buffer becomes available.
+ */
+ if (event_id == BASE_HWCNT_READER_EVENT_PERIODIC)
+ cli->sample_flags |= SAMPLE_FLAG_OVERFLOW;
+ return -EBUSY;
+ }
+
+ /* For the rest of the function, use the actual sample_count
+ * that represents the real size of the array.
+ */
+ write_idx %= cli->sample_arr.sample_count;
+
+ if (!empty_sample) {
+ ret = kbase_hwcnt_virtualizer_client_dump(
+ cli->hvcli, &ts_start_ns, &ts_end_ns, &cli->tmp_buf);
+ /* HWC dump error, set the sample with error flag */
+ if (ret)
+ cli->sample_flags |= SAMPLE_FLAG_ERROR;
+
+ /* Make the sample ready and copy it to the userspace mapped buffer */
+ kbasep_kinstr_prfcnt_client_output_sample(
+ cli, write_idx, user_data, ts_start_ns, ts_end_ns);
+ } else {
+ if (!async_dump) {
+ struct prfcnt_metadata *ptr_md;
+ /* User data will not be updated for empty samples. */
+ ptr_md = cli->sample_arr.samples[write_idx].sample_meta;
+ ptr_md->u.sample_md.user_data = user_data;
+ }
+
+ /* Make the sample ready and copy it to the userspace mapped buffer */
+ kbasep_kinstr_prfcnt_client_output_empty_sample(cli, write_idx);
+ }
+
+ /* Notify client. Make sure all changes to memory are visible. */
+ wmb();
+ atomic_inc(&cli->write_idx);
+ if (async_dump) {
+ /* Remember the end timestamp of async dump for empty samples */
+ if (!empty_sample)
+ cli->async.ts_end_ns = ts_end_ns;
+
+ atomic_inc(&cli->sync_sample_count);
+ }
+ wake_up_interruptible(&cli->waitq);
+ /* Reset the flags for the next sample dump */
+ cli->sample_flags = 0;
+
+ return 0;
+}
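
The index arithmetic above relies on write_idx and read_idx being free-running counters; fullness is their difference, and the modulo is applied only when addressing a slot. A minimal illustrative sketch of that scheme (names are hypothetical, not part of the driver changes):

    /* Sketch of the free-running ring-buffer indices used by the dump path. */
    #include <stdbool.h>

    struct sample_ring {
    	unsigned int write_idx; /* incremented for every produced sample */
    	unsigned int read_idx;  /* incremented for every consumed sample */
    	unsigned int capacity;  /* cli->sample_arr.sample_count */
    };

    static bool sample_ring_full(const struct sample_ring *r)
    {
    	/* Holds across unsigned wrap-around as long as the difference
    	 * never exceeds the capacity.
    	 */
    	return r->write_idx - r->read_idx == r->capacity;
    }

    static unsigned int sample_ring_slot(const struct sample_ring *r)
    {
    	/* Only the slot lookup wraps; the counters themselves never do. */
    	return r->write_idx % r->capacity;
    }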
+
+static int
+kbasep_kinstr_prfcnt_client_start(struct kbase_kinstr_prfcnt_client *cli,
+ u64 user_data)
+{
+ int ret;
+ u64 tm_start, tm_end;
+
+ WARN_ON(!cli);
+ lockdep_assert_held(&cli->cmd_sync_lock);
+
+ /* If the client is already started, the command is a no-op */
+ if (cli->active)
+ return 0;
+
+ kbase_hwcnt_gpu_enable_map_from_physical(&cli->enable_map,
+ &cli->config.phys_em);
+
+ mutex_lock(&cli->kinstr_ctx->lock);
+ /* Enable HWC using the configuration provided at client creation */
+ ret = kbase_hwcnt_virtualizer_client_set_counters(
+ cli->hvcli, &cli->enable_map, &tm_start, &tm_end, NULL);
+
+ if (!ret) {
+ atomic_set(&cli->sync_sample_count, cli->sample_count);
+ cli->active = true;
+ cli->user_data = user_data;
+ cli->sample_flags = 0;
+
+ if (cli->dump_interval_ns)
+ kbasep_kinstr_prfcnt_reschedule_worker(cli->kinstr_ctx);
+ }
+
+ mutex_unlock(&cli->kinstr_ctx->lock);
+
+ return ret;
+}
+
+static int kbasep_kinstr_prfcnt_client_wait_async_done(
+ struct kbase_kinstr_prfcnt_client *cli)
+{
+ lockdep_assert_held(&cli->cmd_sync_lock);
+
+ return wait_event_interruptible(cli->waitq,
+ atomic_read(&cli->sync_sample_count) ==
+ cli->sample_count);
+}
+
+static int
+kbasep_kinstr_prfcnt_client_stop(struct kbase_kinstr_prfcnt_client *cli,
+ u64 user_data)
+{
+ int ret;
+ u64 tm_start = 0;
+ u64 tm_end = 0;
+ struct kbase_hwcnt_physical_enable_map phys_em;
+ struct kbase_hwcnt_dump_buffer *tmp_buf = NULL;
+ unsigned int write_idx;
+ unsigned int read_idx;
+
+ WARN_ON(!cli);
+ lockdep_assert_held(&cli->cmd_sync_lock);
+
+ /* If the client is not started, the command is invalid */
+ if (!cli->active)
+ return -EINVAL;
+
+ /* Wait until any pending async sample operation is done */
+ ret = kbasep_kinstr_prfcnt_client_wait_async_done(cli);
+
+ if (ret < 0)
+ return -ERESTARTSYS;
+
+ phys_em.fe_bm = 0;
+ phys_em.tiler_bm = 0;
+ phys_em.mmu_l2_bm = 0;
+ phys_em.shader_bm = 0;
+
+ kbase_hwcnt_gpu_enable_map_from_physical(&cli->enable_map, &phys_em);
+
+ mutex_lock(&cli->kinstr_ctx->lock);
+
+ /* Check whether there is a buffer to hold the last sample */
+ write_idx = atomic_read(&cli->write_idx);
+ read_idx = atomic_read(&cli->read_idx);
+
+ /* Check if there is a place to save the last stop produced sample */
+ if (write_idx - read_idx < cli->sample_arr.sample_count)
+ tmp_buf = &cli->tmp_buf;
+
+ ret = kbase_hwcnt_virtualizer_client_set_counters(cli->hvcli,
+ &cli->enable_map,
+ &tm_start, &tm_end,
+ &cli->tmp_buf);
+ /* If the last stop sample is in error, set the sample flag */
+ if (ret)
+ cli->sample_flags |= SAMPLE_FLAG_ERROR;
+
+ if (tmp_buf) {
+ write_idx %= cli->sample_arr.sample_count;
+ /* Handle the last stop sample */
+ kbase_hwcnt_gpu_enable_map_from_physical(&cli->enable_map,
+ &cli->config.phys_em);
+ /* As this is a stop sample, mark it as MANUAL */
+ kbasep_kinstr_prfcnt_client_output_sample(
+ cli, write_idx, user_data, tm_start, tm_end);
+ /* Notify client. Make sure all changes to memory are visible. */
+ wmb();
+ atomic_inc(&cli->write_idx);
+ wake_up_interruptible(&cli->waitq);
+ }
+
+ cli->active = false;
+ cli->user_data = user_data;
+
+ if (cli->dump_interval_ns)
+ kbasep_kinstr_prfcnt_reschedule_worker(cli->kinstr_ctx);
+
+ mutex_unlock(&cli->kinstr_ctx->lock);
+
+ return ret;
+}
+
+static int
+kbasep_kinstr_prfcnt_client_sync_dump(struct kbase_kinstr_prfcnt_client *cli,
+ u64 user_data)
+{
+ int ret;
+ bool empty_sample = false;
+
+ lockdep_assert_held(&cli->cmd_sync_lock);
+
+ /* If the client is not started or not manual, the command is invalid */
+ if (!cli->active || cli->dump_interval_ns)
+ return -EINVAL;
+
+ /* Wait until any pending async sample operation is done. This is
+ * required so that samples follow the order in which they were issued,
+ * as reflected by the sample start timestamp.
+ */
+ if (atomic_read(&cli->sync_sample_count) != cli->sample_count) {
+ /* Return an empty sample instead of performing a real dump,
+ * as the async dump currently in flight will contain the
+ * desired information.
+ */
+ empty_sample = true;
+ ret = kbasep_kinstr_prfcnt_client_wait_async_done(cli);
+
+ if (ret < 0)
+ return -ERESTARTSYS;
+ }
+
+ mutex_lock(&cli->kinstr_ctx->lock);
+
+ ret = kbasep_kinstr_prfcnt_client_dump(cli,
+ BASE_HWCNT_READER_EVENT_MANUAL,
+ user_data, false, empty_sample);
+
+ mutex_unlock(&cli->kinstr_ctx->lock);
+
+ return ret;
+}
+
+static int
+kbasep_kinstr_prfcnt_client_async_dump(struct kbase_kinstr_prfcnt_client *cli,
+ u64 user_data)
+{
+ unsigned int write_idx;
+ unsigned int read_idx;
+ unsigned int active_async_dumps;
+ unsigned int new_async_buf_idx;
+ int ret;
+
+ lockdep_assert_held(&cli->cmd_sync_lock);
+
+ /* If the client is not started or not manual, the command is invalid */
+ if (!cli->active || cli->dump_interval_ns)
+ return -EINVAL;
+
+ mutex_lock(&cli->kinstr_ctx->lock);
+
+ write_idx = atomic_read(&cli->write_idx);
+ read_idx = atomic_read(&cli->read_idx);
+ active_async_dumps =
+ cli->sample_count - atomic_read(&cli->sync_sample_count);
+ new_async_buf_idx = write_idx + active_async_dumps;
+
+ /* Check if there is a place to copy the HWC block into.
+ * If there is, reserve space in the buffer for the asynchronous
+ * operation to make sure that it can actually take place.
+ * Because space is reserved for asynchronous dumps, that reservation
+ * must be taken into consideration here.
+ */
+ ret = (new_async_buf_idx - read_idx == cli->sample_arr.sample_count) ?
+ -EBUSY :
+ 0;
+
+ if (ret == -EBUSY) {
+ mutex_unlock(&cli->kinstr_ctx->lock);
+ return ret;
+ }
+
+ if (active_async_dumps > 0) {
+ struct prfcnt_metadata *ptr_md;
+ unsigned int buf_idx =
+ new_async_buf_idx % cli->sample_arr.sample_count;
+ /* Instead of storing user_data, write it directly to the future
+ * empty sample.
+ */
+ ptr_md = cli->sample_arr.samples[buf_idx].sample_meta;
+ ptr_md->u.sample_md.user_data = user_data;
+
+ atomic_dec(&cli->sync_sample_count);
+ } else {
+ cli->async.user_data = user_data;
+ atomic_dec(&cli->sync_sample_count);
+
+ kbase_hwcnt_virtualizer_queue_work(cli->kinstr_ctx->hvirt,
+ &cli->async.dump_work);
+ }
+
+ mutex_unlock(&cli->kinstr_ctx->lock);
+
+ return ret;
+}
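
The reservation accounting can be summarised in a small, hedged sketch: sync_sample_count starts at the array size, and every pending asynchronous dump lowers it, which both reserves a slot and shrinks the budget seen by synchronous dumps. The helper below is illustrative only, not part of the driver changes:

    #include <errno.h>

    /* Illustrative reservation check mirroring the logic above;
     * sample_count corresponds to cli->sample_count and sync_sample_count
     * to the atomic counter of the same name.
     */
    static int reserve_async_slot(unsigned int write_idx, unsigned int read_idx,
    			      unsigned int sample_count,
    			      unsigned int *sync_sample_count)
    {
    	unsigned int active_async = sample_count - *sync_sample_count;
    	unsigned int new_async_idx = write_idx + active_async;

    	/* Same fullness test as the driver, but counting reserved slots. */
    	if (new_async_idx - read_idx == sample_count)
    		return -EBUSY;

    	/* Reserving a slot shrinks the budget seen by synchronous dumps. */
    	(*sync_sample_count)--;
    	return 0;
    }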
+
+static int
+kbasep_kinstr_prfcnt_client_discard(struct kbase_kinstr_prfcnt_client *cli)
+{
+ WARN_ON(!cli);
+ lockdep_assert_held(&cli->cmd_sync_lock);
+
+ mutex_lock(&cli->kinstr_ctx->lock);
+
+ /* Discard (Clear) all internally buffered samples */
+ atomic_set(&cli->read_idx, atomic_read(&cli->write_idx));
+
+ mutex_unlock(&cli->kinstr_ctx->lock);
+
+ return 0;
+}
+
+/**
+ * kbasep_kinstr_prfcnt_cmd() - Execute command for a client session.
+ * @cli: Non-NULL pointer to kinstr_prfcnt client.
+ * @control_cmd: Control command to execute.
+ *
+ * Return: 0 on success, else error code.
+ */
+static int kbasep_kinstr_prfcnt_cmd(struct kbase_kinstr_prfcnt_client *cli,
+ struct prfcnt_control_cmd *control_cmd)
+{
+ int ret = 0;
+
+ mutex_lock(&cli->cmd_sync_lock);
+
+ switch (control_cmd->cmd) {
+ case PRFCNT_CONTROL_CMD_START:
+ ret = kbasep_kinstr_prfcnt_client_start(cli,
+ control_cmd->user_data);
+ break;
+ case PRFCNT_CONTROL_CMD_STOP:
+ ret = kbasep_kinstr_prfcnt_client_stop(cli,
+ control_cmd->user_data);
+ break;
+ case PRFCNT_CONTROL_CMD_SAMPLE_SYNC:
+ ret = kbasep_kinstr_prfcnt_client_sync_dump(
+ cli, control_cmd->user_data);
+ break;
+ case PRFCNT_CONTROL_CMD_SAMPLE_ASYNC:
+ ret = kbasep_kinstr_prfcnt_client_async_dump(
+ cli, control_cmd->user_data);
+ break;
+ case PRFCNT_CONTROL_CMD_DISCARD:
+ ret = kbasep_kinstr_prfcnt_client_discard(cli);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&cli->cmd_sync_lock);
+
+ return ret;
+}
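
For context, a hypothetical user-space caller of the command ioctl handled above might look like the sketch below. The struct, field, and ioctl names follow the kernel side of this patch; the helper itself and anything else are assumptions for illustration.

    /* Hypothetical user-space helper; assumes the uapi header that defines
     * struct prfcnt_control_cmd and KBASE_IOCTL_KINSTR_PRFCNT_CMD is included.
     */
    #include <string.h>
    #include <sys/ioctl.h>

    static int prfcnt_start(int client_fd, unsigned long long user_data)
    {
    	struct prfcnt_control_cmd cmd;

    	memset(&cmd, 0, sizeof(cmd));
    	cmd.cmd = PRFCNT_CONTROL_CMD_START;
    	cmd.user_data = user_data;

    	/* 0 on success, otherwise -1 with errno set from the handler above. */
    	return ioctl(client_fd, KBASE_IOCTL_KINSTR_PRFCNT_CMD, &cmd);
    }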
+
+static int
+kbasep_kinstr_prfcnt_get_sample(struct kbase_kinstr_prfcnt_client *cli,
+ struct prfcnt_sample_access *sample_access)
+{
+ unsigned int write_idx;
+ unsigned int read_idx;
+ u64 sample_offset_bytes;
+ struct prfcnt_metadata *sample_meta;
+
+ write_idx = atomic_read(&cli->write_idx);
+ read_idx = atomic_read(&cli->read_idx);
+
+ if (write_idx == read_idx)
+ return -EINVAL;
+
+ read_idx %= cli->sample_arr.sample_count;
+ sample_offset_bytes =
+ (u64)(uintptr_t)cli->sample_arr.samples[read_idx].sample_meta -
+ (u64)(uintptr_t)cli->sample_arr.page_addr;
+ sample_meta =
+ (struct prfcnt_metadata *)cli->sample_arr.samples[read_idx]
+ .sample_meta;
+
+ /* Verify that a valid sample has been dumped at read_idx.
+ * There are situations where this may not be the case,
+ * for instance if the client is trying to get an asynchronous
+ * sample which has not been dumped yet.
+ */
+ if (sample_meta->hdr.item_type != PRFCNT_SAMPLE_META_TYPE_SAMPLE)
+ return -EINVAL;
+ if (sample_meta->hdr.item_version != PRFCNT_READER_API_VERSION)
+ return -EINVAL;
+
+ sample_access->sequence = sample_meta->u.sample_md.seq;
+ sample_access->sample_offset_bytes = sample_offset_bytes;
+
+ /* read_idx is not incremented here, because the interface allows
+ * only one sample to be "in flight" between kernel space and user space.
+ */
+
+ return 0;
+}
+
+static int
+kbasep_kinstr_prfcnt_put_sample(struct kbase_kinstr_prfcnt_client *cli,
+ struct prfcnt_sample_access *sample_access)
+{
+ unsigned int write_idx;
+ unsigned int read_idx;
+ u64 sample_offset_bytes;
+
+ write_idx = atomic_read(&cli->write_idx);
+ read_idx = atomic_read(&cli->read_idx);
+
+ if (write_idx == read_idx)
+ return -EINVAL;
+
+ if (sample_access->sequence != read_idx)
+ return -EINVAL;
+
+ read_idx %= cli->sample_arr.sample_count;
+ sample_offset_bytes =
+ (u64)(uintptr_t)cli->sample_arr.samples[read_idx].sample_meta -
+ (u64)(uintptr_t)cli->sample_arr.page_addr;
+
+ if (sample_access->sample_offset_bytes != sample_offset_bytes)
+ return -EINVAL;
+
+ atomic_inc(&cli->read_idx);
+
+ return 0;
+}
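
Taken together, the two handlers implement a one-sample-at-a-time handshake with user space. A hedged user-space sketch of that handshake (parse_sample() and mmap_base are placeholders, not real API):

    /* Hypothetical consume loop for the GET/PUT handshake above; mmap_base
     * is assumed to be the client's mapped sample array.
     */
    #include <stdint.h>
    #include <sys/ioctl.h>

    void parse_sample(const uint8_t *sample); /* placeholder */

    static int consume_one_sample(int client_fd, const uint8_t *mmap_base)
    {
    	struct prfcnt_sample_access access = { 0 };
    	int err;

    	err = ioctl(client_fd, KBASE_IOCTL_KINSTR_PRFCNT_GET_SAMPLE, &access);
    	if (err)
    		return err; /* e.g. no sample is currently available */

    	/* The sample's metadata starts at this offset inside the mapping. */
    	parse_sample(mmap_base + access.sample_offset_bytes);

    	/* Return the slot unchanged so the kernel can advance read_idx. */
    	return ioctl(client_fd, KBASE_IOCTL_KINSTR_PRFCNT_PUT_SAMPLE, &access);
    }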
+
+/**
* kbasep_kinstr_prfcnt_hwcnt_reader_ioctl() - hwcnt reader's ioctl.
* @filp: Non-NULL pointer to file structure.
* @cmd: User command.
@@ -243,10 +1029,11 @@ static long kbasep_kinstr_prfcnt_hwcnt_reader_ioctl(struct file *filp,
unsigned int cmd,
unsigned long arg)
{
- long rcode;
+ long rcode = 0;
struct kbase_kinstr_prfcnt_client *cli;
+ void __user *uarg = (void __user *)arg;
- if (!filp || (_IOC_TYPE(cmd) != KBASE_HWCNT_READER))
+ if (!filp)
return -EINVAL;
cli = filp->private_data;
@@ -255,8 +1042,36 @@ static long kbasep_kinstr_prfcnt_hwcnt_reader_ioctl(struct file *filp,
return -EINVAL;
switch (_IOC_NR(cmd)) {
+ case _IOC_NR(KBASE_IOCTL_KINSTR_PRFCNT_CMD): {
+ struct prfcnt_control_cmd control_cmd;
+ int err;
+
+ err = copy_from_user(&control_cmd, uarg, sizeof(control_cmd));
+ if (err)
+ return -EFAULT;
+ rcode = kbasep_kinstr_prfcnt_cmd(cli, &control_cmd);
+ } break;
+ case _IOC_NR(KBASE_IOCTL_KINSTR_PRFCNT_GET_SAMPLE): {
+ struct prfcnt_sample_access sample_access;
+ int err;
+
+ memset(&sample_access, 0, sizeof(sample_access));
+ rcode = kbasep_kinstr_prfcnt_get_sample(cli, &sample_access);
+ err = copy_to_user(uarg, &sample_access, sizeof(sample_access));
+ if (err)
+ return -EFAULT;
+ } break;
+ case _IOC_NR(KBASE_IOCTL_KINSTR_PRFCNT_PUT_SAMPLE): {
+ struct prfcnt_sample_access sample_access;
+ int err;
+
+ err = copy_from_user(&sample_access, uarg,
+ sizeof(sample_access));
+ if (err)
+ return -EFAULT;
+ rcode = kbasep_kinstr_prfcnt_put_sample(cli, &sample_access);
+ } break;
default:
- pr_warn("Unknown HWCNT ioctl 0x%x nr:%d", cmd, _IOC_NR(cmd));
rcode = -EINVAL;
break;
}
@@ -279,7 +1094,6 @@ static int kbasep_kinstr_prfcnt_hwcnt_reader_mmap(struct file *filp,
if (!filp || !vma)
return -EINVAL;
-
cli = filp->private_data;
if (!cli)
@@ -334,10 +1148,10 @@ kbasep_kinstr_prfcnt_client_destroy(struct kbase_kinstr_prfcnt_client *cli)
return;
kbase_hwcnt_virtualizer_client_destroy(cli->hvcli);
- kfree(cli->dump_bufs_meta);
kbasep_kinstr_prfcnt_sample_array_free(&cli->sample_arr);
kbase_hwcnt_dump_buffer_free(&cli->tmp_buf);
kbase_hwcnt_enable_map_free(&cli->enable_map);
+ mutex_destroy(&cli->cmd_sync_lock);
kfree(cli);
}
@@ -377,6 +1191,31 @@ static const struct file_operations kinstr_prfcnt_client_fops = {
.release = kbasep_kinstr_prfcnt_hwcnt_reader_release,
};
+size_t kbasep_kinstr_prfcnt_get_sample_md_count(const struct kbase_hwcnt_metadata *metadata)
+{
+ size_t grp, blk, blk_inst;
+ size_t md_count = 0;
+
+ if (!metadata)
+ return 0;
+
+ kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) {
+ /* Skip unused blocks */
+ if (!kbase_hwcnt_metadata_block_instance_avail(metadata, grp, blk, blk_inst))
+ continue;
+
+ md_count++;
+ }
+
+ /* Add counts for the clock metadata and the sample metadata items */
+ md_count += 2;
+
+ /* Reserve one for last sentinel item. */
+ md_count++;
+
+ return md_count;
+}
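
A worked example of the count above, for a hypothetical configuration:

    /* Hypothetical example: with 7 available block instances, each sample
     * carries 7 (block metadata) + 2 (sample and clock metadata) + 1
     * (sentinel) = 10 prfcnt_metadata items, so the metadata portion of a
     * sample is 10 * sizeof(struct prfcnt_metadata) bytes.
     */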
+
static size_t kbasep_kinstr_prfcnt_get_sample_size(
const struct kbase_hwcnt_metadata *metadata,
struct kbase_hwcnt_dump_buffer *dump_buf)
@@ -384,19 +1223,12 @@ static size_t kbasep_kinstr_prfcnt_get_sample_size(
size_t dump_buf_bytes;
size_t clk_cnt_buf_bytes;
size_t sample_meta_bytes;
- size_t block_count = 0;
- size_t grp, blk, blk_inst;
+ size_t md_count = kbasep_kinstr_prfcnt_get_sample_md_count(metadata);
if (!metadata)
return 0;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- block_count++;
-
- /* Reserve one for last sentinel item. */
- block_count++;
-
- sample_meta_bytes = sizeof(struct prfcnt_metadata) * block_count;
+ sample_meta_bytes = sizeof(struct prfcnt_metadata) * md_count;
dump_buf_bytes = metadata->dump_buf_bytes;
clk_cnt_buf_bytes = sizeof(*dump_buf->clk_cnt_buf) * metadata->clk_cnt;
@@ -411,7 +1243,68 @@ static size_t kbasep_kinstr_prfcnt_get_sample_size(
*/
static void kbasep_kinstr_prfcnt_dump_worker(struct work_struct *work)
{
- /* Do nothing. */
+ struct kbase_kinstr_prfcnt_context *kinstr_ctx = container_of(
+ work, struct kbase_kinstr_prfcnt_context, dump_work);
+ struct kbase_kinstr_prfcnt_client *pos;
+ u64 cur_time_ns;
+
+ mutex_lock(&kinstr_ctx->lock);
+
+ cur_time_ns = kbasep_kinstr_prfcnt_timestamp_ns();
+
+ list_for_each_entry(pos, &kinstr_ctx->clients, node) {
+ if (pos->active && (pos->next_dump_time_ns != 0) &&
+ (pos->next_dump_time_ns < cur_time_ns))
+ kbasep_kinstr_prfcnt_client_dump(
+ pos, BASE_HWCNT_READER_EVENT_PERIODIC,
+ pos->user_data, false, false);
+ }
+
+ kbasep_kinstr_prfcnt_reschedule_worker(kinstr_ctx);
+
+ mutex_unlock(&kinstr_ctx->lock);
+}
+
+/**
+ * kbasep_kinstr_prfcnt_async_dump_worker() - Dump worker for a manual client
+ * to take a single asynchronous
+ * sample.
+ * @work: Work structure.
+ */
+static void kbasep_kinstr_prfcnt_async_dump_worker(struct work_struct *work)
+{
+ struct kbase_kinstr_prfcnt_async *cli_async =
+ container_of(work, struct kbase_kinstr_prfcnt_async, dump_work);
+ struct kbase_kinstr_prfcnt_client *cli = container_of(
+ cli_async, struct kbase_kinstr_prfcnt_client, async);
+
+ mutex_lock(&cli->kinstr_ctx->lock);
+ /* While the async operation is in flight, a sync stop might have been
+ * executed, in which case the dump should be skipped. Further, as this
+ * is an async dump, a buffer is expected to have been reserved for it.
+ * This check avoids the rare corner case where the user side has
+ * issued a stop/start pair before the async work item gets the chance
+ * to execute.
+ */
+ if (cli->active &&
+ (atomic_read(&cli->sync_sample_count) < cli->sample_count))
+ kbasep_kinstr_prfcnt_client_dump(cli,
+ BASE_HWCNT_READER_EVENT_MANUAL,
+ cli->async.user_data, true,
+ false);
+
+ /* While the async operation is in flight, more async dump requests
+ * may have been submitted. In this case, no more async dump work
+ * will be queued. Instead, space will have been reserved for those
+ * dumps and an empty sample is returned for each of them after
+ * handling the current async dump.
+ */
+ while (cli->active &&
+ (atomic_read(&cli->sync_sample_count) < cli->sample_count)) {
+ kbasep_kinstr_prfcnt_client_dump(
+ cli, BASE_HWCNT_READER_EVENT_MANUAL, 0, true, true);
+ }
+ mutex_unlock(&cli->kinstr_ctx->lock);
}
/**
@@ -422,6 +1315,17 @@ static void kbasep_kinstr_prfcnt_dump_worker(struct work_struct *work)
static enum hrtimer_restart
kbasep_kinstr_prfcnt_dump_timer(struct hrtimer *timer)
{
+ struct kbase_kinstr_prfcnt_context *kinstr_ctx = container_of(
+ timer, struct kbase_kinstr_prfcnt_context, dump_timer);
+
+ /* We don't need to check kinstr_ctx->suspend_count here.
+ * Suspend and resume functions already ensure that the worker
+ * is cancelled when the driver is suspended, and resumed when
+ * the suspend_count reaches 0.
+ */
+ kbase_hwcnt_virtualizer_queue_work(kinstr_ctx->hvirt,
+ &kinstr_ctx->dump_work);
+
return HRTIMER_NORESTART;
}
@@ -555,20 +1459,14 @@ static int kbasep_kinstr_prfcnt_sample_array_alloc(
size_t dump_buf_bytes;
size_t clk_cnt_buf_bytes;
size_t sample_meta_bytes;
- size_t block_count = 0;
+ size_t md_count;
size_t sample_size;
- size_t grp, blk, blk_inst;
if (!metadata || !sample_arr)
return -EINVAL;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- block_count++;
-
- /* Reserve one for last sentinel item. */
- block_count++;
-
- sample_meta_bytes = sizeof(struct prfcnt_metadata) * block_count;
+ md_count = kbasep_kinstr_prfcnt_get_sample_md_count(metadata);
+ sample_meta_bytes = sizeof(struct prfcnt_metadata) * md_count;
dump_buf_bytes = metadata->dump_buf_bytes;
clk_cnt_buf_bytes =
sizeof(*samples->dump_buf.clk_cnt_buf) * metadata->clk_cnt;
@@ -602,7 +1500,8 @@ static int kbasep_kinstr_prfcnt_sample_array_alloc(
/* Internal layout in a sample buffer: [sample metadata, dump_buf, clk_cnt_buf]. */
samples[sample_idx].dump_buf.metadata = metadata;
samples[sample_idx].sample_meta =
- (u64 *)(uintptr_t)(addr + sample_meta_offset);
+ (struct prfcnt_metadata *)(uintptr_t)(
+ addr + sample_meta_offset);
samples[sample_idx].dump_buf.dump_buf =
(u64 *)(uintptr_t)(addr + dump_buf_offset);
samples[sample_idx].dump_buf.clk_cnt_buf =
@@ -724,6 +1623,31 @@ static int kbasep_kinstr_prfcnt_parse_request_enable(
}
/**
+ * kbasep_kinstr_prfcnt_parse_request_scope - Parse a scope request
+ * @req_scope: Performance counters scope request to parse.
+ * @config: Client object the session configuration should be written to.
+ *
+ * This function parses a performance counters scope request.
+ * There are only 2 acceptable outcomes: either the client leaves the scope
+ * as undefined, or all the scope requests are set to the same value.
+ *
+ * Return: 0 on success, else error code.
+ */
+static int kbasep_kinstr_prfcnt_parse_request_scope(
+ const struct prfcnt_request_scope *req_scope,
+ struct kbase_kinstr_prfcnt_client_config *config)
+{
+ int err = 0;
+
+ if (config->scope == PRFCNT_SCOPE_RESERVED)
+ config->scope = req_scope->scope;
+ else if (config->scope != req_scope->scope)
+ err = -EINVAL;
+
+ return err;
+}
+
+/**
* kbasep_kinstr_prfcnt_parse_setup - Parse session setup
* @kinstr_ctx: Pointer to the kinstr_prfcnt context.
* @setup: Session setup information to parse.
@@ -742,35 +1666,48 @@ static int kbasep_kinstr_prfcnt_parse_setup(
{
uint32_t i;
struct prfcnt_request_item *req_arr;
+ unsigned int item_count = setup->in.request_item_count;
+ unsigned long bytes;
int err = 0;
- if (!setup->in.requests_ptr || (setup->in.request_item_count == 0) ||
- (setup->in.request_item_size == 0)) {
+ /* Limit the request items to 2x the expected count: this accommodates
+ * moderate duplication while rejecting excessive abuse.
+ */
+ if (!setup->in.requests_ptr || (item_count < 2) ||
+ (setup->in.request_item_size == 0) ||
+ item_count > 2 * kinstr_ctx->info_item_count) {
return -EINVAL;
}
- req_arr =
- (struct prfcnt_request_item *)(uintptr_t)setup->in.requests_ptr;
+ bytes = item_count * sizeof(*req_arr);
+ req_arr = kmalloc(bytes, GFP_KERNEL);
+ if (!req_arr)
+ return -ENOMEM;
- if (req_arr[setup->in.request_item_count - 1].hdr.item_type !=
- FLEX_LIST_TYPE_NONE) {
- return -EINVAL;
+ if (copy_from_user(req_arr, u64_to_user_ptr(setup->in.requests_ptr),
+ bytes)) {
+ err = -EFAULT;
+ goto free_buf;
}
- if (req_arr[setup->in.request_item_count - 1].hdr.item_version != 0)
- return -EINVAL;
+ if (req_arr[item_count - 1].hdr.item_type != FLEX_LIST_TYPE_NONE ||
+ req_arr[item_count - 1].hdr.item_version != 0) {
+ err = -EINVAL;
+ goto free_buf;
+ }
/* The session configuration can only feature one value for some
- * properties (like capture mode and block counter set), but the client
- * may potential issue multiple requests and try to set more than one
- * value for those properties. While issuing multiple requests for the
+ * properties (like capture mode, block counter set and scope), but the
+ * client may potentially issue multiple requests and try to set more than
+ * one value for those properties. While issuing multiple requests for the
* same property is allowed by the protocol, asking for different values
* is illegal. Leaving these properties as undefined is illegal, too.
*/
config->prfcnt_mode = PRFCNT_MODE_RESERVED;
config->counter_set = KBASE_HWCNT_SET_UNDEFINED;
+ config->scope = PRFCNT_SCOPE_RESERVED;
- for (i = 0; i < setup->in.request_item_count - 1; i++) {
+ for (i = 0; i < item_count - 1; i++) {
if (req_arr[i].hdr.item_version > PRFCNT_READER_API_VERSION) {
err = -EINVAL;
break;
@@ -797,17 +1734,20 @@ static int kbasep_kinstr_prfcnt_parse_setup(
break;
if (config->prfcnt_mode == PRFCNT_MODE_PERIODIC) {
- config->period_us =
+ config->period_ns =
req_arr[i]
.u.req_mode.mode_config.periodic
- .period_us;
+ .period_ns;
- if ((config->period_us != 0) &&
- (config->period_us <
- DUMP_INTERVAL_MIN_US)) {
- config->period_us =
- DUMP_INTERVAL_MIN_US;
+ if ((config->period_ns != 0) &&
+ (config->period_ns <
+ DUMP_INTERVAL_MIN_NS)) {
+ config->period_ns =
+ DUMP_INTERVAL_MIN_NS;
}
+
+ if (config->period_ns == 0)
+ err = -EINVAL;
}
break;
@@ -816,6 +1756,11 @@ static int kbasep_kinstr_prfcnt_parse_setup(
&req_arr[i].u.req_enable, config);
break;
+ case PRFCNT_REQUEST_TYPE_SCOPE:
+ err = kbasep_kinstr_prfcnt_parse_request_scope(
+ &req_arr[i].u.req_scope, config);
+ break;
+
default:
err = -EINVAL;
break;
@@ -825,14 +1770,19 @@ static int kbasep_kinstr_prfcnt_parse_setup(
break;
}
- /* Verify that properties (like capture mode and block counter set)
- * have been defined by the user space client.
- */
- if (config->prfcnt_mode == PRFCNT_MODE_RESERVED)
- err = -EINVAL;
+free_buf:
+ kfree(req_arr);
- if (config->counter_set == KBASE_HWCNT_SET_UNDEFINED)
- err = -EINVAL;
+ if (!err) {
+ /* Verify that properties (like capture mode and block counter
+ * set) have been defined by the user space client.
+ */
+ if (config->prfcnt_mode == PRFCNT_MODE_RESERVED)
+ err = -EINVAL;
+
+ if (config->counter_set == KBASE_HWCNT_SET_UNDEFINED)
+ err = -EINVAL;
+ }
return err;
}
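
The setup path above follows a common pattern for pulling a bounded, variable-length array from user space: validate the count against an upper bound, allocate a kernel copy, copy_from_user(), then validate the sentinel item. A generic sketch of that pattern (illustrative helper, not part of the patch):

    #include <linux/errno.h>
    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include <linux/types.h>
    #include <linux/uaccess.h>

    /* Illustrative helper: bounded copy of a user-supplied item array. */
    static int copy_bounded_user_array(void **out, u64 user_ptr,
    				   size_t item_count, size_t item_size,
    				   size_t max_items)
    {
    	void *buf;

    	if (!user_ptr || !item_count || item_count > max_items)
    		return -EINVAL;

    	/* kmalloc_array() guards against item_count * item_size overflow. */
    	buf = kmalloc_array(item_count, item_size, GFP_KERNEL);
    	if (!buf)
    		return -ENOMEM;

    	if (copy_from_user(buf, u64_to_user_ptr(user_ptr),
    			   item_count * item_size)) {
    		kfree(buf);
    		return -EFAULT;
    	}

    	*out = buf;
    	return 0;
    }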
@@ -872,8 +1822,12 @@ static int kbasep_kinstr_prfcnt_client_create(
goto error;
cli->config.buffer_count = MAX_BUFFER_COUNT;
- cli->dump_interval_ns = cli->config.period_us * NSEC_PER_USEC;
+ cli->dump_interval_ns = cli->config.period_ns;
cli->next_dump_time_ns = 0;
+ cli->active = false;
+ atomic_set(&cli->write_idx, 0);
+ atomic_set(&cli->read_idx, 0);
+
err = kbase_hwcnt_enable_map_alloc(kinstr_ctx->metadata,
&cli->enable_map);
@@ -888,6 +1842,7 @@ static int kbasep_kinstr_prfcnt_client_create(
kbase_hwcnt_gpu_enable_map_from_physical(&cli->enable_map, &phys_em);
cli->sample_count = cli->config.buffer_count;
+ atomic_set(&cli->sync_sample_count, cli->sample_count);
cli->sample_size = kbasep_kinstr_prfcnt_get_sample_size(
kinstr_ctx->metadata, &cli->tmp_buf);
@@ -914,15 +1869,6 @@ static int kbasep_kinstr_prfcnt_client_create(
if (err < 0)
goto error;
- err = -ENOMEM;
-
- cli->dump_bufs_meta =
- kmalloc_array(cli->config.buffer_count,
- sizeof(*cli->dump_bufs_meta), GFP_KERNEL);
-
- if (!cli->dump_bufs_meta)
- goto error;
-
err = kbase_hwcnt_virtualizer_client_create(
kinstr_ctx->hvirt, &cli->enable_map, &cli->hvcli);
@@ -930,6 +1876,9 @@ static int kbasep_kinstr_prfcnt_client_create(
goto error;
init_waitqueue_head(&cli->waitq);
+ INIT_WORK(&cli->async.dump_work,
+ kbasep_kinstr_prfcnt_async_dump_worker);
+ mutex_init(&cli->cmd_sync_lock);
*out_vcli = cli;
return 0;
@@ -965,48 +1914,11 @@ static void kbasep_kinstr_prfcnt_get_request_info_list(
*arr_idx += ARRAY_SIZE(kinstr_prfcnt_supported_requests);
}
-static enum prfcnt_block_type
-kbase_hwcnt_metadata_block_type_to_prfcnt_block_type(u64 type)
-{
- enum prfcnt_block_type block_type;
-
- switch (type) {
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3:
- block_type = PRFCNT_BLOCK_TYPE_FE;
- break;
-
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER:
- block_type = PRFCNT_BLOCK_TYPE_TILER;
- break;
-
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3:
- block_type = PRFCNT_BLOCK_TYPE_SHADER_CORE;
- break;
-
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2:
- block_type = PRFCNT_BLOCK_TYPE_MEMORY;
- break;
-
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_UNDEFINED:
- default:
- block_type = PRFCNT_BLOCK_TYPE_RESERVED;
- break;
- }
-
- return block_type;
-}
-
-static int kbasep_kinstr_prfcnt_get_block_info_list(
- const struct kbase_hwcnt_metadata *metadata, size_t block_set,
- struct prfcnt_enum_item *item_arr, size_t *arr_idx)
+int kbasep_kinstr_prfcnt_get_block_info_list(const struct kbase_hwcnt_metadata *metadata,
+ size_t block_set, struct prfcnt_enum_item *item_arr,
+ size_t *arr_idx)
{
- size_t grp;
- size_t blk;
+ size_t grp, blk;
if (!metadata || !item_arr || !arr_idx)
return -EINVAL;
@@ -1015,19 +1927,30 @@ static int kbasep_kinstr_prfcnt_get_block_info_list(
for (blk = 0;
blk < kbase_hwcnt_metadata_block_count(metadata, grp);
blk++, (*arr_idx)++) {
+ size_t blk_inst;
+ size_t unused_blk_inst_count = 0;
+ size_t blk_inst_count =
+ kbase_hwcnt_metadata_block_instance_count(metadata, grp, blk);
+
item_arr[*arr_idx].hdr.item_type =
PRFCNT_ENUM_TYPE_BLOCK;
item_arr[*arr_idx].hdr.item_version =
PRFCNT_READER_API_VERSION;
item_arr[*arr_idx].u.block_counter.set = block_set;
-
item_arr[*arr_idx].u.block_counter.block_type =
kbase_hwcnt_metadata_block_type_to_prfcnt_block_type(
kbase_hwcnt_metadata_block_type(
metadata, grp, blk));
+
+ /* Count unused block instances to update the reported number of instances */
+ for (blk_inst = 0; blk_inst < blk_inst_count; blk_inst++) {
+ if (!kbase_hwcnt_metadata_block_instance_avail(metadata, grp, blk,
+ blk_inst))
+ unused_blk_inst_count++;
+ }
+
item_arr[*arr_idx].u.block_counter.num_instances =
- kbase_hwcnt_metadata_block_instance_count(
- metadata, grp, blk);
+ blk_inst_count - unused_blk_inst_count;
item_arr[*arr_idx].u.block_counter.num_values =
kbase_hwcnt_metadata_block_values_count(
metadata, grp, blk);
@@ -1086,8 +2009,11 @@ static int kbasep_kinstr_prfcnt_enum_info_list(
if (enum_info->info_item_count != kinstr_ctx->info_item_count)
return -EINVAL;
- prfcnt_item_arr =
- (struct prfcnt_enum_item *)(uintptr_t)enum_info->info_list_ptr;
+ prfcnt_item_arr = kcalloc(enum_info->info_item_count,
+ sizeof(*prfcnt_item_arr), GFP_KERNEL);
+ if (!prfcnt_item_arr)
+ return -ENOMEM;
+
kbasep_kinstr_prfcnt_get_request_info_list(kinstr_ctx, prfcnt_item_arr,
&arr_idx);
metadata = kbase_hwcnt_virtualizer_metadata(kinstr_ctx->hvirt);
@@ -1118,6 +2044,16 @@ static int kbasep_kinstr_prfcnt_enum_info_list(
FLEX_LIST_TYPE_NONE;
prfcnt_item_arr[enum_info->info_item_count - 1].hdr.item_version = 0;
+ if (!err) {
+ unsigned long bytes =
+ enum_info->info_item_count * sizeof(*prfcnt_item_arr);
+
+ if (copy_to_user(u64_to_user_ptr(enum_info->info_list_ptr),
+ prfcnt_item_arr, bytes))
+ err = -EFAULT;
+ }
+
+ kfree(prfcnt_item_arr);
return err;
}
diff --git a/mali_kbase/mali_kbase_kinstr_prfcnt.h b/mali_kbase/mali_kbase_kinstr_prfcnt.h
index 83d76be..c42408b 100644
--- a/mali_kbase/mali_kbase_kinstr_prfcnt.h
+++ b/mali_kbase/mali_kbase_kinstr_prfcnt.h
@@ -26,6 +26,8 @@
#ifndef _KBASE_KINSTR_PRFCNT_H_
#define _KBASE_KINSTR_PRFCNT_H_
+#include <uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h>
+
struct kbase_kinstr_prfcnt_context;
struct kbase_hwcnt_virtualizer;
struct kbase_ioctl_hwcnt_reader_setup;
@@ -76,6 +78,49 @@ void kbase_kinstr_prfcnt_suspend(struct kbase_kinstr_prfcnt_context *kinstr_ctx)
*/
void kbase_kinstr_prfcnt_resume(struct kbase_kinstr_prfcnt_context *kinstr_ctx);
+#if MALI_KERNEL_TEST_API
+/**
+ * kbasep_kinstr_prfcnt_get_block_info_list() - Get list of all block types
+ * with their information.
+ * @metadata: Non-NULL pointer to the hardware counter metadata.
+ * @block_set: Which SET the blocks will represent.
+ * @item_arr: Non-NULL pointer to array of enumeration items to populate.
+ * @arr_idx: Non-NULL pointer to index of array @item_arr.
+ *
+ * Populate list of counter blocks with information for enumeration.
+ *
+ * Return: 0 on success, else error code.
+ */
+int kbasep_kinstr_prfcnt_get_block_info_list(const struct kbase_hwcnt_metadata *metadata,
+ size_t block_set, struct prfcnt_enum_item *item_arr,
+ size_t *arr_idx);
+
+/**
+ * kbasep_kinstr_prfcnt_get_sample_md_count() - Get count of sample
+ * metadata items.
+ * @metadata: Non-NULL pointer to the hardware counter metadata.
+ *
+ * Return: Number of metadata items for available blocks in each sample.
+ */
+size_t kbasep_kinstr_prfcnt_get_sample_md_count(const struct kbase_hwcnt_metadata *metadata);
+
+/**
+ * kbasep_kinstr_prfcnt_set_block_meta_items() - Populate a sample's block meta
+ * item array.
+ * @dst: Non-NULL pointer to the sample's dump buffer object.
+ * @block_meta_base: Non-NULL double pointer to the start of the block meta
+ * data items.
+ * @base_addr: Address of allocated pages for array of samples. Used
+ * to calculate offset of block values.
+ * @counter_set: The SET which blocks represent.
+ *
+ * Return: 0 on success, else error code.
+ */
+int kbasep_kinstr_prfcnt_set_block_meta_items(struct kbase_hwcnt_dump_buffer *dst,
+ struct prfcnt_metadata **block_meta_base,
+ u64 base_addr, u8 counter_set);
+#endif /* MALI_KERNEL_TEST_API */
+
/**
* kbase_kinstr_prfcnt_enum_info - Enumerate performance counter information.
* @kinstr_ctx: Non-NULL pointer to the kinstr_prfcnt context.
diff --git a/mali_kbase/mali_kbase_mem.c b/mali_kbase/mali_kbase_mem.c
index 320ffef..de854f3 100644
--- a/mali_kbase/mali_kbase_mem.c
+++ b/mali_kbase/mali_kbase_mem.c
@@ -4468,8 +4468,8 @@ void kbase_trace_jit_report_gpu_mem_trace_enabled(struct kbase_context *kctx,
addr_start = reg->heap_info_gpu_addr - jit_report_gpu_mem_offset;
- ptr = kbase_vmap(kctx, addr_start, KBASE_JIT_REPORT_GPU_MEM_SIZE,
- &mapping);
+ ptr = kbase_vmap_prot(kctx, addr_start, KBASE_JIT_REPORT_GPU_MEM_SIZE,
+ KBASE_REG_CPU_RD, &mapping);
if (!ptr) {
dev_warn(kctx->kbdev->dev,
"%s: JIT start=0x%llx unable to map memory near end pointer %llx\n",
diff --git a/mali_kbase/mali_kbase_mem.h b/mali_kbase/mali_kbase_mem.h
index 95533f5..9cb4088 100644
--- a/mali_kbase/mali_kbase_mem.h
+++ b/mali_kbase/mali_kbase_mem.h
@@ -287,6 +287,8 @@ static inline struct kbase_mem_phy_alloc *kbase_mem_phy_alloc_put(struct kbase_m
* that triggered incremental rendering by growing too much.
* @rbtree: Backlink to the red-black tree of memory regions.
* @start_pfn: The Page Frame Number in GPU virtual address space.
+ * @user_data: The address of GPU command queue when VA region represents
+ * a ring buffer.
* @nr_pages: The size of the region in pages.
* @initial_commit: Initial commit, for aligning the start address and
* correctly growing KBASE_REG_TILER_ALIGN_TOP regions.
@@ -324,6 +326,7 @@ struct kbase_va_region {
struct list_head link;
struct rb_root *rbtree;
u64 start_pfn;
+ void *user_data;
size_t nr_pages;
size_t initial_commit;
size_t threshold_pages;
@@ -476,6 +479,7 @@ struct kbase_va_region {
struct list_head jit_node;
u16 jit_usage_id;
u8 jit_bin_id;
+
#if MALI_JIT_PRESSURE_LIMIT_BASE
/* Pointer to an object in GPU memory defining an end of an allocated
* region
diff --git a/mali_kbase/mali_kbase_mem_linux.c b/mali_kbase/mali_kbase_mem_linux.c
index 527bec4..d252373 100644
--- a/mali_kbase/mali_kbase_mem_linux.c
+++ b/mali_kbase/mali_kbase_mem_linux.c
@@ -1029,7 +1029,7 @@ int kbase_mem_do_sync_imported(struct kbase_context *kctx,
struct kbase_va_region *reg, enum kbase_sync_type sync_fn)
{
int ret = -EINVAL;
- struct dma_buf *dma_buf;
+ struct dma_buf __maybe_unused *dma_buf;
enum dma_data_direction dir = DMA_BIDIRECTIONAL;
lockdep_assert_held(&kctx->reg_lock);
@@ -3214,8 +3214,12 @@ static unsigned long get_queue_doorbell_pfn(struct kbase_device *kbdev,
* assigned one, otherwise a dummy page. Always return the
* dummy page in no mali builds.
*/
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ return PFN_DOWN(as_phys_addr_t(kbdev->csf.dummy_db_page));
+#else
if (queue->doorbell_nr == KBASEP_USER_DB_NR_INVALID)
return PFN_DOWN(as_phys_addr_t(kbdev->csf.dummy_db_page));
+#endif
return (PFN_DOWN(kbdev->reg_start + CSF_HW_DOORBELL_PAGE_OFFSET +
(u64)queue->doorbell_nr * CSF_HW_DOORBELL_PAGE_SIZE));
}
@@ -3461,8 +3465,12 @@ static vm_fault_t kbase_csf_user_reg_vm_fault(struct vm_fault *vmf)
/* Don't map in the actual register page if GPU is powered down.
* Always map in the dummy page in no mali builds.
*/
+#if IS_ENABLED(CONFIG_MALI_NO_MALI)
+ pfn = PFN_DOWN(as_phys_addr_t(kbdev->csf.dummy_user_reg_page));
+#else
if (!kbdev->pm.backend.gpu_powered)
pfn = PFN_DOWN(as_phys_addr_t(kbdev->csf.dummy_user_reg_page));
+#endif
ret = mgm_dev->ops.mgm_vmf_insert_pfn_prot(mgm_dev,
KBASE_MEM_GROUP_CSF_FW, vma,
diff --git a/mali_kbase/mali_kbase_pm.c b/mali_kbase/mali_kbase_pm.c
index 4078da1..af154d5 100644
--- a/mali_kbase/mali_kbase_pm.c
+++ b/mali_kbase/mali_kbase_pm.c
@@ -144,7 +144,7 @@ void kbase_pm_context_idle(struct kbase_device *kbdev)
KBASE_EXPORT_TEST_API(kbase_pm_context_idle);
-void kbase_pm_driver_suspend(struct kbase_device *kbdev)
+int kbase_pm_driver_suspend(struct kbase_device *kbdev)
{
KBASE_DEBUG_ASSERT(kbdev);
@@ -162,7 +162,7 @@ void kbase_pm_driver_suspend(struct kbase_device *kbdev)
mutex_lock(&kbdev->pm.lock);
if (WARN_ON(kbase_pm_is_suspending(kbdev))) {
mutex_unlock(&kbdev->pm.lock);
- return;
+ return 0;
}
kbdev->pm.suspending = true;
mutex_unlock(&kbdev->pm.lock);
@@ -193,7 +193,12 @@ void kbase_pm_driver_suspend(struct kbase_device *kbdev)
*/
kbasep_js_suspend(kbdev);
#else
- kbase_csf_scheduler_pm_suspend(kbdev);
+ if (kbase_csf_scheduler_pm_suspend(kbdev)) {
+ mutex_lock(&kbdev->pm.lock);
+ kbdev->pm.suspending = false;
+ mutex_unlock(&kbdev->pm.lock);
+ return -1;
+ }
#endif
/* Wait for the active count to reach zero. This is not the same as
@@ -209,7 +214,12 @@ void kbase_pm_driver_suspend(struct kbase_device *kbdev)
/* NOTE: We synchronize with anything that was just finishing a
* kbase_pm_context_idle() call by locking the pm.lock below
*/
- kbase_hwaccess_pm_suspend(kbdev);
+ if (kbase_hwaccess_pm_suspend(kbdev)) {
+ mutex_lock(&kbdev->pm.lock);
+ kbdev->pm.suspending = false;
+ mutex_unlock(&kbdev->pm.lock);
+ return -1;
+ }
#ifdef CONFIG_MALI_ARBITER_SUPPORT
if (kbdev->arb.arb_if) {
@@ -218,6 +228,8 @@ void kbase_pm_driver_suspend(struct kbase_device *kbdev)
mutex_unlock(&kbdev->pm.arb_vm_state->vm_state_lock);
}
#endif /* CONFIG_MALI_ARBITER_SUPPORT */
+
+ return 0;
}
void kbase_pm_driver_resume(struct kbase_device *kbdev, bool arb_gpu_start)
@@ -273,16 +285,19 @@ void kbase_pm_driver_resume(struct kbase_device *kbdev, bool arb_gpu_start)
kbase_kinstr_prfcnt_resume(kbdev->kinstr_prfcnt_ctx);
}
-void kbase_pm_suspend(struct kbase_device *kbdev)
+int kbase_pm_suspend(struct kbase_device *kbdev)
{
+ int result = 0;
#ifdef CONFIG_MALI_ARBITER_SUPPORT
if (kbdev->arb.arb_if)
kbase_arbiter_pm_vm_event(kbdev, KBASE_VM_OS_SUSPEND_EVENT);
else
- kbase_pm_driver_suspend(kbdev);
+ result = kbase_pm_driver_suspend(kbdev);
#else
- kbase_pm_driver_suspend(kbdev);
+ result = kbase_pm_driver_suspend(kbdev);
#endif /* CONFIG_MALI_ARBITER_SUPPORT */
+
+ return result;
}
void kbase_pm_resume(struct kbase_device *kbdev)
diff --git a/mali_kbase/mali_kbase_pm.h b/mali_kbase/mali_kbase_pm.h
index 980a8d1..730feea 100644
--- a/mali_kbase/mali_kbase_pm.h
+++ b/mali_kbase/mali_kbase_pm.h
@@ -165,8 +165,10 @@ void kbase_pm_context_idle(struct kbase_device *kbdev);
* @note the mechanisms used here rely on all user-space threads being frozen
* by the OS before we suspend. Otherwise, an IOCTL could occur that powers up
* the GPU e.g. via atom submission.
+ *
+ * Return: 0 on success.
*/
-void kbase_pm_suspend(struct kbase_device *kbdev);
+int kbase_pm_suspend(struct kbase_device *kbdev);
/**
* Resume the GPU, allow register accesses to it, and resume running atoms on
@@ -207,8 +209,10 @@ void kbase_pm_vsync_callback(int buffer_updated, void *data);
* @note the mechanisms used here rely on all user-space threads being frozen
* by the OS before we suspend. Otherwise, an IOCTL could occur that powers up
* the GPU e.g. via atom submission.
+ *
+ * Return: 0 on success.
*/
-void kbase_pm_driver_suspend(struct kbase_device *kbdev);
+int kbase_pm_driver_suspend(struct kbase_device *kbdev);
/**
* kbase_pm_driver_resume() - Put GPU and driver in resume
diff --git a/mali_kbase/mali_kbase_regs_history_debugfs.h b/mali_kbase/mali_kbase_regs_history_debugfs.h
index 26decb4..1b4196d 100644
--- a/mali_kbase/mali_kbase_regs_history_debugfs.h
+++ b/mali_kbase/mali_kbase_regs_history_debugfs.h
@@ -69,7 +69,7 @@ void kbase_io_history_dump(struct kbase_device *kbdev);
*/
void kbasep_regs_history_debugfs_init(struct kbase_device *kbdev);
-#else /* defined(CONFIG_DEBUG_FS) && !IS_ENABLED(CONFIG_MALI_NO_MALI) */
+#else /* !defined(CONFIG_DEBUG_FS) || IS_ENABLED(CONFIG_MALI_NO_MALI) */
#define kbase_io_history_init(...) ((int)0)
diff --git a/mali_kbase/mali_kbase_softjobs.c b/mali_kbase/mali_kbase_softjobs.c
index bee3513..df34854 100644
--- a/mali_kbase/mali_kbase_softjobs.c
+++ b/mali_kbase/mali_kbase_softjobs.c
@@ -95,7 +95,8 @@ static int kbasep_read_soft_event_status(
unsigned char *mapped_evt;
struct kbase_vmap_struct map;
- mapped_evt = kbase_vmap(kctx, evt, sizeof(*mapped_evt), &map);
+ mapped_evt = kbase_vmap_prot(kctx, evt, sizeof(*mapped_evt),
+ KBASE_REG_CPU_RD, &map);
if (!mapped_evt)
return -EFAULT;
@@ -116,7 +117,8 @@ static int kbasep_write_soft_event_status(
(new_status != BASE_JD_SOFT_EVENT_RESET))
return -EINVAL;
- mapped_evt = kbase_vmap(kctx, evt, sizeof(*mapped_evt), &map);
+ mapped_evt = kbase_vmap_prot(kctx, evt, sizeof(*mapped_evt),
+ KBASE_REG_CPU_WR, &map);
if (!mapped_evt)
return -EFAULT;
@@ -1203,8 +1205,8 @@ static int kbase_jit_allocate_process(struct kbase_jd_atom *katom)
* Write the address of the JIT allocation to the user provided
* GPU allocation.
*/
- ptr = kbase_vmap(kctx, info->gpu_alloc_addr, sizeof(*ptr),
- &mapping);
+ ptr = kbase_vmap_prot(kctx, info->gpu_alloc_addr, sizeof(*ptr),
+ KBASE_REG_CPU_WR, &mapping);
if (!ptr) {
/*
* Leave the allocations "live" as the JIT free atom
@@ -1649,7 +1651,12 @@ int kbase_prepare_soft_job(struct kbase_jd_atom *katom)
if (copy_to_user((__user void *)(uintptr_t)katom->jc,
&fence, sizeof(fence)) != 0) {
kbase_sync_fence_out_remove(katom);
- kbase_sync_fence_close_fd(fd);
+ /* fd should have been closed here, but there's
+ * no good way of doing that. Since
+ * copy_to_user() very rarely fails, and the fd
+ * will get closed on process termination, this
+ * won't be a problem.
+ */
fence.basep.fd = -EINVAL;
return -EINVAL;
}
diff --git a/mali_kbase/mali_kbase_sync.h b/mali_kbase/mali_kbase_sync.h
index ad05cdf..11cb8b9 100644
--- a/mali_kbase/mali_kbase_sync.h
+++ b/mali_kbase/mali_kbase_sync.h
@@ -157,21 +157,6 @@ void kbase_sync_fence_in_remove(struct kbase_jd_atom *katom);
void kbase_sync_fence_out_remove(struct kbase_jd_atom *katom);
#endif /* !MALI_USE_CSF */
-/**
- * kbase_sync_fence_close_fd() - Close a file descriptor representing a fence
- * @fd: File descriptor to close
- */
-static inline void kbase_sync_fence_close_fd(int fd)
-{
-#if KERNEL_VERSION(5, 11, 0) <= LINUX_VERSION_CODE
- close_fd(fd);
-#elif KERNEL_VERSION(4, 17, 0) <= LINUX_VERSION_CODE
- ksys_close(fd);
-#else
- sys_close(fd);
-#endif
-}
-
#if !MALI_USE_CSF
/**
* kbase_sync_fence_in_info_get() - Retrieves information about input fence
diff --git a/mali_kbase/mali_malisw.h b/mali_kbase/mali_malisw.h
index 3ddfcd9..92c8d31 100644
--- a/mali_kbase/mali_malisw.h
+++ b/mali_kbase/mali_malisw.h
@@ -98,7 +98,14 @@
/* LINUX_VERSION_CODE < 5.4 */
#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE)
-#define fallthrough CSTD_NOP(...) /* fallthrough */
-#endif
+#if defined(GCC_VERSION) && GCC_VERSION >= 70000
+#ifndef __fallthrough
+#define __fallthrough __attribute__((fallthrough))
+#endif /* __fallthrough */
+#define fallthrough __fallthrough
+#else
+#define fallthrough CSTD_NOP(...) /* fallthrough */
+#endif /* GCC_VERSION >= 70000 */
+#endif /* KERNEL_VERSION(5, 4, 0) */
#endif /* _MALISW_H_ */
diff --git a/mali_kbase/mmu/mali_kbase_mmu.c b/mali_kbase/mmu/mali_kbase_mmu.c
index 5f6cc7a..a450d38 100644
--- a/mali_kbase/mmu/mali_kbase_mmu.c
+++ b/mali_kbase/mmu/mali_kbase_mmu.c
@@ -41,10 +41,91 @@
#include <mmu/mali_kbase_mmu_internal.h>
#include <mali_kbase_cs_experimental.h>
#include <device/mali_kbase_device.h>
+#include <uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h>
+#if !MALI_USE_CSF
+#include <mali_kbase_hwaccess_jm.h>
+#endif
#include <mali_kbase_trace_gpu_mem.h>
/**
+ * mmu_flush_cache_on_gpu_ctrl() - Check if cache flush needs to be done
+ * through GPU_CONTROL interface
+ * @kbdev: kbase device to check GPU model ID on.
+ *
+ * This function returns whether a cache flush for page table update should
+ * run through GPU_CONTROL interface or MMU_AS_CONTROL interface.
+ *
+ * Return: True if cache flush should be done on GPU command.
+ */
+static bool mmu_flush_cache_on_gpu_ctrl(struct kbase_device *kbdev)
+{
+ uint32_t const arch_maj_cur = (kbdev->gpu_props.props.raw_props.gpu_id &
+ GPU_ID2_ARCH_MAJOR) >>
+ GPU_ID2_ARCH_MAJOR_SHIFT;
+
+ return arch_maj_cur > 11;
+}
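
Concretely, for a GPU whose ID encodes an architecture major version of, say, 12 (a hypothetical value used only for illustration):

    /* Example: arch_maj_cur =
     *   (gpu_id & GPU_ID2_ARCH_MAJOR) >> GPU_ID2_ARCH_MAJOR_SHIFT = 12
     * and 12 > 11, so cache maintenance for page table updates is routed
     * through the GPU_CONTROL FLUSH_CACHES commands instead of MMU_AS_CONTROL.
     */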
+
+/**
+ * mmu_flush_invalidate_on_gpu_ctrl() - Flush and invalidate the GPU caches
+ * through GPU_CONTROL interface.
+ * @kbdev: kbase device to issue the MMU operation on.
+ * @as: address space to issue the MMU operation on.
+ * @op_param: parameters for the operation.
+ *
+ * This wrapper function translates AS_COMMAND_FLUSH_PT and AS_COMMAND_FLUSH_MEM
+ * into the equivalent GPU_CONTROL command FLUSH_CACHES.
+ * The function first issues LOCK to the MMU-AS with kbase_mmu_hw_do_operation(),
+ * then issues the cache flush with kbase_gpu_cache_flush_and_busy_wait(), and
+ * finally issues UNLOCK to the MMU-AS with kbase_mmu_hw_do_operation().
+ *
+ * Return: Zero if the operation was successful, non-zero otherwise.
+ */
+static int
+mmu_flush_invalidate_on_gpu_ctrl(struct kbase_device *kbdev,
+ struct kbase_as *as,
+ struct kbase_mmu_hw_op_param *op_param)
+{
+ u32 flush_op;
+ int ret, ret2;
+
+ if (WARN_ON(kbdev == NULL) ||
+ WARN_ON(as == NULL) ||
+ WARN_ON(op_param == NULL))
+ return -EINVAL;
+
+ lockdep_assert_held(&kbdev->hwaccess_lock);
+ lockdep_assert_held(&kbdev->mmu_hw_mutex);
+
+ /* Translate operation to command */
+ if (op_param->op == KBASE_MMU_OP_FLUSH_PT) {
+ flush_op = GPU_COMMAND_CACHE_CLN_INV_L2;
+ } else if (op_param->op == KBASE_MMU_OP_FLUSH_MEM) {
+ flush_op = GPU_COMMAND_CACHE_CLN_INV_L2_LSC;
+ } else {
+ dev_warn(kbdev->dev, "Invalid flush request (op = %d)\n",
+ op_param->op);
+ return -EINVAL;
+ }
+
+ /* 1. Issue MMU_AS_CONTROL.COMMAND.LOCK operation. */
+ op_param->op = KBASE_MMU_OP_LOCK;
+ ret = kbase_mmu_hw_do_operation(kbdev, as, op_param);
+ if (ret)
+ return ret;
+
+ /* 2. Issue GPU_CONTROL.COMMAND.FLUSH_CACHES operation */
+ ret = kbase_gpu_cache_flush_and_busy_wait(kbdev, flush_op);
+
+ /* 3. Issue MMU_AS_CONTROL.COMMAND.UNLOCK operation. */
+ op_param->op = KBASE_MMU_OP_UNLOCK;
+ ret2 = kbase_mmu_hw_do_operation(kbdev, as, op_param);
+
+ return ret ?: ret2;
+}
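
A brief clarifying note on the combined return value above, phrased as a comment for consistency with the surrounding code:

    /* "return ret ?: ret2;" is the GCC conditional-operator shorthand for
     * "return ret ? ret : ret2;": report the LOCK/cache-flush error if one
     * occurred, otherwise the result of the UNLOCK operation.
     */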
+
+/**
* kbase_mmu_flush_invalidate() - Flush and invalidate the GPU caches.
* @kctx: The KBase context.
* @vpfn: The virtual page frame number to start the flush on.
@@ -244,7 +325,11 @@ static void kbase_gpu_mmu_handle_write_faulting_as(struct kbase_device *kbdev,
.kctx_id = kctx_id,
.mmu_sync_info = mmu_sync_info,
};
- kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param);
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev)) {
+ mmu_flush_invalidate_on_gpu_ctrl(kbdev, faulting_as, &op_param);
+ } else {
+ kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param);
+ }
mutex_unlock(&kbdev->mmu_hw_mutex);
@@ -934,7 +1019,13 @@ page_fault_retry:
.kctx_id = kctx->id,
.mmu_sync_info = mmu_sync_info,
};
- kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param);
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev)) {
+ mmu_flush_invalidate_on_gpu_ctrl(kbdev, faulting_as,
+ &op_param);
+ } else {
+ kbase_mmu_hw_do_operation(kbdev, faulting_as,
+ &op_param);
+ }
mutex_unlock(&kbdev->mmu_hw_mutex);
/* AS transaction end */
@@ -1046,11 +1137,7 @@ static phys_addr_t kbase_mmu_alloc_pgd(struct kbase_device *kbdev,
int i;
struct page *p;
-#ifdef CONFIG_MALI_2MB_ALLOC
- p = kbase_mem_pool_alloc(&kbdev->mem_pools.large[mmut->group_id]);
-#else /* CONFIG_MALI_2MB_ALLOC */
p = kbase_mem_pool_alloc(&kbdev->mem_pools.small[mmut->group_id]);
-#endif /* CONFIG_MALI_2MB_ALLOC */
if (!p)
return 0;
@@ -1087,12 +1174,7 @@ static phys_addr_t kbase_mmu_alloc_pgd(struct kbase_device *kbdev,
return page_to_phys(p);
alloc_free:
-
-#ifdef CONFIG_MALI_2MB_ALLOC
- kbase_mem_pool_free(&kbdev->mem_pools.large[mmut->group_id], p, false);
-#else /* CONFIG_MALI_2MB_ALLOC */
kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], p, false);
-#endif /* CONFIG_MALI_2MB_ALLOC */
return 0;
}
@@ -1341,11 +1423,7 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn,
*/
mutex_unlock(&kctx->mmu.mmu_lock);
err = kbase_mem_pool_grow(
-#ifdef CONFIG_MALI_2MB_ALLOC
- &kbdev->mem_pools.large[
-#else
&kbdev->mem_pools.small[
-#endif
kctx->mmu.group_id],
MIDGARD_MMU_BOTTOMLEVEL);
mutex_lock(&kctx->mmu.mmu_lock);
@@ -1433,11 +1511,7 @@ static void kbase_mmu_free_pgd(struct kbase_device *kbdev,
p = pfn_to_page(PFN_DOWN(pgd));
-#ifdef CONFIG_MALI_2MB_ALLOC
- kbase_mem_pool_free(&kbdev->mem_pools.large[mmut->group_id],
-#else
kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id],
-#endif
p, dirty);
atomic_sub(1, &kbdev->memdev.used_pages);
@@ -1523,11 +1597,7 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev,
*/
mutex_unlock(&mmut->mmu_lock);
err = kbase_mem_pool_grow(
-#ifdef CONFIG_MALI_2MB_ALLOC
- &kbdev->mem_pools.large[mmut->group_id],
-#else
&kbdev->mem_pools.small[mmut->group_id],
-#endif
cur_level);
mutex_lock(&mmut->mmu_lock);
} while (!err);
@@ -1681,8 +1751,15 @@ static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx,
.kctx_id = kctx->id,
.mmu_sync_info = mmu_sync_info,
};
- err = kbase_mmu_hw_do_operation(kbdev, &kbdev->as[kctx->as_nr],
- &op_param);
+
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev)) {
+ err = mmu_flush_invalidate_on_gpu_ctrl(
+ kbdev, &kbdev->as[kctx->as_nr], &op_param);
+ } else {
+ err = kbase_mmu_hw_do_operation(kbdev, &kbdev->as[kctx->as_nr],
+ &op_param);
+ }
+
if (err) {
/* Flush failed to complete, assume the
* GPU has hung and perform a reset to recover
@@ -1744,7 +1821,13 @@ kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, struct kbase_as *as,
else
op_param.op = KBASE_MMU_OP_FLUSH_PT;
- err = kbase_mmu_hw_do_operation(kbdev, as, &op_param);
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev)) {
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ err = mmu_flush_invalidate_on_gpu_ctrl(kbdev, as, &op_param);
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ } else {
+ err = kbase_mmu_hw_do_operation(kbdev, as, &op_param);
+ }
if (err) {
/* Flush failed to complete, assume the GPU has hung and
@@ -1850,6 +1933,15 @@ void kbase_mmu_disable(struct kbase_context *kctx)
kbase_mmu_flush_invalidate_noretain(kctx, 0, ~0);
kctx->kbdev->mmu_mode->disable_as(kctx->kbdev, kctx->as_nr);
+#if !MALI_USE_CSF
+ /*
+ * JM GPUs have some L1 read-only caches that need to be invalidated
+ * with START_FLUSH configuration. Purge the MMU disabled kctx from
+ * the slot_rb tracking field so such invalidation is performed when
+ * a new katom is executed on the affected slots.
+ */
+ kbase_backend_slot_kctx_purge_locked(kctx->kbdev, kctx);
+#endif
}
KBASE_EXPORT_TEST_API(kbase_mmu_disable);
@@ -2271,11 +2363,7 @@ int kbase_mmu_init(struct kbase_device *const kbdev,
int err;
err = kbase_mem_pool_grow(
-#ifdef CONFIG_MALI_2MB_ALLOC
- &kbdev->mem_pools.large[mmut->group_id],
-#else
&kbdev->mem_pools.small[mmut->group_id],
-#endif
MIDGARD_MMU_BOTTOMLEVEL);
if (err) {
kbase_mmu_term(kbdev, mmut);
diff --git a/mali_kbase/mmu/mali_kbase_mmu.h b/mali_kbase/mmu/mali_kbase_mmu.h
index 45a628c..fe721fc 100644
--- a/mali_kbase/mmu/mali_kbase_mmu.h
+++ b/mali_kbase/mmu/mali_kbase_mmu.h
@@ -30,8 +30,9 @@ struct kbase_context;
struct kbase_mmu_table;
/**
- * MMU-synchronous caller info. A pointer to this type is passed down from the outer-most callers
- * in the kbase module - where the information resides as to the synchronous / asynchronous
+ * enum kbase_caller_mmu_sync_info - MMU-synchronous caller info.
+ * A pointer to this type is passed down from the outer-most callers in the kbase
+ * module - where the information resides as to the synchronous / asynchronous
* nature of the call flow, with respect to MMU operations. ie - does the call flow relate to
* existing GPU work does it come from requests (like ioctl) from user-space, power management,
* etc.
diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw.h b/mali_kbase/mmu/mali_kbase_mmu_hw.h
index 7c0e95e..7cdf426 100644
--- a/mali_kbase/mmu/mali_kbase_mmu_hw.h
+++ b/mali_kbase/mmu/mali_kbase_mmu_hw.h
@@ -78,7 +78,7 @@ enum kbase_mmu_op_type {
* struct kbase_mmu_hw_op_param - parameters for kbase_mmu_hw_do_operation()
* @vpfn: MMU Virtual Page Frame Number to start the operation on.
* @nr: Number of pages to work on.
- * @type: Operation type (written to ASn_COMMAND).
+ * @op: Operation type (written to ASn_COMMAND).
* @kctx_id: Kernel context ID for MMU command tracepoint
* @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops.
*/
diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c
index 6306946..0ebc1bc 100644
--- a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c
+++ b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c
@@ -128,23 +128,19 @@ static int wait_ready(struct kbase_device *kbdev,
unsigned int as_nr)
{
unsigned int max_loops = KBASE_AS_INACTIVE_MAX_LOOPS;
- u32 val = kbase_reg_read(kbdev, MMU_AS_REG(as_nr, AS_STATUS));
- /* Wait for the MMU status to indicate there is no active command, in
- * case one is pending. Do not log remaining register accesses.
- */
- while (--max_loops && (val & AS_STATUS_AS_ACTIVE))
- val = kbase_reg_read(kbdev, MMU_AS_REG(as_nr, AS_STATUS));
+ /* Wait for the MMU status to indicate there is no active command. */
+ while (--max_loops &&
+ kbase_reg_read(kbdev, MMU_AS_REG(as_nr, AS_STATUS)) &
+ AS_STATUS_AS_ACTIVE) {
+ ;
+ }
if (max_loops == 0) {
dev_err(kbdev->dev, "AS_ACTIVE bit stuck, might be caused by slow/unstable GPU clock or possible faulty FPGA connector\n");
return -1;
}
- /* If waiting in loop was performed, log last read value. */
- if (KBASE_AS_INACTIVE_MAX_LOOPS - 1 > max_loops)
- kbase_reg_read(kbdev, MMU_AS_REG(as_nr, AS_STATUS));
-
return 0;
}
@@ -216,6 +212,11 @@ int kbase_mmu_hw_do_operation(struct kbase_device *kbdev, struct kbase_as *as,
int ret;
u64 lock_addr = 0x0;
+ if (WARN_ON(kbdev == NULL) ||
+ WARN_ON(as == NULL) ||
+ WARN_ON(op_param == NULL))
+ return -EINVAL;
+
lockdep_assert_held(&kbdev->mmu_hw_mutex);
if (op_param->op == KBASE_MMU_OP_UNLOCK) {
diff --git a/mali_kbase/tests/include/kutf/kutf_utils.h b/mali_kbase/tests/include/kutf/kutf_utils.h
index 18dcc3d..5f6d769 100644
--- a/mali_kbase/tests/include/kutf/kutf_utils.h
+++ b/mali_kbase/tests/include/kutf/kutf_utils.h
@@ -54,6 +54,7 @@
* Return: Returns pointer to allocated string, or NULL on error.
*/
const char *kutf_dsprintf(struct kutf_mempool *pool,
- const char *fmt, ...);
+ const char *fmt, ...) __printf(2, 3);
+
#endif /* _KERNEL_UTF_UTILS_H_ */
diff --git a/mali_kbase/tl/backend/mali_kbase_timeline_csf.c b/mali_kbase/tl/backend/mali_kbase_timeline_csf.c
index c101563..567c5f1 100644
--- a/mali_kbase/tl/backend/mali_kbase_timeline_csf.c
+++ b/mali_kbase/tl/backend/mali_kbase_timeline_csf.c
@@ -44,6 +44,12 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev)
GPU_ID2_ARCH_MAJOR) >>
GPU_ID2_ARCH_MAJOR_SHIFT;
u32 const num_sb_entries = arch_maj >= 11 ? 16 : 8;
+ u32 const supports_gpu_sleep =
+#ifdef KBASE_PM_RUNTIME
+ kbdev->pm.backend.gpu_sleep_supported;
+#else
+ false;
+#endif /* KBASE_PM_RUNTIME */
/* Summarize the Address Space objects. */
for (as_nr = 0; as_nr < kbdev->nr_hw_address_spaces; as_nr++)
@@ -62,11 +68,11 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev)
kbdev);
/* Trace the creation of a new kbase device and set its properties. */
- __kbase_tlstream_tl_kbase_new_device(
- summary, kbdev->gpu_props.props.raw_props.gpu_id,
- kbdev->gpu_props.num_cores, kbdev->csf.global_iface.group_num,
- kbdev->nr_hw_address_spaces, num_sb_entries,
- kbdev_has_cross_stream_sync);
+ __kbase_tlstream_tl_kbase_new_device(summary, kbdev->gpu_props.props.raw_props.gpu_id,
+ kbdev->gpu_props.num_cores,
+ kbdev->csf.global_iface.group_num,
+ kbdev->nr_hw_address_spaces, num_sb_entries,
+ kbdev_has_cross_stream_sync, supports_gpu_sleep);
/* Lock the context list, to ensure no changes to the list are made
* while we're summarizing the contexts and their contents.
@@ -89,7 +95,7 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev)
__kbase_tlstream_tl_kbase_device_program_csg(
summary,
kbdev->gpu_props.props.raw_props.gpu_id,
- group->kctx->id, group->handle, slot_i);
+ group->kctx->id, group->handle, slot_i, 0);
}
/* Reset body stream buffers while holding the kctx lock.
diff --git a/mali_kbase/tl/mali_kbase_tracepoints.c b/mali_kbase/tl/mali_kbase_tracepoints.c
index 54e51f8..abbed05 100644
--- a/mali_kbase/tl/mali_kbase_tracepoints.c
+++ b/mali_kbase/tl/mali_kbase_tracepoints.c
@@ -120,8 +120,14 @@ enum tl_msg_id_obj {
KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_ERROR_BARRIER,
KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_START,
KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_END,
+ KBASE_TL_KBASE_CSFFW_FW_RELOADING,
+ KBASE_TL_KBASE_CSFFW_FW_ENABLING,
+ KBASE_TL_KBASE_CSFFW_FW_REQUEST_SLEEP,
+ KBASE_TL_KBASE_CSFFW_FW_REQUEST_WAKEUP,
+ KBASE_TL_KBASE_CSFFW_FW_REQUEST_HALT,
+ KBASE_TL_KBASE_CSFFW_FW_DISABLING,
+ KBASE_TL_KBASE_CSFFW_FW_OFF,
KBASE_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW,
- KBASE_TL_KBASE_CSFFW_RESET,
KBASE_TL_JS_SCHED_START,
KBASE_TL_JS_SCHED_END,
KBASE_TL_JD_SUBMIT_ATOM_START,
@@ -312,12 +318,12 @@ enum tl_msg_id_aux {
"gpu") \
TRACEPOINT_DESC(KBASE_TL_KBASE_NEW_DEVICE, \
"New KBase Device", \
- "@IIIIII", \
- "kbase_device_id,kbase_device_gpu_core_count,kbase_device_max_num_csgs,kbase_device_as_count,kbase_device_sb_entry_count,kbase_device_has_cross_stream_sync") \
+ "@IIIIIII", \
+ "kbase_device_id,kbase_device_gpu_core_count,kbase_device_max_num_csgs,kbase_device_as_count,kbase_device_sb_entry_count,kbase_device_has_cross_stream_sync,kbase_device_supports_gpu_sleep") \
TRACEPOINT_DESC(KBASE_TL_KBASE_DEVICE_PROGRAM_CSG, \
"CSG is programmed to a slot", \
- "@IIII", \
- "kbase_device_id,kernel_ctx_id,gpu_cmdq_grp_handle,kbase_device_csg_slot_index") \
+ "@IIIII", \
+ "kbase_device_id,kernel_ctx_id,gpu_cmdq_grp_handle,kbase_device_csg_slot_index,kbase_device_csg_slot_resumed") \
TRACEPOINT_DESC(KBASE_TL_KBASE_DEVICE_DEPROGRAM_CSG, \
"CSG is deprogrammed from a slot", \
"@II", \
@@ -506,14 +512,38 @@ enum tl_msg_id_aux {
"KCPU Queue ends a group suspend", \
"@pI", \
"kcpu_queue,execute_error") \
+ TRACEPOINT_DESC(KBASE_TL_KBASE_CSFFW_FW_RELOADING, \
+ "CSF FW is being reloaded", \
+ "@L", \
+ "csffw_cycle") \
+ TRACEPOINT_DESC(KBASE_TL_KBASE_CSFFW_FW_ENABLING, \
+ "CSF FW is being enabled", \
+ "@L", \
+ "csffw_cycle") \
+ TRACEPOINT_DESC(KBASE_TL_KBASE_CSFFW_FW_REQUEST_SLEEP, \
+ "CSF FW sleep is requested", \
+ "@L", \
+ "csffw_cycle") \
+ TRACEPOINT_DESC(KBASE_TL_KBASE_CSFFW_FW_REQUEST_WAKEUP, \
+ "CSF FW wake up is requested", \
+ "@L", \
+ "csffw_cycle") \
+ TRACEPOINT_DESC(KBASE_TL_KBASE_CSFFW_FW_REQUEST_HALT, \
+ "CSF FW halt is requested", \
+ "@L", \
+ "csffw_cycle") \
+ TRACEPOINT_DESC(KBASE_TL_KBASE_CSFFW_FW_DISABLING, \
+ "CSF FW is being disabled", \
+ "@L", \
+ "csffw_cycle") \
+ TRACEPOINT_DESC(KBASE_TL_KBASE_CSFFW_FW_OFF, \
+ "CSF FW is off", \
+ "@L", \
+ "csffw_cycle") \
TRACEPOINT_DESC(KBASE_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW, \
"An overflow has happened with the CSFFW Timeline stream", \
"@LL", \
"csffw_timestamp,csffw_cycle") \
- TRACEPOINT_DESC(KBASE_TL_KBASE_CSFFW_RESET, \
- "A reset has happened with the CSFFW", \
- "@L", \
- "csffw_cycle") \
TRACEPOINT_DESC(KBASE_TL_JS_SCHED_START, \
"Scheduling starts", \
"@I", \
@@ -2046,7 +2076,8 @@ void __kbase_tlstream_tl_kbase_new_device(
u32 kbase_device_max_num_csgs,
u32 kbase_device_as_count,
u32 kbase_device_sb_entry_count,
- u32 kbase_device_has_cross_stream_sync)
+ u32 kbase_device_has_cross_stream_sync,
+ u32 kbase_device_supports_gpu_sleep)
{
const u32 msg_id = KBASE_TL_KBASE_NEW_DEVICE;
const size_t msg_size = sizeof(msg_id) + sizeof(u64)
@@ -2056,6 +2087,7 @@ void __kbase_tlstream_tl_kbase_new_device(
+ sizeof(kbase_device_as_count)
+ sizeof(kbase_device_sb_entry_count)
+ sizeof(kbase_device_has_cross_stream_sync)
+ + sizeof(kbase_device_supports_gpu_sleep)
;
char *buffer;
unsigned long acq_flags;
@@ -2077,6 +2109,8 @@ void __kbase_tlstream_tl_kbase_new_device(
pos, &kbase_device_sb_entry_count, sizeof(kbase_device_sb_entry_count));
pos = kbasep_serialize_bytes(buffer,
pos, &kbase_device_has_cross_stream_sync, sizeof(kbase_device_has_cross_stream_sync));
+ pos = kbasep_serialize_bytes(buffer,
+ pos, &kbase_device_supports_gpu_sleep, sizeof(kbase_device_supports_gpu_sleep));
kbase_tlstream_msgbuf_release(stream, acq_flags);
}
@@ -2086,7 +2120,8 @@ void __kbase_tlstream_tl_kbase_device_program_csg(
u32 kbase_device_id,
u32 kernel_ctx_id,
u32 gpu_cmdq_grp_handle,
- u32 kbase_device_csg_slot_index)
+ u32 kbase_device_csg_slot_index,
+ u32 kbase_device_csg_slot_resumed)
{
const u32 msg_id = KBASE_TL_KBASE_DEVICE_PROGRAM_CSG;
const size_t msg_size = sizeof(msg_id) + sizeof(u64)
@@ -2094,6 +2129,7 @@ void __kbase_tlstream_tl_kbase_device_program_csg(
+ sizeof(kernel_ctx_id)
+ sizeof(gpu_cmdq_grp_handle)
+ sizeof(kbase_device_csg_slot_index)
+ + sizeof(kbase_device_csg_slot_resumed)
;
char *buffer;
unsigned long acq_flags;
@@ -2111,6 +2147,8 @@ void __kbase_tlstream_tl_kbase_device_program_csg(
pos, &gpu_cmdq_grp_handle, sizeof(gpu_cmdq_grp_handle));
pos = kbasep_serialize_bytes(buffer,
pos, &kbase_device_csg_slot_index, sizeof(kbase_device_csg_slot_index));
+ pos = kbasep_serialize_bytes(buffer,
+ pos, &kbase_device_csg_slot_resumed, sizeof(kbase_device_csg_slot_resumed));
kbase_tlstream_msgbuf_release(stream, acq_flags);
}
@@ -3309,14 +3347,12 @@ void __kbase_tlstream_tl_kbase_kcpuqueue_execute_group_suspend_end(
kbase_tlstream_msgbuf_release(stream, acq_flags);
}
-void __kbase_tlstream_tl_kbase_csffw_tlstream_overflow(
+void __kbase_tlstream_tl_kbase_csffw_fw_reloading(
struct kbase_tlstream *stream,
- u64 csffw_timestamp,
u64 csffw_cycle)
{
- const u32 msg_id = KBASE_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW;
+ const u32 msg_id = KBASE_TL_KBASE_CSFFW_FW_RELOADING;
const size_t msg_size = sizeof(msg_id) + sizeof(u64)
- + sizeof(csffw_timestamp)
+ sizeof(csffw_cycle)
;
char *buffer;
@@ -3328,18 +3364,104 @@ void __kbase_tlstream_tl_kbase_csffw_tlstream_overflow(
pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id));
pos = kbasep_serialize_timestamp(buffer, pos);
pos = kbasep_serialize_bytes(buffer,
- pos, &csffw_timestamp, sizeof(csffw_timestamp));
+ pos, &csffw_cycle, sizeof(csffw_cycle));
+
+ kbase_tlstream_msgbuf_release(stream, acq_flags);
+}
+
+void __kbase_tlstream_tl_kbase_csffw_fw_enabling(
+ struct kbase_tlstream *stream,
+ u64 csffw_cycle)
+{
+ const u32 msg_id = KBASE_TL_KBASE_CSFFW_FW_ENABLING;
+ const size_t msg_size = sizeof(msg_id) + sizeof(u64)
+ + sizeof(csffw_cycle)
+ ;
+ char *buffer;
+ unsigned long acq_flags;
+ size_t pos = 0;
+
+ buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags);
+
+ pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id));
+ pos = kbasep_serialize_timestamp(buffer, pos);
+ pos = kbasep_serialize_bytes(buffer,
+ pos, &csffw_cycle, sizeof(csffw_cycle));
+
+ kbase_tlstream_msgbuf_release(stream, acq_flags);
+}
+
+void __kbase_tlstream_tl_kbase_csffw_fw_request_sleep(
+ struct kbase_tlstream *stream,
+ u64 csffw_cycle)
+{
+ const u32 msg_id = KBASE_TL_KBASE_CSFFW_FW_REQUEST_SLEEP;
+ const size_t msg_size = sizeof(msg_id) + sizeof(u64)
+ + sizeof(csffw_cycle)
+ ;
+ char *buffer;
+ unsigned long acq_flags;
+ size_t pos = 0;
+
+ buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags);
+
+ pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id));
+ pos = kbasep_serialize_timestamp(buffer, pos);
+ pos = kbasep_serialize_bytes(buffer,
+ pos, &csffw_cycle, sizeof(csffw_cycle));
+
+ kbase_tlstream_msgbuf_release(stream, acq_flags);
+}
+
+void __kbase_tlstream_tl_kbase_csffw_fw_request_wakeup(
+ struct kbase_tlstream *stream,
+ u64 csffw_cycle)
+{
+ const u32 msg_id = KBASE_TL_KBASE_CSFFW_FW_REQUEST_WAKEUP;
+ const size_t msg_size = sizeof(msg_id) + sizeof(u64)
+ + sizeof(csffw_cycle)
+ ;
+ char *buffer;
+ unsigned long acq_flags;
+ size_t pos = 0;
+
+ buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags);
+
+ pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id));
+ pos = kbasep_serialize_timestamp(buffer, pos);
+ pos = kbasep_serialize_bytes(buffer,
+ pos, &csffw_cycle, sizeof(csffw_cycle));
+
+ kbase_tlstream_msgbuf_release(stream, acq_flags);
+}
+
+void __kbase_tlstream_tl_kbase_csffw_fw_request_halt(
+ struct kbase_tlstream *stream,
+ u64 csffw_cycle)
+{
+ const u32 msg_id = KBASE_TL_KBASE_CSFFW_FW_REQUEST_HALT;
+ const size_t msg_size = sizeof(msg_id) + sizeof(u64)
+ + sizeof(csffw_cycle)
+ ;
+ char *buffer;
+ unsigned long acq_flags;
+ size_t pos = 0;
+
+ buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags);
+
+ pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id));
+ pos = kbasep_serialize_timestamp(buffer, pos);
pos = kbasep_serialize_bytes(buffer,
pos, &csffw_cycle, sizeof(csffw_cycle));
kbase_tlstream_msgbuf_release(stream, acq_flags);
}
-void __kbase_tlstream_tl_kbase_csffw_reset(
+void __kbase_tlstream_tl_kbase_csffw_fw_disabling(
struct kbase_tlstream *stream,
u64 csffw_cycle)
{
- const u32 msg_id = KBASE_TL_KBASE_CSFFW_RESET;
+ const u32 msg_id = KBASE_TL_KBASE_CSFFW_FW_DISABLING;
const size_t msg_size = sizeof(msg_id) + sizeof(u64)
+ sizeof(csffw_cycle)
;
@@ -3357,6 +3479,54 @@ void __kbase_tlstream_tl_kbase_csffw_reset(
kbase_tlstream_msgbuf_release(stream, acq_flags);
}
+void __kbase_tlstream_tl_kbase_csffw_fw_off(
+ struct kbase_tlstream *stream,
+ u64 csffw_cycle)
+{
+ const u32 msg_id = KBASE_TL_KBASE_CSFFW_FW_OFF;
+ const size_t msg_size = sizeof(msg_id) + sizeof(u64)
+ + sizeof(csffw_cycle)
+ ;
+ char *buffer;
+ unsigned long acq_flags;
+ size_t pos = 0;
+
+ buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags);
+
+ pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id));
+ pos = kbasep_serialize_timestamp(buffer, pos);
+ pos = kbasep_serialize_bytes(buffer,
+ pos, &csffw_cycle, sizeof(csffw_cycle));
+
+ kbase_tlstream_msgbuf_release(stream, acq_flags);
+}
+
+void __kbase_tlstream_tl_kbase_csffw_tlstream_overflow(
+ struct kbase_tlstream *stream,
+ u64 csffw_timestamp,
+ u64 csffw_cycle)
+{
+ const u32 msg_id = KBASE_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW;
+ const size_t msg_size = sizeof(msg_id) + sizeof(u64)
+ + sizeof(csffw_timestamp)
+ + sizeof(csffw_cycle)
+ ;
+ char *buffer;
+ unsigned long acq_flags;
+ size_t pos = 0;
+
+ buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags);
+
+ pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id));
+ pos = kbasep_serialize_timestamp(buffer, pos);
+ pos = kbasep_serialize_bytes(buffer,
+ pos, &csffw_timestamp, sizeof(csffw_timestamp));
+ pos = kbasep_serialize_bytes(buffer,
+ pos, &csffw_cycle, sizeof(csffw_cycle));
+
+ kbase_tlstream_msgbuf_release(stream, acq_flags);
+}
+
void __kbase_tlstream_tl_js_sched_start(
struct kbase_tlstream *stream,
u32 dummy)
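The seven new writers above all share one shape: msg_size is the message id plus sizeof(u64) for the timestamp written by kbasep_serialize_timestamp() plus the caller-supplied csffw_cycle, and the fields are serialized in exactly that order. As a reading aid only (the stream is written field by field, never as a struct, and carries no padding), the implied layout of each single-argument FW state message is:

/* Illustrative layout only; not used by the driver. */
struct sketch_csffw_fw_state_msg {
	u32 msg_id;      /* KBASE_TL_KBASE_CSFFW_FW_RELOADING .. _FW_OFF */
	u64 timestamp;   /* emitted by kbasep_serialize_timestamp() */
	u64 csffw_cycle; /* cycle count supplied by the caller */
};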
diff --git a/mali_kbase/tl/mali_kbase_tracepoints.h b/mali_kbase/tl/mali_kbase_tracepoints.h
index 3fc871c..aa10bc0 100644
--- a/mali_kbase/tl/mali_kbase_tracepoints.h
+++ b/mali_kbase/tl/mali_kbase_tracepoints.h
@@ -310,13 +310,15 @@ void __kbase_tlstream_tl_kbase_new_device(
u32 kbase_device_max_num_csgs,
u32 kbase_device_as_count,
u32 kbase_device_sb_entry_count,
- u32 kbase_device_has_cross_stream_sync);
+ u32 kbase_device_has_cross_stream_sync,
+ u32 kbase_device_supports_gpu_sleep);
void __kbase_tlstream_tl_kbase_device_program_csg(
struct kbase_tlstream *stream,
u32 kbase_device_id,
u32 kernel_ctx_id,
u32 gpu_cmdq_grp_handle,
- u32 kbase_device_csg_slot_index);
+ u32 kbase_device_csg_slot_index,
+ u32 kbase_device_csg_slot_resumed);
void __kbase_tlstream_tl_kbase_device_deprogram_csg(
struct kbase_tlstream *stream,
u32 kbase_device_id,
@@ -498,13 +500,31 @@ void __kbase_tlstream_tl_kbase_kcpuqueue_execute_group_suspend_end(
struct kbase_tlstream *stream,
const void *kcpu_queue,
u32 execute_error);
-void __kbase_tlstream_tl_kbase_csffw_tlstream_overflow(
+void __kbase_tlstream_tl_kbase_csffw_fw_reloading(
+ struct kbase_tlstream *stream,
+ u64 csffw_cycle);
+void __kbase_tlstream_tl_kbase_csffw_fw_enabling(
+ struct kbase_tlstream *stream,
+ u64 csffw_cycle);
+void __kbase_tlstream_tl_kbase_csffw_fw_request_sleep(
+ struct kbase_tlstream *stream,
+ u64 csffw_cycle);
+void __kbase_tlstream_tl_kbase_csffw_fw_request_wakeup(
+ struct kbase_tlstream *stream,
+ u64 csffw_cycle);
+void __kbase_tlstream_tl_kbase_csffw_fw_request_halt(
struct kbase_tlstream *stream,
- u64 csffw_timestamp,
u64 csffw_cycle);
-void __kbase_tlstream_tl_kbase_csffw_reset(
+void __kbase_tlstream_tl_kbase_csffw_fw_disabling(
struct kbase_tlstream *stream,
u64 csffw_cycle);
+void __kbase_tlstream_tl_kbase_csffw_fw_off(
+ struct kbase_tlstream *stream,
+ u64 csffw_cycle);
+void __kbase_tlstream_tl_kbase_csffw_tlstream_overflow(
+ struct kbase_tlstream *stream,
+ u64 csffw_timestamp,
+ u64 csffw_cycle);
void __kbase_tlstream_tl_js_sched_start(
struct kbase_tlstream *stream,
u32 dummy);
@@ -1684,6 +1704,7 @@ struct kbase_tlstream;
* @kbase_device_sb_entry_count: The number of entries each scoreboard set in the
* physical hardware has available
* @kbase_device_has_cross_stream_sync: Whether cross-stream synchronization is supported
+ * @kbase_device_supports_gpu_sleep: Whether GPU sleep is supported
*/
#if MALI_USE_CSF
#define KBASE_TLSTREAM_TL_KBASE_NEW_DEVICE( \
@@ -1693,14 +1714,15 @@ struct kbase_tlstream;
kbase_device_max_num_csgs, \
kbase_device_as_count, \
kbase_device_sb_entry_count, \
- kbase_device_has_cross_stream_sync \
+ kbase_device_has_cross_stream_sync, \
+ kbase_device_supports_gpu_sleep \
) \
do { \
int enabled = atomic_read(&kbdev->timeline_flags); \
if (enabled & BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS) \
__kbase_tlstream_tl_kbase_new_device( \
__TL_DISPATCH_STREAM(kbdev, obj), \
- kbase_device_id, kbase_device_gpu_core_count, kbase_device_max_num_csgs, kbase_device_as_count, kbase_device_sb_entry_count, kbase_device_has_cross_stream_sync); \
+ kbase_device_id, kbase_device_gpu_core_count, kbase_device_max_num_csgs, kbase_device_as_count, kbase_device_sb_entry_count, kbase_device_has_cross_stream_sync, kbase_device_supports_gpu_sleep); \
} while (0)
#else
#define KBASE_TLSTREAM_TL_KBASE_NEW_DEVICE( \
@@ -1710,7 +1732,8 @@ struct kbase_tlstream;
kbase_device_max_num_csgs, \
kbase_device_as_count, \
kbase_device_sb_entry_count, \
- kbase_device_has_cross_stream_sync \
+ kbase_device_has_cross_stream_sync, \
+ kbase_device_supports_gpu_sleep \
) \
do { } while (0)
#endif /* MALI_USE_CSF */
@@ -1724,6 +1747,7 @@ struct kbase_tlstream;
* @kernel_ctx_id: Unique ID for the KBase Context
* @gpu_cmdq_grp_handle: GPU Command Queue Group handle which will match userspace
* @kbase_device_csg_slot_index: The index of the slot in the scheduler being programmed
+ * @kbase_device_csg_slot_resumed: Whether the csg is being resumed
*/
#if MALI_USE_CSF
#define KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG( \
@@ -1731,14 +1755,15 @@ struct kbase_tlstream;
kbase_device_id, \
kernel_ctx_id, \
gpu_cmdq_grp_handle, \
- kbase_device_csg_slot_index \
+ kbase_device_csg_slot_index, \
+ kbase_device_csg_slot_resumed \
) \
do { \
int enabled = atomic_read(&kbdev->timeline_flags); \
if (enabled & BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS) \
__kbase_tlstream_tl_kbase_device_program_csg( \
__TL_DISPATCH_STREAM(kbdev, obj), \
- kbase_device_id, kernel_ctx_id, gpu_cmdq_grp_handle, kbase_device_csg_slot_index); \
+ kbase_device_id, kernel_ctx_id, gpu_cmdq_grp_handle, kbase_device_csg_slot_index, kbase_device_csg_slot_resumed); \
} while (0)
#else
#define KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG( \
@@ -1746,7 +1771,8 @@ struct kbase_tlstream;
kbase_device_id, \
kernel_ctx_id, \
gpu_cmdq_grp_handle, \
- kbase_device_csg_slot_index \
+ kbase_device_csg_slot_index, \
+ kbase_device_csg_slot_resumed \
) \
do { } while (0)
#endif /* MALI_USE_CSF */
@@ -3146,59 +3172,221 @@ struct kbase_tlstream;
#endif /* MALI_USE_CSF */
/**
- * KBASE_TLSTREAM_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW -
- * An overflow has happened with the CSFFW Timeline stream
+ * KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_RELOADING -
+ * CSF FW is being reloaded
*
* @kbdev: Kbase device
- * @csffw_timestamp: Timestamp of a CSFFW event
* @csffw_cycle: Cycle number of a CSFFW event
*/
#if MALI_USE_CSF
-#define KBASE_TLSTREAM_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW( \
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_RELOADING( \
kbdev, \
- csffw_timestamp, \
csffw_cycle \
) \
do { \
int enabled = atomic_read(&kbdev->timeline_flags); \
if (enabled & BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) \
- __kbase_tlstream_tl_kbase_csffw_tlstream_overflow( \
+ __kbase_tlstream_tl_kbase_csffw_fw_reloading( \
__TL_DISPATCH_STREAM(kbdev, obj), \
- csffw_timestamp, csffw_cycle); \
+ csffw_cycle); \
} while (0)
#else
-#define KBASE_TLSTREAM_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW( \
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_RELOADING( \
kbdev, \
- csffw_timestamp, \
csffw_cycle \
) \
do { } while (0)
#endif /* MALI_USE_CSF */
/**
- * KBASE_TLSTREAM_TL_KBASE_CSFFW_RESET -
- * A reset has happened with the CSFFW
+ * KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_ENABLING -
+ * CSF FW is being enabled
*
* @kbdev: Kbase device
* @csffw_cycle: Cycle number of a CSFFW event
*/
#if MALI_USE_CSF
-#define KBASE_TLSTREAM_TL_KBASE_CSFFW_RESET( \
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_ENABLING( \
kbdev, \
csffw_cycle \
) \
do { \
int enabled = atomic_read(&kbdev->timeline_flags); \
if (enabled & BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) \
- __kbase_tlstream_tl_kbase_csffw_reset( \
+ __kbase_tlstream_tl_kbase_csffw_fw_enabling( \
__TL_DISPATCH_STREAM(kbdev, obj), \
csffw_cycle); \
} while (0)
#else
-#define KBASE_TLSTREAM_TL_KBASE_CSFFW_RESET( \
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_ENABLING( \
+ kbdev, \
+ csffw_cycle \
+ ) \
+ do { } while (0)
+#endif /* MALI_USE_CSF */
+
+/**
+ * KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_SLEEP -
+ * CSF FW sleep is requested
+ *
+ * @kbdev: Kbase device
+ * @csffw_cycle: Cycle number of a CSFFW event
+ */
+#if MALI_USE_CSF
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_SLEEP( \
+ kbdev, \
+ csffw_cycle \
+ ) \
+ do { \
+ int enabled = atomic_read(&kbdev->timeline_flags); \
+ if (enabled & BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) \
+ __kbase_tlstream_tl_kbase_csffw_fw_request_sleep( \
+ __TL_DISPATCH_STREAM(kbdev, obj), \
+ csffw_cycle); \
+ } while (0)
+#else
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_SLEEP( \
+ kbdev, \
+ csffw_cycle \
+ ) \
+ do { } while (0)
+#endif /* MALI_USE_CSF */
+
+/**
+ * KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_WAKEUP -
+ * CSF FW wake up is requested
+ *
+ * @kbdev: Kbase device
+ * @csffw_cycle: Cycle number of a CSFFW event
+ */
+#if MALI_USE_CSF
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_WAKEUP( \
+ kbdev, \
+ csffw_cycle \
+ ) \
+ do { \
+ int enabled = atomic_read(&kbdev->timeline_flags); \
+ if (enabled & BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) \
+ __kbase_tlstream_tl_kbase_csffw_fw_request_wakeup( \
+ __TL_DISPATCH_STREAM(kbdev, obj), \
+ csffw_cycle); \
+ } while (0)
+#else
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_WAKEUP( \
+ kbdev, \
+ csffw_cycle \
+ ) \
+ do { } while (0)
+#endif /* MALI_USE_CSF */
+
+/**
+ * KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_HALT -
+ * CSF FW halt is requested
+ *
+ * @kbdev: Kbase device
+ * @csffw_cycle: Cycle number of a CSFFW event
+ */
+#if MALI_USE_CSF
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_HALT( \
kbdev, \
csffw_cycle \
) \
+ do { \
+ int enabled = atomic_read(&kbdev->timeline_flags); \
+ if (enabled & BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) \
+ __kbase_tlstream_tl_kbase_csffw_fw_request_halt( \
+ __TL_DISPATCH_STREAM(kbdev, obj), \
+ csffw_cycle); \
+ } while (0)
+#else
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_HALT( \
+ kbdev, \
+ csffw_cycle \
+ ) \
+ do { } while (0)
+#endif /* MALI_USE_CSF */
+
+/**
+ * KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_DISABLING -
+ * CSF FW is being disabled
+ *
+ * @kbdev: Kbase device
+ * @csffw_cycle: Cycle number of a CSFFW event
+ */
+#if MALI_USE_CSF
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_DISABLING( \
+ kbdev, \
+ csffw_cycle \
+ ) \
+ do { \
+ int enabled = atomic_read(&kbdev->timeline_flags); \
+ if (enabled & BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) \
+ __kbase_tlstream_tl_kbase_csffw_fw_disabling( \
+ __TL_DISPATCH_STREAM(kbdev, obj), \
+ csffw_cycle); \
+ } while (0)
+#else
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_DISABLING( \
+ kbdev, \
+ csffw_cycle \
+ ) \
+ do { } while (0)
+#endif /* MALI_USE_CSF */
+
+/**
+ * KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_OFF -
+ * CSF FW is off
+ *
+ * @kbdev: Kbase device
+ * @csffw_cycle: Cycle number of a CSFFW event
+ */
+#if MALI_USE_CSF
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_OFF( \
+ kbdev, \
+ csffw_cycle \
+ ) \
+ do { \
+ int enabled = atomic_read(&kbdev->timeline_flags); \
+ if (enabled & BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) \
+ __kbase_tlstream_tl_kbase_csffw_fw_off( \
+ __TL_DISPATCH_STREAM(kbdev, obj), \
+ csffw_cycle); \
+ } while (0)
+#else
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_OFF( \
+ kbdev, \
+ csffw_cycle \
+ ) \
+ do { } while (0)
+#endif /* MALI_USE_CSF */
+
+/**
+ * KBASE_TLSTREAM_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW -
+ * An overflow has happened with the CSFFW Timeline stream
+ *
+ * @kbdev: Kbase device
+ * @csffw_timestamp: Timestamp of a CSFFW event
+ * @csffw_cycle: Cycle number of a CSFFW event
+ */
+#if MALI_USE_CSF
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW( \
+ kbdev, \
+ csffw_timestamp, \
+ csffw_cycle \
+ ) \
+ do { \
+ int enabled = atomic_read(&kbdev->timeline_flags); \
+ if (enabled & BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) \
+ __kbase_tlstream_tl_kbase_csffw_tlstream_overflow( \
+ __TL_DISPATCH_STREAM(kbdev, obj), \
+ csffw_timestamp, csffw_cycle); \
+ } while (0)
+#else
+#define KBASE_TLSTREAM_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW( \
+ kbdev, \
+ csffw_timestamp, \
+ csffw_cycle \
+ ) \
do { } while (0)
#endif /* MALI_USE_CSF */
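For completeness, a minimal usage sketch of the new macros (not part of this patch): only the macro name and its guard conditions come from the definitions above; the helper name and the source of the cycle value are assumptions.

#include "mali_kbase.h"
#include "tl/mali_kbase_tracepoints.h"

static void sketch_trace_fw_sleep_request(struct kbase_device *kbdev,
					  u64 fw_cycle)
{
	/* Compiles to a no-op when MALI_USE_CSF is not set, and only reaches
	 * the stream when BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS is enabled
	 * in kbdev->timeline_flags.
	 */
	KBASE_TLSTREAM_TL_KBASE_CSFFW_FW_REQUEST_SLEEP(kbdev, fw_cycle);
}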