author     Jesse Hall <jessehall@google.com>   2021-11-23 14:38:46 -0800
committer  Jesse Hall <jessehall@google.com>   2021-11-23 14:38:46 -0800
commit     0c596dc70431fa2c70021fa1685e3efc969a852d
tree       8c6cfe8da5d3bea214e991cc4438988f65d9081e
parent     bbbb1cf6bb211bb2094dd66656966277c326867f
download   gpu-0c596dc70431fa2c70021fa1685e3efc969a852d.tar.gz
Mali Valhall Android DDK r34p0-00dev1
Provenance:
046d23c969 (collaborate/google/android/v_r34p0-00dev1)
VX504X08X-BU-00000-r34p0-00dev1 - Valhall Android DDK
VX504X08X-SW-99006-r34p0-00dev1 - Valhall Android Renderscript AOSP parts
Documentation from VX504X08X-BU-00000 omitted.
Signed-off-by: Jesse Hall <jessehall@google.com>
Change-Id: I4ebbb3a3af709bd39f883eed3b35bf4657a95797
140 files changed, 9841 insertions, 2569 deletions
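
Both uAPI version-history entries added in this drop (CSF 1.6 and JM 11.32) describe the same headline change: the new HW performance counters interface exposed through KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO and KBASE_IOCTL_KINSTR_PRFCNT_SETUP further down in the diff. As an orientation aid only (not part of the patch), the sketch below shows the two-phase enumeration pattern documented in the kbase_ioctl_kinstr_prfcnt_enum_info kernel-doc: a first call with info_list_ptr == NULL to learn the item size and count, then a second call with a buffer to receive the prfcnt_enum_item list. The /dev/mali0 node, the local copies of the uAPI headers, and the omission of the usual kbase version-check handshake are all assumptions here; a real client would follow up with KBASE_IOCTL_KINSTR_PRFCNT_SETUP, which returns a new fd on success.

/*
 * Minimal userspace sketch of the two-phase ENUM_INFO call (assumptions:
 * /dev/mali0 device node, headers copied locally from
 * common/include/uapi/gpu/arm/midgard/, no version-check handshake shown).
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include "mali_kbase_ioctl.h"        /* KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO */
#include "mali_kbase_hwcnt_reader.h" /* struct prfcnt_enum_item */

int main(void)
{
	struct kbase_ioctl_kinstr_prfcnt_enum_info enum_info = { 0 };
	struct prfcnt_enum_item *items;
	int fd = open("/dev/mali0", O_RDWR); /* assumed device node */

	if (fd < 0)
		return 1;

	/* Pass 1: info_list_ptr is NULL, kernel fills item size and count. */
	if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO, &enum_info) < 0)
		return 1;

	items = calloc(enum_info.info_item_count, enum_info.info_item_size);
	if (!items)
		return 1;
	enum_info.info_list_ptr = (__u64)(uintptr_t)items;

	/* Pass 2: same sizes, non-NULL pointer, kernel copies the enum list. */
	if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO, &enum_info) < 0)
		return 1;

	printf("%u enumeration items of %u bytes each\n",
	       enum_info.info_item_count, enum_info.info_item_size);
	free(items);
	return 0;
}
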
diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h b/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h index 78c328c..f5f859e 100644 --- a/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h @@ -186,17 +186,17 @@ #define BASE_MEM_FLAGS_RESERVED \ BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_20 -#define BASEP_MEM_INVALID_HANDLE (0ull << 12) -#define BASE_MEM_MMU_DUMP_HANDLE (1ull << 12) -#define BASE_MEM_TRACE_BUFFER_HANDLE (2ull << 12) -#define BASE_MEM_MAP_TRACKING_HANDLE (3ull << 12) -#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ull << 12) +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) /* reserved handles ..-47<<PAGE_SHIFT> for future special handles */ -#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << 12) -#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << 12) -#define BASE_MEM_COOKIE_BASE (64ul << 12) -#define BASE_MEM_FIRST_FREE_ADDRESS ((BITS_PER_LONG << 12) + \ - BASE_MEM_COOKIE_BASE) +#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_FIRST_FREE_ADDRESS \ + ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) #define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ @@ -301,7 +301,6 @@ typedef __u32 base_context_create_flags; */ #define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) -#if MALI_UNIT_TEST /** * enum base_kcpu_command_type - Kernel CPU queue command type. * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, @@ -331,42 +330,8 @@ enum base_kcpu_command_type { BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, BASE_KCPU_COMMAND_TYPE_JIT_FREE, BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, - BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER, - BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER }; -#else -/** - * enum base_kcpu_command_type - Kernel CPU queue command type. 
- * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, - * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, - * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, - * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, - * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, - * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, - * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, - * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, - * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, - * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, - * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, - * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, - * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, - */ -enum base_kcpu_command_type { - BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, - BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, - BASE_KCPU_COMMAND_TYPE_CQS_WAIT, - BASE_KCPU_COMMAND_TYPE_CQS_SET, - BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, - BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, - BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, - BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, - BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, - BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, - BASE_KCPU_COMMAND_TYPE_JIT_FREE, - BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, - BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER, -}; -#endif /* MALI_UNIT_TEST */ /** * enum base_queue_group_priority - Priority of a GPU Command Queue Group. @@ -568,11 +533,6 @@ struct base_kcpu_command_group_suspend_info { __u8 padding[3]; }; -#if MALI_UNIT_TEST -struct base_kcpu_command_sample_time_info { - __u64 time; -}; -#endif /* MALI_UNIT_TEST */ /** * struct base_kcpu_command - kcpu command. @@ -603,9 +563,6 @@ struct base_kcpu_command { struct base_kcpu_command_jit_alloc_info jit_alloc; struct base_kcpu_command_jit_free_info jit_free; struct base_kcpu_command_group_suspend_info suspend_buf_copy; -#if MALI_UNIT_TEST - struct base_kcpu_command_sample_time_info sample_time; -#endif /* MALI_UNIT_TEST */ __u64 padding[2]; /* No sub-struct should be larger */ } info; }; diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h b/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h index 06cc4c2..a5dc745 100644 --- a/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h @@ -20,7 +20,8 @@ */ /* - * This header was autogenerated, it should not be edited. + * This header was originally autogenerated, but it is now ok (and + * expected) to have to add to it. 
*/ #ifndef _UAPI_GPU_CSF_REGISTERS_H_ @@ -212,7 +213,6 @@ #define GLB_PWROFF_TIMER 0x0014 /* () Global shader core power off timer */ #define GLB_ALLOC_EN_LO 0x0018 /* () Global shader core allocation enable mask, low word */ #define GLB_ALLOC_EN_HI 0x001C /* () Global shader core allocation enable mask, high word */ -#define GLB_PROTM_COHERENCY 0x0020 /* () Configure COHERENCY_ENABLE register value to use in protected mode execution */ #define GLB_PRFCNT_JASID 0x0024 /* () Performance counter address space */ #define GLB_PRFCNT_BASE_LO 0x0028 /* () Performance counter buffer address, low word */ @@ -653,7 +653,9 @@ (((reg_val) & ~CS_FAULT_EXCEPTION_TYPE_MASK) | \ (((value) << CS_FAULT_EXCEPTION_TYPE_SHIFT) & CS_FAULT_EXCEPTION_TYPE_MASK)) /* CS_FAULT_EXCEPTION_TYPE values */ +#define CS_FAULT_EXCEPTION_TYPE_KABOOM 0x05 #define CS_FAULT_EXCEPTION_TYPE_CS_RESOURCE_TERMINATED 0x0F +#define CS_FAULT_EXCEPTION_TYPE_CS_BUS_FAULT 0x48 #define CS_FAULT_EXCEPTION_TYPE_CS_INHERIT_FAULT 0x4B #define CS_FAULT_EXCEPTION_TYPE_INSTR_INVALID_PC 0x50 #define CS_FAULT_EXCEPTION_TYPE_INSTR_INVALID_ENC 0x51 @@ -1164,6 +1166,13 @@ (((reg_val) & ~GLB_REQ_FIRMWARE_CONFIG_UPDATE_MASK) | \ (((value) << GLB_REQ_FIRMWARE_CONFIG_UPDATE_SHIFT) & \ GLB_REQ_FIRMWARE_CONFIG_UPDATE_MASK)) +#define GLB_REQ_SLEEP_SHIFT 12 +#define GLB_REQ_SLEEP_MASK (0x1 << GLB_REQ_SLEEP_SHIFT) +#define GLB_REQ_SLEEP_GET(reg_val) \ + (((reg_val) & GLB_REQ_SLEEP_MASK) >> GLB_REQ_SLEEP_SHIFT) +#define GLB_REQ_SLEEP_SET(reg_val, value) \ + (((reg_val) & ~GLB_REQ_SLEEP_MASK) | \ + (((value) << GLB_REQ_SLEEP_SHIFT) & GLB_REQ_SLEEP_MASK)) #define GLB_REQ_INACTIVE_COMPUTE_SHIFT 20 #define GLB_REQ_INACTIVE_COMPUTE_MASK (0x1 << GLB_REQ_INACTIVE_COMPUTE_SHIFT) #define GLB_REQ_INACTIVE_COMPUTE_GET(reg_val) \ @@ -1391,19 +1400,6 @@ #define GLB_ALLOC_EN_MASK_SET(reg_val, value) \ (((reg_val) & ~GLB_ALLOC_EN_MASK_MASK) | (((value) << GLB_ALLOC_EN_MASK_SHIFT) & GLB_ALLOC_EN_MASK_MASK)) -/* GLB_PROTM_COHERENCY register */ -#define GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_SHIFT 0 -#define GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_MASK \ - (0xFFFFFFFF << GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_SHIFT) -#define GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_GET(reg_val) \ - (((reg_val)&GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_MASK) >> \ - GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_SHIFT) -#define GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_SET(reg_val, value) \ - (((reg_val) & ~GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_MASK) | \ - (((value) << GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_SHIFT) & \ - GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_MASK)) -/* End of GLB_INPUT_BLOCK register set definitions */ - /* GLB_OUTPUT_BLOCK register set definitions */ /* GLB_ACK register */ @@ -1485,4 +1481,28 @@ (((reg_val) & ~CSG_STATUS_STATE_IDLE_MASK) | \ (((value) << CSG_STATUS_STATE_IDLE_SHIFT) & CSG_STATUS_STATE_IDLE_MASK)) +/* GLB_FEATURES_ITER_TRACE_SUPPORTED register */ +#define GLB_FEATURES_ITER_TRACE_SUPPORTED_SHIFT GPU_U(4) +#define GLB_FEATURES_ITER_TRACE_SUPPORTED_MASK \ + (GPU_U(0x1) << GLB_FEATURES_ITER_TRACE_SUPPORTED_SHIFT) +#define GLB_FEATURES_ITER_TRACE_SUPPORTED_GET(reg_val) \ + (((reg_val)&GLB_FEATURES_ITER_TRACE_SUPPORTED_MASK) >> \ + GLB_FEATURES_ITER_TRACE_SUPPORTED_SHIFT) +#define GLB_FEATURES_ITER_TRACE_SUPPORTED_SET(reg_val, value) \ + (((reg_val) & ~GLB_FEATURES_ITER_TRACE_SUPPORTED_MASK) | \ + (((value) << GLB_FEATURES_ITER_TRACE_SUPPORTED_SHIFT) & \ + GLB_FEATURES_ITER_TRACE_SUPPORTED_MASK)) + +/* 
GLB_REQ_ITER_TRACE_ENABLE register */ +#define GLB_REQ_ITER_TRACE_ENABLE_SHIFT GPU_U(11) +#define GLB_REQ_ITER_TRACE_ENABLE_MASK \ + (GPU_U(0x1) << GLB_REQ_ITER_TRACE_ENABLE_SHIFT) +#define GLB_REQ_ITER_TRACE_ENABLE_GET(reg_val) \ + (((reg_val)&GLB_REQ_ITER_TRACE_ENABLE_MASK) >> \ + GLB_REQ_ITER_TRACE_ENABLE_SHIFT) +#define GLB_REQ_ITER_TRACE_ENABLE_SET(reg_val, value) \ + (((reg_val) & ~GLB_REQ_ITER_TRACE_ENABLE_MASK) | \ + (((value) << GLB_REQ_ITER_TRACE_ENABLE_SHIFT) & \ + GLB_REQ_ITER_TRACE_ENABLE_MASK)) + #endif /* _UAPI_GPU_CSF_REGISTERS_H_ */ diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h b/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h index d2d7ce2..ec4870c 100644 --- a/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h @@ -44,6 +44,8 @@ * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new * queue registration call with extended format for supporting CS * trace configurations with CSF trace_command. + * 1.6: + * - Added new HW performance counters interface to all GPUs. */ #define BASE_UK_VERSION_MAJOR 1 diff --git a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h index 2041739..4001a4c 100644 --- a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h @@ -28,8 +28,13 @@ #error "Cannot be compiled with JM" #endif -/* IPA control registers */ +/* GPU_CONTROL_MCU base address */ +#define GPU_CONTROL_MCU_BASE 0x3000 + +/* MCU_SUBSYSTEM base address */ +#define MCU_SUBSYSTEM_BASE 0x20000 +/* IPA control registers */ #define IPA_CONTROL_BASE 0x40000 #define IPA_CONTROL_REG(r) (IPA_CONTROL_BASE+(r)) #define COMMAND 0x000 /* (WO) Command register */ @@ -63,8 +68,6 @@ #define VALUE_SHADER_REG_LO(n) (VALUE_SHADER_BASE + ((n) << 3)) /* (RO) Counter value #n, low word */ #define VALUE_SHADER_REG_HI(n) (VALUE_SHADER_BASE + ((n) << 3) + 4) /* (RO) Counter value #n, high word */ -#include "../../csf/mali_gpu_csf_control_registers.h" - /* Set to implementation defined, outer caching */ #define AS_MEMATTR_AARCH64_OUTER_IMPL_DEF 0x88ull /* Set to write back memory, outer caching */ @@ -117,6 +120,9 @@ #define MCU_CNTRL_AUTO (1 << 1) #define MCU_CNTRL_DISABLE (0) +#define MCU_CNTRL_DOORBELL_DISABLE_SHIFT (31) +#define MCU_CNTRL_DOORBELL_DISABLE_MASK (1 << MCU_CNTRL_DOORBELL_DISABLE_SHIFT) + #define MCU_STATUS_HALTED (1 << 1) #define PRFCNT_BASE_LO 0x060 /* (RW) Performance counter memory @@ -181,11 +187,19 @@ #define GPU_COMMAND_TIME_DISABLE 0x00 /* Disable cycle counter */ #define GPU_COMMAND_TIME_ENABLE 0x01 /* Enable cycle counter */ -/* GPU_COMMAND_FLUSH_CACHES payloads */ -#define GPU_COMMAND_FLUSH_PAYLOAD_NONE 0x00 /* No flush */ -#define GPU_COMMAND_FLUSH_PAYLOAD_CLEAN 0x01 /* Clean the caches */ -#define GPU_COMMAND_FLUSH_PAYLOAD_INVALIDATE 0x02 /* Invalidate the caches */ -#define GPU_COMMAND_FLUSH_PAYLOAD_CLEAN_INVALIDATE 0x03 /* Clean and invalidate the caches */ +/* GPU_COMMAND_FLUSH_CACHES payloads bits for L2 caches */ +#define GPU_COMMAND_FLUSH_PAYLOAD_L2_NONE 0x000 /* No flush */ +#define GPU_COMMAND_FLUSH_PAYLOAD_L2_CLEAN 0x001 /* CLN only */ +#define GPU_COMMAND_FLUSH_PAYLOAD_L2_CLEAN_INVALIDATE 0x003 /* CLN + INV */ + +/* GPU_COMMAND_FLUSH_CACHES payloads bits for Load-store caches */ +#define GPU_COMMAND_FLUSH_PAYLOAD_LSC_NONE 0x000 /* 
No flush */ +#define GPU_COMMAND_FLUSH_PAYLOAD_LSC_CLEAN 0x010 /* CLN only */ +#define GPU_COMMAND_FLUSH_PAYLOAD_LSC_CLEAN_INVALIDATE 0x030 /* CLN + INV */ + +/* GPU_COMMAND_FLUSH_CACHES payloads bits for Other caches */ +#define GPU_COMMAND_FLUSH_PAYLOAD_OTHER_NONE 0x000 /* No flush */ +#define GPU_COMMAND_FLUSH_PAYLOAD_OTHER_INVALIDATE 0x200 /* INV only */ /* GPU_COMMAND command + payload */ #define GPU_COMMAND_CODE_PAYLOAD(opcode, payload) \ @@ -220,13 +234,21 @@ #define GPU_COMMAND_CYCLE_COUNT_STOP \ GPU_COMMAND_CODE_PAYLOAD(GPU_COMMAND_CODE_TIME, GPU_COMMAND_TIME_DISABLE) -/* Clean all caches */ -#define GPU_COMMAND_CLEAN_CACHES \ - GPU_COMMAND_CODE_PAYLOAD(GPU_COMMAND_CODE_FLUSH_CACHES, GPU_COMMAND_FLUSH_PAYLOAD_CLEAN) - -/* Clean and invalidate all caches */ -#define GPU_COMMAND_CLEAN_INV_CACHES \ - GPU_COMMAND_CODE_PAYLOAD(GPU_COMMAND_CODE_FLUSH_CACHES, GPU_COMMAND_FLUSH_PAYLOAD_CLEAN_INVALIDATE) +/* Clean and invalidate L2 cache (Equivalent to FLUSH_PT) */ +#define GPU_COMMAND_CACHE_CLN_INV_L2 \ + GPU_COMMAND_CODE_PAYLOAD( \ + GPU_COMMAND_CODE_FLUSH_CACHES, \ + (GPU_COMMAND_FLUSH_PAYLOAD_L2_CLEAN_INVALIDATE | \ + GPU_COMMAND_FLUSH_PAYLOAD_LSC_NONE | \ + GPU_COMMAND_FLUSH_PAYLOAD_OTHER_NONE)) + +/* Clean and invalidate L2 and LSC caches (Equivalent to FLUSH_MEM) */ +#define GPU_COMMAND_CACHE_CLN_INV_L2_LSC \ + GPU_COMMAND_CODE_PAYLOAD( \ + GPU_COMMAND_CODE_FLUSH_CACHES, \ + (GPU_COMMAND_FLUSH_PAYLOAD_L2_CLEAN_INVALIDATE | \ + GPU_COMMAND_FLUSH_PAYLOAD_LSC_CLEAN_INVALIDATE | \ + GPU_COMMAND_FLUSH_PAYLOAD_OTHER_NONE)) /* Places the GPU in protected mode */ #define GPU_COMMAND_SET_PROTECTED_MODE \ diff --git a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h index 1be3541..dcadcc7 100644 --- a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h @@ -261,6 +261,10 @@ #define GPU_COMMAND_CLEAN_INV_CACHES 0x08 /* Clean and invalidate all caches */ #define GPU_COMMAND_SET_PROTECTED_MODE 0x09 /* Places the GPU in protected mode */ +/* GPU_COMMAND cache flush alias to CSF command payload */ +#define GPU_COMMAND_CACHE_CLN_INV_L2 GPU_COMMAND_CLEAN_INV_CACHES +#define GPU_COMMAND_CACHE_CLN_INV_L2_LSC GPU_COMMAND_CLEAN_INV_CACHES + /* IRQ flags */ #define GPU_FAULT (1 << 0) /* A GPU Fault has occurred */ #define MULTIPLE_GPU_FAULTS (1 << 7) /* More than one GPU Fault occurred. */ diff --git a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h index d093ce4..666b0af 100644 --- a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h @@ -53,6 +53,20 @@ GPU_ID2_VERSION_MINOR | \ GPU_ID2_VERSION_STATUS) +/* Helper macro to construct a value consisting of arch major and revision + * using the value of gpu_id. + */ +#define ARCH_MAJOR_REV_REG(gpu_id) \ + ((((__u32)gpu_id) & GPU_ID2_ARCH_MAJOR) | \ + (((__u32)gpu_id) & GPU_ID2_ARCH_REV)) + +/* Helper macro to create a partial GPU_ID (new format) that defines + * a arch major and revision. + */ +#define GPU_ID2_ARCH_MAJOR_REV_MAKE(arch_major, arch_rev) \ + ((((__u32)arch_major) << GPU_ID2_ARCH_MAJOR_SHIFT) | \ + (((__u32)arch_rev) << GPU_ID2_ARCH_REV_SHIFT)) + /* Helper macro to create a partial GPU_ID (new format) that defines * a product ignoring its version. 
*/ @@ -109,6 +123,8 @@ #define GPU_ID2_PRODUCT_TGRX GPU_ID2_MODEL_MAKE(10, 3) #define GPU_ID2_PRODUCT_TVAX GPU_ID2_MODEL_MAKE(10, 4) #define GPU_ID2_PRODUCT_LODX GPU_ID2_MODEL_MAKE(10, 7) +#define GPU_ID2_PRODUCT_TTUX GPU_ID2_MODEL_MAKE(11, 2) +#define GPU_ID2_PRODUCT_LTUX GPU_ID2_MODEL_MAKE(11, 3) /* Helper macro to create a GPU_ID assuming valid values for id, major, * minor, status diff --git a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h index 84fad8d..e223220 100644 --- a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h @@ -30,6 +30,13 @@ #include "backend/mali_kbase_gpu_regmap_jm.h" #endif +/* GPU_U definition */ +#ifdef __ASSEMBLER__ +#define GPU_U(x) x +#else +#define GPU_U(x) x##u +#endif /* __ASSEMBLER__ */ + /* Begin Register Offsets */ /* GPU control registers */ @@ -149,6 +156,10 @@ #define ASN_HASH(n) (ASN_HASH_0 + (n)*4) #define ASN_HASH_COUNT 3 +#define SYSC_ALLOC0 0x0340 /* (RW) System cache allocation hint from source ID */ +#define SYSC_ALLOC(n) (SYSC_ALLOC0 + (n)*4) +#define SYSC_ALLOC_COUNT 8 + #define STACK_PWRTRANS_LO 0xE40 /* (RO) Core stack power transition bitmap, low word */ #define STACK_PWRTRANS_HI 0xE44 /* (RO) Core stack power transition bitmap, high word */ @@ -164,6 +175,7 @@ #define COHERENCY_FEATURES 0x300 /* (RO) Coherency features present */ #define COHERENCY_ENABLE 0x304 /* (RW) Coherency enable */ + #define SHADER_CONFIG 0xF04 /* (RW) Shader core configuration (implementation-specific) */ #define TILER_CONFIG 0xF08 /* (RW) Tiler core configuration (implementation-specific) */ #define L2_MMU_CONFIG 0xF0C /* (RW) L2 cache and MMU configuration (implementation-specific) */ @@ -327,10 +339,6 @@ #define AS_COMMAND_UPDATE 0x01 /* Broadcasts the values in AS_TRANSTAB and ASn_MEMATTR to all MMUs */ #define AS_COMMAND_LOCK 0x02 /* Issue a lock region command to all MMUs */ #define AS_COMMAND_UNLOCK 0x03 /* Issue a flush region command to all MMUs */ -/* Flush all L2 caches then issue a flush region command to all MMUs - * (deprecated - only for use with T60x) - */ -#define AS_COMMAND_FLUSH 0x04 /* Flush all L2 caches then issue a flush region command to all MMUs */ #define AS_COMMAND_FLUSH_PT 0x04 /* Wait for memory accesses to complete, flush all the L1s cache then flush all @@ -338,6 +346,28 @@ */ #define AS_COMMAND_FLUSH_MEM 0x05 +/* AS_LOCKADDR register */ +#define AS_LOCKADDR_LOCKADDR_SIZE_SHIFT GPU_U(0) +#define AS_LOCKADDR_LOCKADDR_SIZE_MASK \ + (GPU_U(0x3F) << AS_LOCKADDR_LOCKADDR_SIZE_SHIFT) +#define AS_LOCKADDR_LOCKADDR_SIZE_GET(reg_val) \ + (((reg_val)&AS_LOCKADDR_LOCKADDR_SIZE_MASK) >> \ + AS_LOCKADDR_LOCKADDR_SIZE_SHIFT) +#define AS_LOCKADDR_LOCKADDR_SIZE_SET(reg_val, value) \ + (((reg_val) & ~AS_LOCKADDR_LOCKADDR_SIZE_MASK) | \ + (((value) << AS_LOCKADDR_LOCKADDR_SIZE_SHIFT) & \ + AS_LOCKADDR_LOCKADDR_SIZE_MASK)) +#define AS_LOCKADDR_LOCKADDR_BASE_SHIFT GPU_U(12) +#define AS_LOCKADDR_LOCKADDR_BASE_MASK \ + (GPU_U(0xFFFFFFFFFFFFF) << AS_LOCKADDR_LOCKADDR_BASE_SHIFT) +#define AS_LOCKADDR_LOCKADDR_BASE_GET(reg_val) \ + (((reg_val)&AS_LOCKADDR_LOCKADDR_BASE_MASK) >> \ + AS_LOCKADDR_LOCKADDR_BASE_SHIFT) +#define AS_LOCKADDR_LOCKADDR_BASE_SET(reg_val, value) \ + (((reg_val) & ~AS_LOCKADDR_LOCKADDR_BASE_MASK) | \ + (((value) << AS_LOCKADDR_LOCKADDR_BASE_SHIFT) & \ + AS_LOCKADDR_LOCKADDR_BASE_MASK)) + /* GPU_STATUS values */ #define GPU_STATUS_PRFCNT_ACTIVE (1 << 2) /* Set if the 
performance counters are active. */ #define GPU_STATUS_CYCLE_COUNT_ACTIVE (1 << 6) /* Set if the cycle counter is active. */ @@ -427,8 +457,133 @@ #define L2_CONFIG_ASN_HASH_ENABLE_MASK (1ul << L2_CONFIG_ASN_HASH_ENABLE_SHIFT) /* End L2_CONFIG register */ + /* IDVS_GROUP register */ #define IDVS_GROUP_SIZE_SHIFT (16) #define IDVS_GROUP_MAX_SIZE (0x3F) +/* SYSC_ALLOC read IDs */ +#define SYSC_ALLOC_ID_R_OTHER 0x00 +#define SYSC_ALLOC_ID_R_CSF 0x02 +#define SYSC_ALLOC_ID_R_MMU 0x04 +#define SYSC_ALLOC_ID_R_TILER_VERT 0x08 +#define SYSC_ALLOC_ID_R_TILER_PTR 0x09 +#define SYSC_ALLOC_ID_R_TILER_INDEX 0x0A +#define SYSC_ALLOC_ID_R_TILER_OTHER 0x0B +#define SYSC_ALLOC_ID_R_IC 0x10 +#define SYSC_ALLOC_ID_R_ATTR 0x11 +#define SYSC_ALLOC_ID_R_SCM 0x12 +#define SYSC_ALLOC_ID_R_FSDC 0x13 +#define SYSC_ALLOC_ID_R_VL 0x14 +#define SYSC_ALLOC_ID_R_PLR 0x15 +#define SYSC_ALLOC_ID_R_TEX 0x18 +#define SYSC_ALLOC_ID_R_LSC 0x1c + +/* SYSC_ALLOC write IDs */ +#define SYSC_ALLOC_ID_W_OTHER 0x00 +#define SYSC_ALLOC_ID_W_CSF 0x02 +#define SYSC_ALLOC_ID_W_PCB 0x07 +#define SYSC_ALLOC_ID_W_TILER_PTR 0x09 +#define SYSC_ALLOC_ID_W_TILER_VERT_PLIST 0x0A +#define SYSC_ALLOC_ID_W_TILER_OTHER 0x0B +#define SYSC_ALLOC_ID_W_L2_EVICT 0x0C +#define SYSC_ALLOC_ID_W_L2_FLUSH 0x0D +#define SYSC_ALLOC_ID_W_TIB_COLOR 0x10 +#define SYSC_ALLOC_ID_W_TIB_COLOR_AFBCH 0x11 +#define SYSC_ALLOC_ID_W_TIB_COLOR_AFBCB 0x12 +#define SYSC_ALLOC_ID_W_TIB_CRC 0x13 +#define SYSC_ALLOC_ID_W_TIB_DS 0x14 +#define SYSC_ALLOC_ID_W_TIB_DS_AFBCH 0x15 +#define SYSC_ALLOC_ID_W_TIB_DS_AFBCB 0x16 +#define SYSC_ALLOC_ID_W_LSC 0x1C + +/* SYSC_ALLOC values */ +#define SYSC_ALLOC_L2_ALLOC 0x0 +#define SYSC_ALLOC_NEVER_ALLOC 0x2 +#define SYSC_ALLOC_ALWAYS_ALLOC 0x3 +#define SYSC_ALLOC_PTL_ALLOC 0x4 +#define SYSC_ALLOC_L2_PTL_ALLOC 0x5 + +/* SYSC_ALLOC register */ +#define SYSC_ALLOC_R_SYSC_ALLOC0_SHIFT (0) +#define SYSC_ALLOC_R_SYSC_ALLOC0_MASK ((0xF) << SYSC_ALLOC_R_SYSC_ALLOC0_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC0_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_R_SYSC_ALLOC0_MASK) >> \ + SYSC_ALLOC_R_SYSC_ALLOC0_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC0_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_R_SYSC_ALLOC0_MASK) | \ + (((value) << SYSC_ALLOC_R_SYSC_ALLOC0_SHIFT) & \ + SYSC_ALLOC_R_SYSC_ALLOC0_MASK)) +/* End of SYSC_ALLOC_R_SYSC_ALLOC0 values */ +#define SYSC_ALLOC_W_SYSC_ALLOC0_SHIFT (4) +#define SYSC_ALLOC_W_SYSC_ALLOC0_MASK ((0xF) << SYSC_ALLOC_W_SYSC_ALLOC0_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC0_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_W_SYSC_ALLOC0_MASK) >> \ + SYSC_ALLOC_W_SYSC_ALLOC0_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC0_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_W_SYSC_ALLOC0_MASK) | \ + (((value) << SYSC_ALLOC_W_SYSC_ALLOC0_SHIFT) & \ + SYSC_ALLOC_W_SYSC_ALLOC0_MASK)) +/* End of SYSC_ALLOC_W_SYSC_ALLOC0 values */ +#define SYSC_ALLOC_R_SYSC_ALLOC1_SHIFT (8) +#define SYSC_ALLOC_R_SYSC_ALLOC1_MASK ((0xF) << SYSC_ALLOC_R_SYSC_ALLOC1_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC1_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_R_SYSC_ALLOC1_MASK) >> \ + SYSC_ALLOC_R_SYSC_ALLOC1_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC1_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_R_SYSC_ALLOC1_MASK) | \ + (((value) << SYSC_ALLOC_R_SYSC_ALLOC1_SHIFT) & \ + SYSC_ALLOC_R_SYSC_ALLOC1_MASK)) +/* End of SYSC_ALLOC_R_SYSC_ALLOC1 values */ +#define SYSC_ALLOC_W_SYSC_ALLOC1_SHIFT (12) +#define SYSC_ALLOC_W_SYSC_ALLOC1_MASK ((0xF) << SYSC_ALLOC_W_SYSC_ALLOC1_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC1_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_W_SYSC_ALLOC1_MASK) >> \ + 
SYSC_ALLOC_W_SYSC_ALLOC1_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC1_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_W_SYSC_ALLOC1_MASK) | \ + (((value) << SYSC_ALLOC_W_SYSC_ALLOC1_SHIFT) & \ + SYSC_ALLOC_W_SYSC_ALLOC1_MASK)) +/* End of SYSC_ALLOC_W_SYSC_ALLOC1 values */ +#define SYSC_ALLOC_R_SYSC_ALLOC2_SHIFT (16) +#define SYSC_ALLOC_R_SYSC_ALLOC2_MASK ((0xF) << SYSC_ALLOC_R_SYSC_ALLOC2_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC2_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_R_SYSC_ALLOC2_MASK) >> \ + SYSC_ALLOC_R_SYSC_ALLOC2_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC2_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_R_SYSC_ALLOC2_MASK) | \ + (((value) << SYSC_ALLOC_R_SYSC_ALLOC2_SHIFT) & \ + SYSC_ALLOC_R_SYSC_ALLOC2_MASK)) +/* End of SYSC_ALLOC_R_SYSC_ALLOC2 values */ +#define SYSC_ALLOC_W_SYSC_ALLOC2_SHIFT (20) +#define SYSC_ALLOC_W_SYSC_ALLOC2_MASK ((0xF) << SYSC_ALLOC_W_SYSC_ALLOC2_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC2_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_W_SYSC_ALLOC2_MASK) >> \ + SYSC_ALLOC_W_SYSC_ALLOC2_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC2_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_W_SYSC_ALLOC2_MASK) | \ + (((value) << SYSC_ALLOC_W_SYSC_ALLOC2_SHIFT) & \ + SYSC_ALLOC_W_SYSC_ALLOC2_MASK)) +/* End of SYSC_ALLOC_W_SYSC_ALLOC2 values */ +#define SYSC_ALLOC_R_SYSC_ALLOC3_SHIFT (24) +#define SYSC_ALLOC_R_SYSC_ALLOC3_MASK ((0xF) << SYSC_ALLOC_R_SYSC_ALLOC3_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC3_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_R_SYSC_ALLOC3_MASK) >> \ + SYSC_ALLOC_R_SYSC_ALLOC3_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC3_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_R_SYSC_ALLOC3_MASK) | \ + (((value) << SYSC_ALLOC_R_SYSC_ALLOC3_SHIFT) & \ + SYSC_ALLOC_R_SYSC_ALLOC3_MASK)) +/* End of SYSC_ALLOC_R_SYSC_ALLOC3 values */ +#define SYSC_ALLOC_W_SYSC_ALLOC3_SHIFT (28) +#define SYSC_ALLOC_W_SYSC_ALLOC3_MASK ((0xF) << SYSC_ALLOC_W_SYSC_ALLOC3_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC3_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_W_SYSC_ALLOC3_MASK) >> \ + SYSC_ALLOC_W_SYSC_ALLOC3_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC3_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_W_SYSC_ALLOC3_MASK) | \ + (((value) << SYSC_ALLOC_W_SYSC_ALLOC3_SHIFT) & \ + SYSC_ALLOC_W_SYSC_ALLOC3_MASK)) +/* End of SYSC_ALLOC_W_SYSC_ALLOC3 values */ + #endif /* _UAPI_KBASE_GPU_REGMAP_H_ */ diff --git a/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h b/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h index 749e1fa..7a52fbf 100644 --- a/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h @@ -192,15 +192,15 @@ #define BASE_MEM_FLAGS_RESERVED \ (BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_19) -#define BASEP_MEM_INVALID_HANDLE (0ull << 12) -#define BASE_MEM_MMU_DUMP_HANDLE (1ull << 12) -#define BASE_MEM_TRACE_BUFFER_HANDLE (2ull << 12) -#define BASE_MEM_MAP_TRACKING_HANDLE (3ull << 12) -#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ull << 12) +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) /* reserved handles ..-47<<PAGE_SHIFT> for future special handles */ -#define BASE_MEM_COOKIE_BASE (64ul << 12) -#define BASE_MEM_FIRST_FREE_ADDRESS ((BITS_PER_LONG << 12) + \ - BASE_MEM_COOKIE_BASE) +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define 
BASE_MEM_FIRST_FREE_ADDRESS \ + ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) /* Similar to BASE_MEM_TILER_ALIGN_TOP, memory starting from the end of the * initial commit is aligned to 'extension' pages, where 'extension' must be a power diff --git a/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h b/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h index 72d75cb..2598e20 100644 --- a/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h @@ -119,6 +119,8 @@ * 11.31: * - Added BASE_JD_REQ_LIMITED_CORE_MASK. * - Added ioctl 55: set_limited_core_count. + * 11.32: + * - Added new HW performance counters interface to all GPUs. */ #define BASE_UK_VERSION_MAJOR 11 #define BASE_UK_VERSION_MINOR 31 diff --git a/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h b/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h index a46c41f..410d54e 100644 --- a/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h @@ -42,18 +42,6 @@ struct base_mem_handle { #define BASE_MAX_COHERENT_GROUPS 16 -#if defined(CDBG_ASSERT) -#define LOCAL_ASSERT CDBG_ASSERT -#elif defined(KBASE_DEBUG_ASSERT) -#define LOCAL_ASSERT KBASE_DEBUG_ASSERT -#else -#if defined(__KERNEL__) -#error assert macro not defined! -#else -#define LOCAL_ASSERT(...) ((void)#__VA_ARGS__) -#endif -#endif - #if defined(PAGE_MASK) && defined(PAGE_SHIFT) #define LOCAL_PAGE_SHIFT PAGE_SHIFT #define LOCAL_PAGE_LSB ~PAGE_MASK @@ -635,7 +623,7 @@ struct mali_base_gpu_coherent_group_info { * @thread_max_barrier_size: Maximum number of threads per barrier * @thread_features: Thread features * @coherency_mode: Note: This is the _selected_ coherency mode rather than the - * available modes as exposed in the coherency_features register + * available modes as exposed in the coherency_features register * @thread_tls_alloc: Number of threads per core that TLS must be allocated for * @gpu_features: GPU features * @@ -699,7 +687,7 @@ struct gpu_raw_gpu_props { * values from which the value of the other members are derived. The derived * members exist to allow for efficient access and/or shielding the details * of the layout of the registers. - * */ + */ struct base_gpu_props { struct mali_base_gpu_core_props core_props; struct mali_base_gpu_l2_cache_props l2_props; @@ -716,82 +704,24 @@ struct base_gpu_props { #include "jm/mali_base_jm_kernel.h" #endif -/** - * base_mem_group_id_get() - Get group ID from flags - * @flags: Flags to pass to base_mem_alloc - * - * This inline function extracts the encoded group ID from flags - * and converts it into numeric value (0~15). - * - * Return: group ID(0~15) extracted from the parameter - */ -static __inline__ int base_mem_group_id_get(base_mem_alloc_flags flags) -{ - LOCAL_ASSERT((flags & ~BASE_MEM_FLAGS_INPUT_MASK) == 0); - return (int)((flags & BASE_MEM_GROUP_ID_MASK) >> - BASEP_MEM_GROUP_ID_SHIFT); -} - -/** - * base_mem_group_id_set() - Set group ID into base_mem_alloc_flags - * @id: group ID(0~15) you want to encode - * - * This inline function encodes specific group ID into base_mem_alloc_flags. - * Parameter 'id' should lie in-between 0 to 15. - * - * Return: base_mem_alloc_flags with the group ID (id) encoded - * - * The return value can be combined with other flags against base_mem_alloc - * to identify a specific memory group. 
- */ -static __inline__ base_mem_alloc_flags base_mem_group_id_set(int id) -{ - if ((id < 0) || (id >= BASE_MEM_GROUP_COUNT)) { - /* Set to default value when id is out of range. */ - id = BASE_MEM_GROUP_DEFAULT; - } +#define BASE_MEM_GROUP_ID_GET(flags) \ + ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) - return ((base_mem_alloc_flags)id << BASEP_MEM_GROUP_ID_SHIFT) & - BASE_MEM_GROUP_ID_MASK; -} +#define BASE_MEM_GROUP_ID_SET(id) \ + (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? \ + BASE_MEM_GROUP_DEFAULT : \ + id) \ + << BASEP_MEM_GROUP_ID_SHIFT) & \ + BASE_MEM_GROUP_ID_MASK) -/** - * base_context_mmu_group_id_set - Encode a memory group ID in - * base_context_create_flags - * - * Memory allocated for GPU page tables will come from the specified group. - * - * @group_id: Physical memory group ID. Range is 0..(BASE_MEM_GROUP_COUNT-1). - * - * Return: Bitmask of flags to pass to base_context_init. - */ -static __inline__ base_context_create_flags base_context_mmu_group_id_set( - int const group_id) -{ - LOCAL_ASSERT(group_id >= 0); - LOCAL_ASSERT(group_id < BASE_MEM_GROUP_COUNT); - return BASEP_CONTEXT_MMU_GROUP_ID_MASK & - ((base_context_create_flags)group_id << - BASEP_CONTEXT_MMU_GROUP_ID_SHIFT); -} +#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ + (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ + ((base_context_create_flags)(group_id) \ + << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) -/** - * base_context_mmu_group_id_get - Decode a memory group ID from - * base_context_create_flags - * - * Memory allocated for GPU page tables will come from the returned group. - * - * @flags: Bitmask of flags to pass to base_context_init. - * - * Return: Physical memory group ID. Valid range is 0..(BASE_MEM_GROUP_COUNT-1). - */ -static __inline__ int base_context_mmu_group_id_get( - base_context_create_flags const flags) -{ - LOCAL_ASSERT(flags == (flags & BASEP_CONTEXT_CREATE_ALLOWED_FLAGS)); - return (int)((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> - BASEP_CONTEXT_MMU_GROUP_ID_SHIFT); -} +#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ + ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ + BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) /* * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These diff --git a/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h b/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h index 9baaec1..15843ee 100644 --- a/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h +++ b/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h @@ -91,6 +91,7 @@ enum base_hwcnt_reader_event { #define KBASE_HWCNT_READER_API_VERSION_NO_FEATURE (0) #define KBASE_HWCNT_READER_API_VERSION_FEATURE_CYCLES_TOP (1 << 0) #define KBASE_HWCNT_READER_API_VERSION_FEATURE_CYCLES_SHADER_CORES (1 << 1) + /** * struct kbase_hwcnt_reader_api_version - hwcnt reader API version * @version: API version @@ -101,5 +102,263 @@ struct kbase_hwcnt_reader_api_version { __u32 features; }; +/** Hardware counters reader API version */ +#define PRFCNT_READER_API_VERSION (0) + +/** + * enum prfcnt_list_type - Type of list item + * @PRFCNT_LIST_TYPE_ENUM: Enumeration of performance counters. + * @PRFCNT_LIST_TYPE_REQUEST: Request for configuration setup. + * @PRFCNT_LIST_TYPE_SAMPLE_META: Sample metadata. 
+ */ +enum prfcnt_list_type { + PRFCNT_LIST_TYPE_ENUM, + PRFCNT_LIST_TYPE_REQUEST, + PRFCNT_LIST_TYPE_SAMPLE_META, +}; + +#define FLEX_LIST_TYPE(type, subtype) \ + (__u16)(((type & 0xf) << 12) | (subtype & 0xfff)) +#define FLEX_LIST_TYPE_NONE FLEX_LIST_TYPE(0, 0) + +#define PRFCNT_ENUM_TYPE_BLOCK FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_ENUM, 0) +#define PRFCNT_ENUM_TYPE_REQUEST FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_ENUM, 1) + +#define PRFCNT_REQUEST_TYPE_MODE FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_REQUEST, 0) +#define PRFCNT_REQUEST_TYPE_ENABLE FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_REQUEST, 1) + +#define PRFCNT_SAMPLE_META_TYPE_SAMPLE \ + FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_SAMPLE_META, 0) +#define PRFCNT_SAMPLE_META_TYPE_CLOCK \ + FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_SAMPLE_META, 1) +#define PRFCNT_SAMPLE_META_TYPE_BLOCK \ + FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_SAMPLE_META, 2) + +/** + * struct prfcnt_item_header - Header for an item of the list. + * @item_type: Type of item. + * @item_version: Protocol version. + */ +struct prfcnt_item_header { + __u16 item_type; + __u16 item_version; +}; + +/** + * enum prfcnt_block_type - Type of performance counter block. + * @PRFCNT_BLOCK_TYPE_FE: Front End. + * @PRFCNT_BLOCK_TYPE_TILER: Tiler. + * @PRFCNT_BLOCK_TYPE_MEMORY: Memory System. + * @PRFCNT_BLOCK_TYPE_SHADER_CORE: Shader Core. + */ +enum prfcnt_block_type { + PRFCNT_BLOCK_TYPE_FE, + PRFCNT_BLOCK_TYPE_TILER, + PRFCNT_BLOCK_TYPE_MEMORY, + PRFCNT_BLOCK_TYPE_SHADER_CORE, + PRFCNT_BLOCK_TYPE_RESERVED = 255, +}; + +/** + * enum prfcnt_block_set - Type of performance counter block set. + * @PRFCNT_SET_PRIMARY: Primary. + * @PRFCNT_SET_SECONDARY: Secondary. + * @PRFCNT_SET_TERTIARY: Tertiary. + */ +enum prfcnt_set { + PRFCNT_SET_PRIMARY, + PRFCNT_SET_SECONDARY, + PRFCNT_SET_TERTIARY, + PRFCNT_SET_RESERVED = 255, +}; + +/** + * struct prfcnt_enum_block_counter - Performance counter block descriptor. + * @block_type: Type of performance counter block. + * @set: Which SET this represents: primary, secondary or tertiary. + * @num_instances: How many instances of this block type exist in the hardware. + * @num_values: How many entries in the values array there are for samples + * from this block. + * @pad: Padding bytes. + * @counter_mask: Bitmask that indicates the availability of counters in this + * block. + */ +struct prfcnt_enum_block_counter { + __u8 block_type; + __u8 set; + __u8 num_instances; + __u8 num_values; + __u8 pad[4]; + __u64 counter_mask[2]; +}; + +/** + * struct prfcnt_enum_request - Request descriptor. + * @request_item_type: Type of request. + * @pad: Padding bytes. + * @versions_mask: Bitmask of versions that support this request. + */ +struct prfcnt_enum_request { + __u16 request_item_type; + __u16 pad; + __u32 versions_mask; +}; + +/** + * struct prfcnt_enum_item - Performance counter enumeration item. + * @hdr: Header describing the type of item in the list. + * @block_counter: Performance counter block descriptor. + * @request: Request descriptor. + */ +struct prfcnt_enum_item { + struct prfcnt_item_header hdr; + union { + struct prfcnt_enum_block_counter block_counter; + struct prfcnt_enum_request request; + } u; +}; + +/** + * enum prfcnt_mode - Capture mode for counter sampling. + * @PRFCNT_MODE_MANUAL: Manual sampling mode. + * @PRFCNT_MODE_PERIODIC: Periodic sampling mode. + */ +enum prfcnt_mode { + PRFCNT_MODE_MANUAL, + PRFCNT_MODE_PERIODIC, + PRFCNT_MODE_RESERVED = 255, +}; + +/** + * struct prfcnt_request_mode - Mode request descriptor. + * @mode: Capture mode for the session, either manual or periodic. 
+ * @pad: Padding bytes. + * @period_us: Period in microseconds, for periodic mode. + */ +struct prfcnt_request_mode { + __u8 mode; + __u8 pad[7]; + union { + struct { + __u64 period_us; + } periodic; + } mode_config; +}; + +/** + * struct prfcnt_request_enable - Enable request descriptor. + * @block_type: Type of performance counter block. + * @set: Which SET to use: primary, secondary or tertiary. + * @pad: Padding bytes. + * @enable_mask: Bitmask that indicates which performance counters to enable. + * Unavailable counters will be ignored. + */ +struct prfcnt_request_enable { + __u8 block_type; + __u8 set; + __u8 pad[6]; + __u64 enable_mask[2]; +}; + +/** + * struct prfcnt_request_item - Performance counter request item. + * @hdr: Header describing the type of item in the list. + * @req_mode: Mode request descriptor. + * @req_enable: Enable request descriptor. + */ +struct prfcnt_request_item { + struct prfcnt_item_header hdr; + union { + struct prfcnt_request_mode req_mode; + struct prfcnt_request_enable req_enable; + } u; +}; + +/** + * enum prfcnt_request_type - Type of request descriptor. + * @PRFCNT_REQUEST_MODE: Specify the capture mode to be used for the session. + * @PRFCNT_REQUEST_ENABLE: Specify which performance counters to capture. + */ +enum prfcnt_request_type { + PRFCNT_REQUEST_MODE, + PRFCNT_REQUEST_ENABLE, +}; + +/** + * struct prfcnt_sample_metadata - Metadata for counter sample data. + * @timestamp_start: Earliest timestamp that values in this sample represent. + * @timestamp_end: Latest timestamp that values in this sample represent. + * @seq: Sequence number of this sample. Must match the value from + * GET_SAMPLE. + * @user_data: User data provided to HWC_CMD_START or HWC_CMD_SAMPLE_* + * @flags: Property flags. + */ +struct prfcnt_sample_metadata { + __u64 timestamp_start; + __u64 timestamp_end; + __u64 seq; + __u64 user_data; + __u32 flags; + __u32 pad; +}; + +/** + * struct prfcnt_clock_metadata - Metadata for clock cycles. + * @num_domains: Number of domains this metadata refers to. + * @cycles: Number of cycles elapsed in each counter domain between + * timestamp_start and timestamp_end. + */ +struct prfcnt_clock_metadata { + __u32 num_domains; + __u32 pad; + __u64 *cycles; +}; + +/* This block was powered on for at least some portion of the sample */ +#define BLOCK_STATE_ON (1 << 0) +/* This block was powered off for at least some portion of the sample */ +#define BLOCK_STATE_OFF (1 << 1) +/* This block was available to this VM for at least some portion of the sample */ +#define BLOCK_STATE_AVAILABLE (1 << 2) +/* This block was not available to this VM for at least some portion of the sample + * Note that no data is collected when the block is not available to the VM. + */ +#define BLOCK_STATE_UNAVAILABLE (1 << 3) +/* This block was operating in "normal" (non-protected) mode for at least some portion of the sample */ +#define BLOCK_STATE_NORMAL (1 << 4) +/* This block was operating in "protected" mode for at least some portion of the sample. + * Note that no data is collected when the block is in protected mode. + */ +#define BLOCK_STATE_PROTECTED (1 << 5) + +/** + * struct prfcnt_block_metadata - Metadata for counter block. + * @block_type: Type of performance counter block. + * @block_idx: Index of performance counter block. + * @set: Set of performance counter block. + * @block_state: Bits set indicate the states which the block is known + * to have operated in during this sample. 
+ * @values_offset: Offset from the start of the mmapped region, to the values + * for this block. The values themselves are an array of __u64. + */ +struct prfcnt_block_metadata { + __u8 block_type; + __u8 block_idx; + __u8 set; + __u8 pad_u8; + __u32 block_state; + __u32 values_offset; + __u32 pad_u32; +}; + +struct prfcnt_metadata { + struct prfcnt_item_header hdr; + union { + struct prfcnt_sample_metadata sample_md; + struct prfcnt_clock_metadata clock_md; + struct prfcnt_block_metadata block_md; + } u; +}; + #endif /* _UAPI_KBASE_HWCNT_READER_H_ */ diff --git a/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h b/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h index 29ff32a..8e1ed55 100644 --- a/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h @@ -186,12 +186,15 @@ struct kbase_ioctl_hwcnt_enable { __u32 mmu_l2_bm; }; +/* This IOCTL is deprecated as of R33, and will be removed in R35. */ #define KBASE_IOCTL_HWCNT_ENABLE \ _IOW(KBASE_IOCTL_TYPE, 9, struct kbase_ioctl_hwcnt_enable) +/* This IOCTL is deprecated as of R33, and will be removed in R35. */ #define KBASE_IOCTL_HWCNT_DUMP \ _IO(KBASE_IOCTL_TYPE, 10) +/* This IOCTL is deprecated as of R33, and will be removed in R35. */ #define KBASE_IOCTL_HWCNT_CLEAR \ _IO(KBASE_IOCTL_TYPE, 11) @@ -686,6 +689,55 @@ struct kbase_ioctl_set_limited_core_count { #define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) +/** + * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter + * information + * @info_item_size: Performance counter item size in bytes. + * @info_item_count: Performance counter item count in the info_list_ptr. + * @info_list_ptr: Performance counter item list pointer which points to a + * list with info_item_count of items. + * + * On success: returns info_item_size and info_item_count if info_list_ptr is + * NULL, returns performance counter information if info_list_ptr is not NULL. + * On error: returns a negative error code. + */ +struct kbase_ioctl_kinstr_prfcnt_enum_info { + __u32 info_item_size; + __u32 info_item_count; + __u64 info_list_ptr; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ + _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @in: input parameters. + * @in.request_item_count: Number of requests in the requests array. + * @in.request_item_size: Size in bytes of each request in the requests array. + * @in.requests_ptr: Pointer to the requests array. + * @out: output parameters. + * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for + * each sample. + * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap + * for reading performance counter samples. + * + * A fd is returned from the ioctl if successful, or a negative value on error. 
+ */ +union kbase_ioctl_kinstr_prfcnt_setup { + struct { + __u32 request_item_count; + __u32 request_item_size; + __u64 requests_ptr; + } in; + struct { + __u32 prfcnt_metadata_item_size; + __u32 prfcnt_mmap_size_bytes; + } out; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ + _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) /*************** * test ioctls * diff --git a/mali_kbase/Kbuild b/mali_kbase/Kbuild index c520597..e253f1c 100644 --- a/mali_kbase/Kbuild +++ b/mali_kbase/Kbuild @@ -48,6 +48,10 @@ ifeq ($(CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND),n) $(error CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND must be set in Kernel configuration) endif +ifeq ($(CONFIG_FW_LOADER), n) + $(error CONFIG_FW_LOADER must be set in Kernel configuration) +endif + ifeq ($(CONFIG_MALI_PRFCNT_SET_SELECT_VIA_DEBUG_FS), y) ifneq ($(CONFIG_DEBUG_FS), y) $(error CONFIG_MALI_PRFCNT_SET_SELECT_VIA_DEBUG_FS depends on CONFIG_DEBUG_FS to be set in Kernel configuration) @@ -67,7 +71,7 @@ endif # # Driver version string which is returned to userspace via an ioctl -MALI_RELEASE_NAME ?= '"r32p1-01eac0"' +MALI_RELEASE_NAME ?= '"r34p0-00dev1"' # Set up defaults if not defined by build system ifeq ($(CONFIG_MALI_DEBUG), y) MALI_UNIT_TEST = 1 @@ -91,6 +95,7 @@ else MALI_USE_CSF ?= 0 endif + ifneq ($(CONFIG_MALI_KUTF), n) MALI_KERNEL_TEST_API ?= 1 else @@ -156,9 +161,11 @@ mali_kbase-y := \ mali_kbase_gpuprops.o \ mali_kbase_pm.o \ mali_kbase_config.o \ + mali_kbase_kinstr_prfcnt.o \ mali_kbase_vinstr.o \ mali_kbase_hwcnt.o \ mali_kbase_hwcnt_gpu.o \ + mali_kbase_hwcnt_gpu_narrow.o \ mali_kbase_hwcnt_legacy.o \ mali_kbase_hwcnt_types.o \ mali_kbase_hwcnt_virtualizer.o \ @@ -180,7 +187,10 @@ mali_kbase-y := \ mali_kbase_regs_history_debugfs.o \ mali_kbase_dvfs_debugfs.o \ mali_power_gpu_frequency_trace.o \ - mali_kbase_trace_gpu_mem.o + mali_kbase_trace_gpu_mem.o \ + mali_kbase_pbha.o + +mali_kbase-$(CONFIG_DEBUG_FS) += mali_kbase_pbha_debugfs.o mali_kbase-$(CONFIG_MALI_CINSTR_GWT) += mali_kbase_gwt.o diff --git a/mali_kbase/Kconfig b/mali_kbase/Kconfig index 9f1a6e3..a563d35 100644 --- a/mali_kbase/Kconfig +++ b/mali_kbase/Kconfig @@ -24,6 +24,7 @@ menuconfig MALI_MIDGARD select DMA_SHARED_BUFFER select PM_DEVFREQ select DEVFREQ_THERMAL + select FW_LOADER default n help Enable this option to build support for a ARM Mali Midgard GPU. @@ -39,7 +40,7 @@ config MALI_PLATFORM_NAME default "devicetree" help Enter the name of the desired platform configuration directory to - include in the build. 'platform/$(MALI_PLATFORM_NAME)/Makefile' must + include in the build. 'platform/$(MALI_PLATFORM_NAME)/Kbuild' must exist. config MALI_REAL_HW @@ -365,7 +366,7 @@ config MALI_HW_ERRATA_1485982_USE_CLOCK_ALTERNATIVE endif config MALI_ARBITRATION - bool "Enable Virtualization reference code" + tristate "Enable Virtualization reference code" depends on MALI_MIDGARD default n help diff --git a/mali_kbase/Makefile b/mali_kbase/Makefile index 4384e80..099da33 100644 --- a/mali_kbase/Makefile +++ b/mali_kbase/Makefile @@ -55,7 +55,7 @@ ifeq ($(CONFIG_MALI_MIDGARD),m) CONFIG_MALI_DMA_BUF_LEGACY_COMPAT = n endif - ifeq ($(CONFIG_BSP_HAS_HYPERVISOR),y) + ifeq ($(CONFIG_XEN),y) ifneq ($(CONFIG_MALI_ARBITRATION), n) CONFIG_MALI_XEN ?= m endif diff --git a/mali_kbase/Mconfig b/mali_kbase/Mconfig index d71a113..1b66978 100644 --- a/mali_kbase/Mconfig +++ b/mali_kbase/Mconfig @@ -35,7 +35,7 @@ config MALI_PLATFORM_NAME default "devicetree" help Enter the name of the desired platform configuration directory to - include in the build. 
'platform/$(MALI_PLATFORM_NAME)/Makefile' must + include in the build. 'platform/$(MALI_PLATFORM_NAME)/Kbuild' must exist. When PLATFORM_CUSTOM is set, this needs to be set manually to diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_defs.h b/mali_kbase/arbiter/mali_kbase_arbiter_defs.h index 570a82a..65cfc7b 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_defs.h +++ b/mali_kbase/arbiter/mali_kbase_arbiter_defs.h @@ -20,7 +20,6 @@ */ /** - * @file * Mali structures define to support arbitration feature */ diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_interface.h b/mali_kbase/arbiter/mali_kbase_arbiter_interface.h index c0137f7..3c60878 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_interface.h +++ b/mali_kbase/arbiter/mali_kbase_arbiter_interface.h @@ -20,7 +20,6 @@ */ /** - * @file * Defines the Mali arbiter interface */ @@ -61,58 +60,47 @@ struct arbiter_if_dev; * the arbiter arbiter_if_vm_arb_ops callbacks below. * For example vm_arb_gpu_stopped() may be called as a side effect of * arb_vm_gpu_stop() being called here. + * + * @arb_vm_gpu_stop: Callback to ask VM to stop using GPU. + * dev: The arbif kernel module device. + * + * Informs KBase to stop using the GPU as soon as possible. + * Note: Once the driver is no longer using the GPU, a call + * to vm_arb_gpu_stopped is expected by the arbiter. + * @arb_vm_gpu_granted: Callback to indicate that GPU has been granted to VM. + * dev: The arbif kernel module device. + * + * Informs KBase that the GPU can now be used by the VM. + * @arb_vm_gpu_lost: Callback to indicate that VM has lost the GPU. + * dev: The arbif kernel module device. + * + * This is called if KBase takes too long to respond to the + * arbiter stop request. + * Once this is called, KBase will assume that access to the + * GPU has been lost and will fail all running jobs and + * reset its internal state. + * If successful, will respond with a vm_arb_gpu_stopped + * message. + * @arb_vm_max_config: Callback to send the max config info to the VM. + * dev: The arbif kernel module device. + * max_l2_slices: The maximum number of L2 slices. + * max_core_mask: The largest core mask. + * + * Informs KBase the maximum resources that can be + * allocated to the partition in use. + * @arb_vm_update_freq: Callback to notify that GPU clock frequency has been + * updated. + * dev: The arbif kernel module device. + * freq: GPU clock frequency value reported from arbiter + * + * Informs KBase that the GPU clock frequency has been updated. */ struct arbiter_if_arb_vm_ops { - /** - * arb_vm_gpu_stop() - Ask VM to stop using GPU - * @dev: The arbif kernel module device. - * - * Informs KBase to stop using the GPU as soon as possible. - * @Note: Once the driver is no longer using the GPU, a call to - * vm_arb_gpu_stopped is expected by the arbiter. - */ void (*arb_vm_gpu_stop)(struct device *dev); - - /** - * arb_vm_gpu_granted() - GPU has been granted to VM - * @dev: The arbif kernel module device. - * - * Informs KBase that the GPU can now be used by the VM. - */ void (*arb_vm_gpu_granted)(struct device *dev); - - /** - * arb_vm_gpu_lost() - VM has lost the GPU - * @dev: The arbif kernel module device. - * - * This is called if KBase takes too long to respond to the arbiter - * stop request. - * Once this is called, KBase will assume that access to the GPU - * has been lost and will fail all running jobs and reset its - * internal state. - * If successful, will respond with a vm_arb_gpu_stopped message. 
- */ void (*arb_vm_gpu_lost)(struct device *dev); - - /** - * arb_vm_max_config() - Send max config info to the VM - * @dev: The arbif kernel module device. - * @max_l2_slices: The maximum number of L2 slices. - * @max_core_mask: The largest core mask. - * - * Informs KBase the maximum resources that can be allocated to the - * partition in use. - */ void (*arb_vm_max_config)(struct device *dev, uint32_t max_l2_slices, uint32_t max_core_mask); - - /** - * arb_vm_update_freq() - GPU clock frequency has been updated - * @dev: The arbif kernel module device. - * @freq: GPU clock frequency value reported from arbiter - * - * Informs KBase that the GPU clock frequency has been updated. - */ void (*arb_vm_update_freq)(struct device *dev, uint32_t freq); }; @@ -124,60 +112,45 @@ struct arbiter_if_arb_vm_ops { * * Note that we must not make any synchronous calls back in to the VM * (via arbiter_if_arb_vm_ops above) in the context of these callbacks. + * + * @vm_arb_register_dev: Callback to register VM device driver callbacks. + * arbif_dev: The arbiter interface to register + * with for device callbacks + * dev: The device structure to supply in the callbacks. + * ops: The callbacks that the device driver supports + * (none are optional). + * + * Returns + * 0 - successful. + * -EINVAL - invalid argument. + * -EPROBE_DEFER - module dependencies are not yet + * available. + * @vm_arb_unregister_dev: Callback to unregister VM device driver callbacks. + * arbif_dev: The arbiter interface to unregistering + * from. + * @vm_arb_get_max_config: Callback to Request the max config from the Arbiter. + * arbif_dev: The arbiter interface to issue the + * request to. + * @vm_arb_gpu_request: Callback to ask the arbiter interface for GPU access. + * arbif_dev: The arbiter interface to issue the request + * to. + * @vm_arb_gpu_active: Callback to inform arbiter that driver has gone active. + * arbif_dev: The arbiter interface device to notify. + * @vm_arb_gpu_idle: Callback to inform the arbiter that driver has gone idle. + * arbif_dev: The arbiter interface device to notify. + * @vm_arb_gpu_stopped: Callback to inform arbiter that driver has stopped + * using the GPU + * arbif_dev: The arbiter interface device to notify. + * gpu_required: The GPU is still needed to do more work. */ struct arbiter_if_vm_arb_ops { - /** - * vm_arb_register_dev() - Register VM device driver callbacks. - * @arbif_dev: The arbiter interface we are registering device callbacks - * @dev: The device structure to supply in the callbacks. - * @ops: The callbacks that the device driver supports - * (none are optional). - * - * Return: - * * 0 - successful. - * * -EINVAL - invalid argument. - * * -EPROBE_DEFER - module dependencies are not yet available. - */ int (*vm_arb_register_dev)(struct arbiter_if_dev *arbif_dev, struct device *dev, struct arbiter_if_arb_vm_ops *ops); - - /** - * vm_arb_unregister_dev() - Unregister VM device driver callbacks. - * @arbif_dev: The arbiter interface we are unregistering from. - */ void (*vm_arb_unregister_dev)(struct arbiter_if_dev *arbif_dev); - - /** - * vm_arb_gpu_get_max_config() - Request the max config from the - * Arbiter. - * @arbif_dev: The arbiter interface we want to issue the request. - */ void (*vm_arb_get_max_config)(struct arbiter_if_dev *arbif_dev); - - /** - * vm_arb_gpu_request() - Ask the arbiter interface for GPU access. - * @arbif_dev: The arbiter interface we want to issue the request. 
- */ void (*vm_arb_gpu_request)(struct arbiter_if_dev *arbif_dev); - - /** - * vm_arb_gpu_active() - Inform arbiter that the driver has gone active - * @arbif_dev: The arbiter interface device. - */ void (*vm_arb_gpu_active)(struct arbiter_if_dev *arbif_dev); - - /** - * vm_arb_gpu_idle() - Inform the arbiter that the driver has gone idle - * @arbif_dev: The arbiter interface device. - */ void (*vm_arb_gpu_idle)(struct arbiter_if_dev *arbif_dev); - - /** - * vm_arb_gpu_stopped() - Inform the arbiter that the driver has stopped - * using the GPU - * @arbif_dev: The arbiter interface device. - * @gpu_required: The GPU is still needed to do more work. - */ void (*vm_arb_gpu_stopped)(struct arbiter_if_dev *arbif_dev, u8 gpu_required); }; diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_pm.c b/mali_kbase/arbiter/mali_kbase_arbiter_pm.c index 5c75686..62ff4fd 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_pm.c +++ b/mali_kbase/arbiter/mali_kbase_arbiter_pm.c @@ -20,15 +20,12 @@ */ /** - * @file * Mali arbiter power manager state machine and APIs */ #include <mali_kbase.h> #include <mali_kbase_pm.h> -#include <mali_kbase_hwaccess_jm.h> #include <backend/gpu/mali_kbase_irq_internal.h> -#include <mali_kbase_hwcnt_context.h> #include <backend/gpu/mali_kbase_pm_internal.h> #include <tl/mali_kbase_tracepoints.h> #include <mali_kbase_gpuprops.h> @@ -319,6 +316,7 @@ int kbase_arbiter_pm_early_init(struct kbase_device *kbdev) if (kbdev->arb.arb_if) { kbase_arbif_gpu_request(kbdev); dev_dbg(kbdev->dev, "Waiting for initial GPU assignment...\n"); + err = wait_event_timeout(arb_vm_state->vm_state_wait, arb_vm_state->vm_state == KBASE_VM_STATE_INITIALIZING_WITH_GPU, @@ -328,8 +326,9 @@ int kbase_arbiter_pm_early_init(struct kbase_device *kbdev) dev_dbg(kbdev->dev, "Kbase probe Deferred after waiting %d ms to receive GPU_GRANT\n", gpu_req_timeout); - err = -EPROBE_DEFER; - goto arbif_eprobe_defer; + + err = -ENODEV; + goto arbif_timeout; } dev_dbg(kbdev->dev, @@ -337,9 +336,10 @@ int kbase_arbiter_pm_early_init(struct kbase_device *kbdev) } return 0; -arbif_eprobe_defer: +arbif_timeout: kbase_arbiter_pm_early_term(kbdev); return err; + arbif_init_fail: destroy_workqueue(arb_vm_state->vm_arb_wq); kfree(arb_vm_state); @@ -619,6 +619,18 @@ static void kbase_arbiter_pm_vm_gpu_stop(struct kbase_device *kbdev) case KBASE_VM_STATE_SUSPEND_PENDING: /* Suspend finishes with a stop so nothing else to do */ break; + case KBASE_VM_STATE_INITIALIZING: + case KBASE_VM_STATE_STOPPED_GPU_REQUESTED: + /* + * Case stop() is received when in a GPU REQUESTED state, it + * means that the granted() was missed so the GPU needs to be + * requested again. + */ + dev_dbg(kbdev->dev, + "GPU stop while already stopped with GPU requested"); + kbase_arbif_gpu_stopped(kbdev, true); + start_request_timer(kbdev); + break; default: dev_warn(kbdev->dev, "GPU_STOP when not expected - state %s\n", kbase_arbiter_pm_vm_state_str(arb_vm_state->vm_state)); @@ -656,9 +668,20 @@ static void kbase_gpu_lost(struct kbase_device *kbdev) break; case KBASE_VM_STATE_SUSPENDED: case KBASE_VM_STATE_STOPPED: - case KBASE_VM_STATE_STOPPED_GPU_REQUESTED: dev_dbg(kbdev->dev, "GPU lost while already stopped"); break; + case KBASE_VM_STATE_INITIALIZING: + case KBASE_VM_STATE_STOPPED_GPU_REQUESTED: + /* + * Case lost() is received when in a GPU REQUESTED state, it + * means that the granted() and stop() were missed so the GPU + * needs to be requested again. Very unlikely to happen. 
+ */ + dev_dbg(kbdev->dev, + "GPU lost while already stopped with GPU requested"); + kbase_arbif_gpu_request(kbdev); + start_request_timer(kbdev); + break; case KBASE_VM_STATE_SUSPEND_WAIT_FOR_GRANT: dev_dbg(kbdev->dev, "GPU lost while waiting to suspend"); kbase_arbiter_pm_vm_set_state(kbdev, KBASE_VM_STATE_SUSPENDED); @@ -1020,8 +1043,8 @@ int kbase_arbiter_pm_ctx_active_handle_suspend(struct kbase_device *kbdev, /** * kbase_arbiter_pm_update_gpu_freq() - Updates GPU clock frequency received * from arbiter. - * @arb_freq - Pointer to struchture holding GPU clock frequenecy data - * @freq - New frequency value in KHz + * @arb_freq: Pointer to struchture holding GPU clock frequenecy data + * @freq: New frequency value in KHz */ void kbase_arbiter_pm_update_gpu_freq(struct kbase_arbiter_freq *arb_freq, uint32_t freq) @@ -1045,8 +1068,8 @@ void kbase_arbiter_pm_update_gpu_freq(struct kbase_arbiter_freq *arb_freq, /** * enumerate_arb_gpu_clk() - Enumerate a GPU clock on the given index - * @kbdev - kbase_device pointer - * @index - GPU clock index + * @kbdev: kbase_device pointer + * @index: GPU clock index * * Returns pointer to structure holding GPU clock frequency data reported from * arbiter, only index 0 is valid. @@ -1061,8 +1084,8 @@ static void *enumerate_arb_gpu_clk(struct kbase_device *kbdev, /** * get_arb_gpu_clk_rate() - Get the current rate of GPU clock frequency value - * @kbdev - kbase_device pointer - * @index - GPU clock index + * @kbdev: kbase_device pointer + * @index: GPU clock index * * Returns the GPU clock frequency value saved when gpu is granted from arbiter */ @@ -1082,9 +1105,9 @@ static unsigned long get_arb_gpu_clk_rate(struct kbase_device *kbdev, /** * arb_gpu_clk_notifier_register() - Register a clock rate change notifier. - * @kbdev - kbase_device pointer - * @gpu_clk_handle - Handle unique to the enumerated GPU clock - * @nb - notifier block containing the callback function pointer + * @kbdev: kbase_device pointer + * @gpu_clk_handle: Handle unique to the enumerated GPU clock + * @nb: notifier block containing the callback function pointer * * Returns 0 on success, negative error code otherwise. * @@ -1108,9 +1131,9 @@ static int arb_gpu_clk_notifier_register(struct kbase_device *kbdev, /** * gpu_clk_notifier_unregister() - Unregister clock rate change notifier - * @kbdev - kbase_device pointer - * @gpu_clk_handle - Handle unique to the enumerated GPU clock - * @nb - notifier block containing the callback function pointer + * @kbdev: kbase_device pointer + * @gpu_clk_handle: Handle unique to the enumerated GPU clock + * @nb: notifier block containing the callback function pointer * * This function pointer is used to unregister a callback function that * was previously registered to get notified of a frequency change of the diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_pm.h b/mali_kbase/arbiter/mali_kbase_arbiter_pm.h index 1f570bb..091b431 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_pm.h +++ b/mali_kbase/arbiter/mali_kbase_arbiter_pm.h @@ -20,7 +20,6 @@ */ /** - * @file * Mali arbiter power manager state machine and APIs */ @@ -108,6 +107,7 @@ int kbase_arbiter_pm_install_interrupts(struct kbase_device *kbdev); /** * kbase_arbiter_pm_vm_event() - Dispatch VM event to the state machine * @kbdev: The kbase device structure for the device (must be a valid pointer) + * @event: The event to dispatch * * The state machine function. 
Receives events and transitions states * according the event received and the current state diff --git a/mali_kbase/arbitration/Kconfig b/mali_kbase/arbitration/Kconfig index 95125f9..b4d6202 100644 --- a/mali_kbase/arbitration/Kconfig +++ b/mali_kbase/arbitration/Kconfig @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note OR MIT # # (C) COPYRIGHT 2012-2021 ARM Limited. All rights reserved. # @@ -19,7 +19,7 @@ # config MALI_XEN - bool "Enable Xen Interface reference code" + tristate "Enable Xen Interface reference code" depends on MALI_ARBITRATION && XEN default n help @@ -27,13 +27,5 @@ config MALI_XEN virtualization setup for Mali If unsure, say N. -config MALI_KUTF_ARBITRATION_TEST - bool "Enable Arbitration Test reference code" - depends on MALI_KUTF && MALI_ARBITRATION - default n - help - Enables the build of test modules used in the reference - virtualization setup for Mali - If unsure, say N. source "drivers/gpu/arm/midgard/arbitration/ptm/Kconfig" diff --git a/mali_kbase/arbitration/ptm/Kconfig b/mali_kbase/arbitration/ptm/Kconfig index e11e674..074ebd5 100644 --- a/mali_kbase/arbitration/ptm/Kconfig +++ b/mali_kbase/arbitration/ptm/Kconfig @@ -19,7 +19,7 @@ # config MALI_PARTITION_MANAGER - bool "Enable compilation of partition manager modules" + tristate "Enable compilation of partition manager modules" depends on MALI_ARBITRATION default n help diff --git a/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.c b/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.c index e542ccf..9587c70 100644 --- a/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.c @@ -22,12 +22,22 @@ #include "backend/gpu/mali_kbase_cache_policy_backend.h" #include <device/mali_kbase_device.h> + void kbase_cache_set_coherency_mode(struct kbase_device *kbdev, u32 mode) { kbdev->current_gpu_coherency_mode = mode; - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_COHERENCY_REG)) kbase_reg_write(kbdev, COHERENCY_ENABLE, mode); } +u32 kbase_cache_get_coherency_features(struct kbase_device *kbdev) +{ + u32 coherency_features; + + coherency_features = kbase_reg_read( + kbdev, GPU_CONTROL_REG(COHERENCY_FEATURES)); + + return coherency_features; +} + diff --git a/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h b/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h index 278125a..13c79d6 100644 --- a/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h +++ b/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h @@ -26,12 +26,21 @@ #include <uapi/gpu/arm/midgard/mali_base_kernel.h> /** - * kbase_cache_set_coherency_mode() - Sets the system coherency mode - * in the GPU. - * @kbdev: Device pointer - * @mode: Coherency mode. COHERENCY_ACE/ACE_LITE - */ + * kbase_cache_set_coherency_mode() - Sets the system coherency mode + * in the GPU. + * @kbdev: Device pointer + * @mode: Coherency mode. COHERENCY_ACE/ACE_LITE + */ void kbase_cache_set_coherency_mode(struct kbase_device *kbdev, u32 mode); -#endif /* _KBASE_CACHE_POLICY_H_ */ +/** + * kbase_cache_get_coherency_features() - Get the coherency features + * in the GPU. 
+ * @kbdev: Device pointer + * + * Return: Register value to be returned + */ +u32 kbase_cache_get_coherency_features(struct kbase_device *kbdev); + +#endif /* _KBASE_CACHE_POLICY_BACKEND_H_ */ diff --git a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c index 6ad0f58..d6b9750 100644 --- a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c +++ b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c @@ -26,6 +26,7 @@ #include <mali_kbase.h> #include <mali_kbase_config_defaults.h> #include <linux/clk.h> +#include <linux/pm_opp.h> #include <asm/div64.h> #include "backend/gpu/mali_kbase_clk_rate_trace_mgr.h" @@ -46,7 +47,7 @@ * Return: Pointer to clk trace ops if supported or NULL. */ static struct kbase_clk_rate_trace_op_conf * -get_clk_rate_trace_callbacks(struct kbase_device *kbdev __maybe_unused) +get_clk_rate_trace_callbacks(__maybe_unused struct kbase_device *kbdev) { /* base case */ struct kbase_clk_rate_trace_op_conf *callbacks = @@ -71,6 +72,49 @@ get_clk_rate_trace_callbacks(struct kbase_device *kbdev __maybe_unused) return callbacks; } +int kbase_lowest_gpu_freq_init(struct kbase_device *kbdev) +{ + /* Uses default reference frequency defined in below macro */ + u64 lowest_freq_khz = DEFAULT_REF_TIMEOUT_FREQ_KHZ; + + /* Only check lowest frequency in cases when OPPs are used and + * present in the device tree. + */ +#ifdef CONFIG_PM_OPP + struct dev_pm_opp *opp_ptr; + unsigned long found_freq = 0; + + /* find lowest frequency OPP */ + opp_ptr = dev_pm_opp_find_freq_ceil(kbdev->dev, &found_freq); + if (IS_ERR(opp_ptr)) { + dev_err(kbdev->dev, + "No OPPs found in device tree! Scaling timeouts using %llu kHz", + (unsigned long long)lowest_freq_khz); + } else { +#if KERNEL_VERSION(4, 11, 0) <= LINUX_VERSION_CODE + dev_pm_opp_put(opp_ptr); /* decrease OPP refcount */ +#endif + /* convert found frequency to KHz */ + found_freq /= 1000; + + /* If lowest frequency in OPP table is still higher + * than the reference, then keep the reference frequency + * as the one to use for scaling . + */ + if (found_freq < lowest_freq_khz) + lowest_freq_khz = found_freq; + } +#else + dev_err(kbdev->dev, + "No operating-points-v2 node or operating-points property in DT"); +#endif + + kbdev->lowest_gpu_freq_khz = lowest_freq_khz; + dev_dbg(kbdev->dev, "Lowest frequency identified is %llu kHz", + kbdev->lowest_gpu_freq_khz); + return 0; +} + static int gpu_clk_rate_change_notifier(struct notifier_block *nb, unsigned long event, void *data) { diff --git a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h index f7ec9d1..df30b63 100644 --- a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h +++ b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h @@ -61,6 +61,21 @@ struct kbase_clk_data { int kbase_clk_rate_trace_manager_init(struct kbase_device *kbdev); /** + * kbase_init_lowest_gpu_freq() - Find the lowest frequency that the GPU can + * run as using the device tree, and save this + * within kbdev. + * + * This function could be called from kbase_clk_rate_trace_manager_init, + * but is left separate as it can be called as soon as + * dev_pm_opp_of_add_table() has been called to initialize the OPP table. + * + * @kbdev: Pointer to kbase device. + * + * Return: 0 in any case. + */ +int kbase_lowest_gpu_freq_init(struct kbase_device *kbdev); + +/** * kbase_clk_rate_trace_manager_term - Terminate GPU clock rate trace manager. 
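Aside: per the kernel-doc just added for kbase_lowest_gpu_freq_init(), the helper only needs the OPP table to be populated, so a probe path could call it straight after dev_pm_opp_of_add_table(). The snippet below is a hedged usage sketch, not kbase code: the probe helper name is hypothetical and error unwinding is abbreviated.

#include <mali_kbase.h>
#include <linux/pm_opp.h>

static int example_probe_freq_setup(struct kbase_device *kbdev)
{
	int err;

	/* Parse the operating-points / operating-points-v2 node from DT. */
	err = dev_pm_opp_of_add_table(kbdev->dev);
	if (err)
		return err;

	/* OPP table is now available, so the lowest frequency can be derived. */
	return kbase_lowest_gpu_freq_init(kbdev);
}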
* * @kbdev: Device pointer diff --git a/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c b/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c index 11088db..7b04286 100644 --- a/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c @@ -26,6 +26,7 @@ #include <mali_kbase.h> #include <device/mali_kbase_device.h> #include <backend/gpu/mali_kbase_pm_internal.h> +#include <backend/gpu/mali_kbase_cache_policy_backend.h> #include <mali_kbase_hwaccess_gpuprops.h> int kbase_backend_gpuprops_get(struct kbase_device *kbdev, @@ -146,7 +147,7 @@ int kbase_backend_gpuprops_get_curr_config(struct kbase_device *kbdev, curr_config_regdump->l2_present_hi = kbase_reg_read(kbdev, GPU_CONTROL_REG(L2_PRESENT_HI)); - if (WARN_ON(kbase_is_gpu_removed(kbdev))) + if (kbase_is_gpu_removed(kbdev)) return -EIO; return 0; @@ -156,30 +157,22 @@ int kbase_backend_gpuprops_get_curr_config(struct kbase_device *kbdev, int kbase_backend_gpuprops_get_features(struct kbase_device *kbdev, struct kbase_gpuprops_regdump *regdump) { - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_COHERENCY_REG)) { - u32 coherency_features; + u32 coherency_features; + int error = 0; - /* Ensure we can access the GPU registers */ - kbase_pm_register_access_enable(kbdev); + /* Ensure we can access the GPU registers */ + kbase_pm_register_access_enable(kbdev); - coherency_features = kbase_reg_read(kbdev, - GPU_CONTROL_REG(COHERENCY_FEATURES)); + coherency_features = kbase_cache_get_coherency_features(kbdev); - if (kbase_is_gpu_removed(kbdev)) - return -EIO; + if (kbase_is_gpu_removed(kbdev)) + error = -EIO; - regdump->coherency_features = coherency_features; + regdump->coherency_features = coherency_features; - /* We're done accessing the GPU registers for now. 
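Aside: the reworked kbase_backend_gpuprops_get_features() above captures the error instead of returning early, so the register-access disable always pairs with the earlier enable even when the Arbiter has removed the GPU mid-read. A minimal sketch of that bracketing pattern, using the real kbase helper names but an illustrative wrapper function:

#include <mali_kbase.h>
#include <backend/gpu/mali_kbase_cache_policy_backend.h>

static int read_coherency_features_safely(struct kbase_device *kbdev, u32 *out)
{
	int err = 0;

	kbase_pm_register_access_enable(kbdev);

	*out = kbase_cache_get_coherency_features(kbdev);
	if (kbase_is_gpu_removed(kbdev))
		err = -EIO;            /* GPU pulled by the Arbiter mid-read */

	/* Always executed, regardless of the error above. */
	kbase_pm_register_access_disable(kbdev);
	return err;
}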
*/ - kbase_pm_register_access_disable(kbdev); - } else { - /* Pre COHERENCY_FEATURES we only supported ACE_LITE */ - regdump->coherency_features = - COHERENCY_FEATURE_BIT(COHERENCY_NONE) | - COHERENCY_FEATURE_BIT(COHERENCY_ACE_LITE); - } + kbase_pm_register_access_disable(kbdev); - return 0; + return error; } int kbase_backend_gpuprops_get_l2_features(struct kbase_device *kbdev, @@ -190,13 +183,24 @@ int kbase_backend_gpuprops_get_l2_features(struct kbase_device *kbdev, GPU_CONTROL_REG(L2_FEATURES)); u32 l2_config = kbase_reg_read(kbdev, GPU_CONTROL_REG(L2_CONFIG)); + u32 asn_hash[ASN_HASH_COUNT] = { + 0, + }; + int i; + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_ASN_HASH)) { + for (i = 0; i < ASN_HASH_COUNT; i++) + asn_hash[i] = kbase_reg_read( + kbdev, GPU_CONTROL_REG(ASN_HASH(i))); + } if (kbase_is_gpu_removed(kbdev)) return -EIO; regdump->l2_features = l2_features; regdump->l2_config = l2_config; + for (i = 0; i < ASN_HASH_COUNT; i++) + regdump->l2_asn_hash[i] = asn_hash[i]; } return 0; diff --git a/mali_kbase/backend/gpu/mali_kbase_instr_backend.c b/mali_kbase/backend/gpu/mali_kbase_instr_backend.c index d7edf30..90cc537 100644 --- a/mali_kbase/backend/gpu/mali_kbase_instr_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_instr_backend.c @@ -53,6 +53,12 @@ int kbase_instr_hwcnt_enable_internal(struct kbase_device *kbdev, goto out_err; } + if (kbase_is_gpu_removed(kbdev)) { + /* GPU has been removed by Arbiter */ + spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); + goto out_err; + } + /* Enable interrupt */ irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), irq_mask | @@ -152,6 +158,14 @@ int kbase_instr_hwcnt_disable_internal(struct kbase_context *kctx) kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_DISABLED; kbdev->hwcnt.backend.triggered = 0; + if (kbase_is_gpu_removed(kbdev)) { + /* GPU has been removed by Arbiter */ + spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); + err = 0; + goto out; + } + /* Disable interrupt */ irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), @@ -195,6 +209,11 @@ int kbase_instr_hwcnt_request_dump(struct kbase_context *kctx) goto unlock; } + if (kbase_is_gpu_removed(kbdev)) { + /* GPU has been removed by Arbiter */ + goto unlock; + } + kbdev->hwcnt.backend.triggered = 0; /* Mark that we're dumping - the PF handler can signal that we faulted @@ -310,6 +329,11 @@ int kbase_instr_hwcnt_clear(struct kbase_context *kctx) KBASE_INSTR_STATE_IDLE) goto out; + if (kbase_is_gpu_removed(kbdev)) { + /* GPU has been removed by Arbiter */ + goto out; + } + /* Clear the counters */ KBASE_KTRACE_ADD(kbdev, CORE_GPU_PRFCNT_CLEAR, NULL, 0); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_hw.c b/mali_kbase/backend/gpu/mali_kbase_jm_hw.c index ae0377f..001efd9 100644 --- a/mali_kbase/backend/gpu/mali_kbase_jm_hw.c +++ b/mali_kbase/backend/gpu/mali_kbase_jm_hw.c @@ -48,18 +48,13 @@ static u64 kbase_job_write_affinity(struct kbase_device *kbdev, int js, const u64 limited_core_mask) { u64 affinity; + bool skip_affinity_check = false; if ((core_req & (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T)) == BASE_JD_REQ_T) { - /* Tiler-only atom */ - /* If the hardware supports XAFFINITY then we'll only enable - * the tiler (which is the default so this is a no-op), - * otherwise enable shader core 0. 
- */ - if (!kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_XAFFINITY)) - affinity = 1; - else - affinity = 0; + /* Tiler-only atom, affinity value can be programed as 0 */ + affinity = 0; + skip_affinity_check = true; } else if ((core_req & (BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP))) { unsigned int num_core_groups = kbdev->gpu_props.num_core_groups; @@ -89,7 +84,7 @@ static u64 kbase_job_write_affinity(struct kbase_device *kbdev, affinity = kbasep_apply_limited_core_mask(kbdev, affinity, limited_core_mask); } - if (unlikely(!affinity)) { + if (unlikely(!affinity && !skip_affinity_check)) { #ifdef CONFIG_MALI_DEBUG u64 shaders_ready = kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_SHADER); @@ -251,18 +246,13 @@ void kbase_job_hw_submit(struct kbase_device *kbdev, (katom->core_req & BASE_JD_REQ_END_RENDERPASS)) cfg |= JS_CONFIG_DISABLE_DESCRIPTOR_WR_BK; - if (kbase_hw_has_feature(kbdev, - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION)) { - if (!kbdev->hwaccess.backend.slot_rb[js].job_chain_flag) { - cfg |= JS_CONFIG_JOB_CHAIN_FLAG; - katom->atom_flags |= KBASE_KATOM_FLAGS_JOBCHAIN; - kbdev->hwaccess.backend.slot_rb[js].job_chain_flag = - true; - } else { - katom->atom_flags &= ~KBASE_KATOM_FLAGS_JOBCHAIN; - kbdev->hwaccess.backend.slot_rb[js].job_chain_flag = - false; - } + if (!kbdev->hwaccess.backend.slot_rb[js].job_chain_flag) { + cfg |= JS_CONFIG_JOB_CHAIN_FLAG; + katom->atom_flags |= KBASE_KATOM_FLAGS_JOBCHAIN; + kbdev->hwaccess.backend.slot_rb[js].job_chain_flag = true; + } else { + katom->atom_flags &= ~KBASE_KATOM_FLAGS_JOBCHAIN; + kbdev->hwaccess.backend.slot_rb[js].job_chain_flag = false; } kbase_reg_write(kbdev, JOB_SLOT_REG(js, JS_CONFIG_NEXT), cfg); @@ -621,25 +611,17 @@ void kbasep_job_slot_soft_or_hard_stop_do_action(struct kbase_device *kbdev, /* Mark the point where we issue the soft-stop command */ KBASE_TLSTREAM_TL_EVENT_ATOM_SOFTSTOP_ISSUE(kbdev, target_katom); - if (kbase_hw_has_feature( - kbdev, - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION)) { - action = (target_katom->atom_flags & - KBASE_KATOM_FLAGS_JOBCHAIN) ? - JS_COMMAND_SOFT_STOP_1 : - JS_COMMAND_SOFT_STOP_0; - } + action = (target_katom->atom_flags & + KBASE_KATOM_FLAGS_JOBCHAIN) ? + JS_COMMAND_SOFT_STOP_1 : + JS_COMMAND_SOFT_STOP_0; } else if (action == JS_COMMAND_HARD_STOP) { target_katom->atom_flags |= KBASE_KATOM_FLAG_BEEN_HARD_STOPPED; - if (kbase_hw_has_feature( - kbdev, - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION)) { - action = (target_katom->atom_flags & - KBASE_KATOM_FLAGS_JOBCHAIN) ? - JS_COMMAND_HARD_STOP_1 : - JS_COMMAND_HARD_STOP_0; - } + action = (target_katom->atom_flags & + KBASE_KATOM_FLAGS_JOBCHAIN) ? + JS_COMMAND_HARD_STOP_1 : + JS_COMMAND_HARD_STOP_0; } kbase_reg_write(kbdev, JOB_SLOT_REG(js, JS_COMMAND), action); @@ -725,40 +707,11 @@ void kbase_backend_jm_kill_running_jobs_from_kctx(struct kbase_context *kctx) kbase_job_slot_hardstop(kctx, i, NULL); } -/** - * kbase_is_existing_atom_submitted_later_than_ready - * @ready: sequence number of the ready atom - * @existing: sequence number of the existing atom - * - * Returns true if the existing atom has been submitted later than the - * ready atom. It is used to understand if an atom that is ready has been - * submitted earlier than the currently running atom, so that the currently - * running atom should be preempted to allow the ready atom to run. - */ -static inline bool kbase_is_existing_atom_submitted_later_than_ready(u64 ready, u64 existing) -{ - /* No seq_nr set? 
*/ - if (!ready || !existing) - return false; - - /* Efficiently handle the unlikely case of wrapping. - * The following code assumes that the delta between the sequence number - * of the two atoms is less than INT64_MAX. - * In the extremely unlikely case where the delta is higher, the comparison - * defaults for no preemption. - * The code also assumes that the conversion from unsigned to signed types - * works because the signed integers are 2's complement. - */ - return (s64)(ready - existing) < 0; -} - void kbase_job_slot_ctx_priority_check_locked(struct kbase_context *kctx, struct kbase_jd_atom *target_katom) { struct kbase_device *kbdev; - int js = target_katom->slot_nr; - int priority = target_katom->sched_priority; - int seq_nr = target_katom->seq_nr; + int target_js = target_katom->slot_nr; int i; bool stop_sent = false; @@ -768,26 +721,21 @@ void kbase_job_slot_ctx_priority_check_locked(struct kbase_context *kctx, lockdep_assert_held(&kbdev->hwaccess_lock); - for (i = 0; i < kbase_backend_nr_atoms_on_slot(kbdev, js); i++) { - struct kbase_jd_atom *katom; - - katom = kbase_gpu_inspect(kbdev, js, i); - if (!katom) - continue; + for (i = 0; i < kbase_backend_nr_atoms_on_slot(kbdev, target_js); i++) { + struct kbase_jd_atom *slot_katom; - if ((kbdev->js_ctx_scheduling_mode == - KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE) && - (katom->kctx != kctx)) + slot_katom = kbase_gpu_inspect(kbdev, target_js, i); + if (!slot_katom) continue; - if ((katom->sched_priority > priority) || - (katom->kctx == kctx && kbase_is_existing_atom_submitted_later_than_ready(seq_nr, katom->seq_nr))) { + if (kbase_js_atom_runs_before(kbdev, target_katom, slot_katom, + KBASE_ATOM_ORDERING_FLAG_SEQNR)) { if (!stop_sent) KBASE_TLSTREAM_TL_ATTRIB_ATOM_PRIORITIZED( kbdev, target_katom); - kbase_job_slot_softstop(kbdev, js, katom); + kbase_job_slot_softstop(kbdev, target_js, slot_katom); stop_sent = true; } } diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c index b475d79..1906286 100644 --- a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c +++ b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c @@ -387,6 +387,9 @@ static void kbase_gpu_mark_atom_for_return(struct kbase_device *kbdev, { lockdep_assert_held(&kbdev->hwaccess_lock); + KBASE_KTRACE_ADD_JM_SLOT_INFO(kbdev, JM_MARK_FOR_RETURN_TO_JS, + katom->kctx, katom, katom->jc, + katom->slot_nr, katom->event_code); kbase_gpu_release_atom(kbdev, katom, NULL); katom->gpu_rb_state = KBASE_ATOM_GPU_RB_RETURN_TO_JS; } @@ -564,7 +567,7 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, kbdev->protected_mode_transition = true; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_ENTER_PROTECTED_HWCNT: /* See if we can get away with disabling hwcnt atomically */ kbdev->protected_mode_hwcnt_desired = false; @@ -607,7 +610,7 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, kbase_pm_update_cores_state_nolock(kbdev); /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_ENTER_PROTECTED_IDLE_L2: /* Avoid unnecessary waiting on non-ACE platforms. 
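Aside: the kbase_is_existing_atom_submitted_later_than_ready() helper removed from mali_kbase_jm_hw.c above (its seq_nr ordering now goes through kbase_js_atom_runs_before() with KBASE_ATOM_ORDERING_FLAG_SEQNR) relies on the standard signed-difference trick for comparing wrapping sequence numbers. A stand-alone userspace demonstration of that comparison:

#include <stdint.h>
#include <stdio.h>

/* True if 'existing' was submitted later than 'ready'; 0 means "no seq_nr set". */
static int submitted_later(uint64_t ready, uint64_t existing)
{
	if (!ready || !existing)
		return 0;
	/* Valid as long as the two values are less than 2^63 apart. */
	return (int64_t)(ready - existing) < 0;
}

int main(void)
{
	/* Ordinary case: existing (seq 10) was submitted after ready (seq 5). */
	printf("%d\n", submitted_later(5, 10));               /* prints 1 */
	/* Wrap-around case: ready wrapped to 2, existing is near UINT64_MAX. */
	printf("%d\n", submitted_later(2, UINT64_MAX - 3));   /* prints 0 */
	return 0;
}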
*/ if (kbdev->system_coherency == COHERENCY_ACE) { @@ -638,7 +641,7 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, KBASE_ATOM_ENTER_PROTECTED_SET_COHERENCY; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_ENTER_PROTECTED_SET_COHERENCY: /* * When entering into protected mode, we must ensure that the @@ -671,7 +674,7 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, return -EAGAIN; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_ENTER_PROTECTED_FINISHED: if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_TGOX_R1_1234)) { /* @@ -742,7 +745,7 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev, kbase_pm_update_cores_state_nolock(kbdev); /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_EXIT_PROTECTED_IDLE_L2: if (kbdev->pm.backend.l2_state != KBASE_L2_OFF) { /* @@ -755,7 +758,7 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev, KBASE_ATOM_EXIT_PROTECTED_RESET; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_EXIT_PROTECTED_RESET: /* Issue the reset to the GPU */ err = kbase_gpu_protected_mode_reset(kbdev); @@ -797,7 +800,7 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev, KBASE_ATOM_EXIT_PROTECTED_RESET_WAIT; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_EXIT_PROTECTED_RESET_WAIT: /* A GPU reset is issued when exiting protected mode. Once the * reset is done all atoms' state will also be reset. For this @@ -854,7 +857,7 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) KBASE_ATOM_GPU_RB_WAITING_PROTECTED_MODE_PREV; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_GPU_RB_WAITING_PROTECTED_MODE_PREV: if (kbase_gpu_check_secure_atoms(kbdev, !kbase_jd_katom_is_protected( @@ -874,7 +877,7 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) KBASE_ATOM_GPU_RB_WAITING_PROTECTED_MODE_TRANSITION; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_GPU_RB_WAITING_PROTECTED_MODE_TRANSITION: /* @@ -909,7 +912,7 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) KBASE_ATOM_GPU_RB_WAITING_FOR_CORE_AVAILABLE; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_GPU_RB_WAITING_FOR_CORE_AVAILABLE: if (katom[idx]->will_fail_event_code) { kbase_gpu_mark_atom_for_return(kbdev, @@ -936,6 +939,11 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) if (katom[idx]->event_code == BASE_JD_EVENT_PM_EVENT) { + KBASE_KTRACE_ADD_JM_SLOT_INFO( + kbdev, JM_MARK_FOR_RETURN_TO_JS, + katom[idx]->kctx, katom[idx], + katom[idx]->jc, js, + katom[idx]->event_code); katom[idx]->gpu_rb_state = KBASE_ATOM_GPU_RB_RETURN_TO_JS; break; @@ -948,7 +956,7 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) KBASE_ATOM_GPU_RB_READY; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_GPU_RB_READY: if (idx == 1) { @@ -994,7 +1002,7 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) KBASE_ATOM_GPU_RB_SUBMITTED; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_GPU_RB_SUBMITTED: /* Inform power management at start/finish of @@ -1037,9 +1045,55 @@ void kbase_backend_run_atom(struct kbase_device *kbdev, kbase_backend_slot_update(kbdev); } -#define HAS_DEP(katom) (katom->pre_dep || 
katom->atom_flags & \ - (KBASE_KATOM_FLAG_X_DEP_BLOCKED | KBASE_KATOM_FLAG_FAIL_BLOCKER)) +/** + * kbase_rb_atom_might_depend - determine if one atom in the slot ringbuffer + * might depend on another from the same kctx + * @katom_a: dependee atom + * @katom_b: atom to query + * + * This can be used on atoms that belong to different slot ringbuffers + * + * Return: true if @katom_b might depend on @katom_a, false if it cannot depend. + */ +static inline bool +kbase_rb_atom_might_depend(const struct kbase_jd_atom *katom_a, + const struct kbase_jd_atom *katom_b) +{ + if (katom_a->kctx != katom_b->kctx) + return false; + return (katom_b->pre_dep || + (katom_b->atom_flags & (KBASE_KATOM_FLAG_X_DEP_BLOCKED | + KBASE_KATOM_FLAG_FAIL_BLOCKER))); +} +/** + * kbase_gpu_irq_evict - evict a slot's JSn_HEAD_NEXT atom from the HW if it is + * related to a failed JSn_HEAD atom + * @kbdev kbase device + * @js job slot to check + * @completion_code completion code of the failed atom + * + * Note: 'STOPPED' atoms are considered 'failed', as they are in the HW, but + * unlike other failure codes we _can_ re-run them. + * + * This forms step 1 in a 2-step process of removing any related atoms from a + * slot's JSn_HEAD_NEXT (ringbuffer index 1), should there have + * been a 'failure' on an atom in JSn_HEAD (ringbuffer index 0). + * + * This step only removes the atoms from the HW, and marks them as + * (potentially) ready to run again. + * + * Step 2 is on marking the JSn_HEAD atom as complete + * (kbase_gpu_complete_hw()), to dequeue said atoms and return them to the JS + * as appropriate, or re-submit them. + * + * Hence, this function must evict at a minimum the atoms related to the atom + * in JSn_HEAD that kbase_gpu_complete_hw() will also dequeue. It is acceptable + * if this function evicts more atoms than kbase_gpu_complete_hw() dequeues, as + * the next kbase_backend_slot_update() will resubmit any remaining. + * + * Return: true if an atom was evicted, false otherwise. 
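Aside: the new kernel-doc above describes a two-step clean-up: step 1 (kbase_gpu_irq_evict) pulls the speculatively submitted JSn_HEAD_NEXT atom back off the hardware, step 2 (kbase_gpu_complete_hw) dequeues it and returns it to the job scheduler when the failed JSn_HEAD atom completes. The toy model below illustrates only that shape with a two-entry ringbuffer; it is not kbase code.

#include <stdbool.h>

enum rb_state { RB_READY, RB_SUBMITTED };

struct slot_rb {
	struct { bool used; enum rb_state state; } entry[2]; /* [0]=HEAD, [1]=HEAD_NEXT */
};

/* Step 1: on a failure IRQ, evict the speculatively submitted NEXT entry. */
static bool irq_evict(struct slot_rb *rb)
{
	if (rb->entry[1].used && rb->entry[1].state == RB_SUBMITTED) {
		rb->entry[1].state = RB_READY;   /* no longer on the HW, can re-run */
		return true;
	}
	return false;
}

/* Step 2: when HEAD completes with a failure, dequeue both entries so the
 * scheduler can return or resubmit them in the correct order.
 */
static void complete_head_failed(struct slot_rb *rb)
{
	rb->entry[0].used = false;
	if (rb->entry[1].used && rb->entry[1].state == RB_READY)
		rb->entry[1].used = false;       /* was evicted in step 1 */
}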
+ */ bool kbase_gpu_irq_evict(struct kbase_device *kbdev, int js, u32 completion_code) { @@ -1051,14 +1105,12 @@ bool kbase_gpu_irq_evict(struct kbase_device *kbdev, int js, katom = kbase_gpu_inspect(kbdev, js, 0); next_katom = kbase_gpu_inspect(kbdev, js, 1); - if (next_katom && katom->kctx == next_katom->kctx && - next_katom->gpu_rb_state == KBASE_ATOM_GPU_RB_SUBMITTED && - (HAS_DEP(next_katom) || next_katom->sched_priority == - katom->sched_priority) && - (kbase_reg_read(kbdev, JOB_SLOT_REG(js, JS_HEAD_NEXT_LO)) - != 0 || - kbase_reg_read(kbdev, JOB_SLOT_REG(js, JS_HEAD_NEXT_HI)) - != 0)) { + if (next_katom && + next_katom->gpu_rb_state == KBASE_ATOM_GPU_RB_SUBMITTED && + (kbase_rb_atom_might_depend(katom, next_katom) || + kbase_js_atom_runs_before(kbdev, katom, next_katom, 0u)) && + (kbase_reg_read(kbdev, JOB_SLOT_REG(js, JS_HEAD_NEXT_LO)) != 0 || + kbase_reg_read(kbdev, JOB_SLOT_REG(js, JS_HEAD_NEXT_HI)) != 0)) { kbase_reg_write(kbdev, JOB_SLOT_REG(js, JS_COMMAND_NEXT), JS_COMMAND_NOP); next_katom->gpu_rb_state = KBASE_ATOM_GPU_RB_READY; @@ -1083,6 +1135,30 @@ bool kbase_gpu_irq_evict(struct kbase_device *kbdev, int js, return false; } +/** + * kbase_gpu_complete_hw - complete the atom in a slot's JSn_HEAD + * @kbdev kbase device + * @js job slot to check + * @completion_code completion code of the completed atom + * @job_tail value read from JSn_TAIL, for STOPPED atoms + * @end_timestamp pointer to approximate ktime value when the katom completed + * + * Among other operations, this also executes step 2 of a 2-step process of + * removing any related atoms from a slot's JSn_HEAD_NEXT (ringbuffer index 1), + * should there have been a 'failure' on an atom in JSn_HEAD (ringbuffer index + * 0). The first step is done in kbase_gpu_irq_evict(). + * + * Note: 'STOPPED' atoms are considered 'failed', as they are in the HW, but + * unlike other failure codes we _can_ re-run them. + * + * When the JSn_HEAD atom is considered to be 'failed', then this will dequeue + * and return to the JS some (usually all) of the atoms evicted from the HW + * during the kbase_gpu_irq_evict() for that JSn_HEAD atom. If it dequeues an + * atom, that atom must not have been running or must already be evicted, as + * otherwise we would be in the incorrect state of having an atom both running + * on the HW and returned to the JS. + */ + void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, u32 completion_code, u64 job_tail, @@ -1133,9 +1209,8 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, * registers by kbase_gpu_soft_hard_stop_slot(), to ensure that * the atoms on this slot are returned in the correct order. 
*/ - if (next_katom && katom->kctx == next_katom->kctx && - next_katom->sched_priority == - katom->sched_priority) { + if (next_katom && + kbase_js_atom_runs_before(kbdev, katom, next_katom, 0u)) { WARN_ON(next_katom->gpu_rb_state == KBASE_ATOM_GPU_RB_SUBMITTED); kbase_gpu_dequeue_atom(kbdev, js, end_timestamp); @@ -1145,12 +1220,14 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, struct kbasep_js_device_data *js_devdata = &kbdev->js_data; int i; - if (!kbase_ctx_flag(katom->kctx, KCTX_DYING)) + if (!kbase_ctx_flag(katom->kctx, KCTX_DYING)) { dev_warn(kbdev->dev, "error detected from slot %d, job status 0x%08x (%s)", js, completion_code, kbase_gpu_exception_name( completion_code)); + } + #if KBASE_KTRACE_DUMP_ON_JOB_SLOT_ERROR != 0 KBASE_KTRACE_DUMP(kbdev); #endif @@ -1168,18 +1245,17 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, struct kbase_jd_atom *katom_idx1 = kbase_gpu_inspect(kbdev, i, 1); - if (katom_idx0 && katom_idx0->kctx == katom->kctx && - HAS_DEP(katom_idx0) && - katom_idx0->gpu_rb_state != - KBASE_ATOM_GPU_RB_SUBMITTED) { + if (katom_idx0 && + kbase_rb_atom_might_depend(katom, katom_idx0) && + katom_idx0->gpu_rb_state != + KBASE_ATOM_GPU_RB_SUBMITTED) { /* Dequeue katom_idx0 from ringbuffer */ kbase_gpu_dequeue_atom(kbdev, i, end_timestamp); - if (katom_idx1 && - katom_idx1->kctx == katom->kctx - && HAS_DEP(katom_idx1) && - katom_idx0->gpu_rb_state != - KBASE_ATOM_GPU_RB_SUBMITTED) { + if (katom_idx1 && kbase_rb_atom_might_depend( + katom, katom_idx1) && + katom_idx0->gpu_rb_state != + KBASE_ATOM_GPU_RB_SUBMITTED) { /* Dequeue katom_idx1 from ringbuffer */ kbase_gpu_dequeue_atom(kbdev, i, end_timestamp); @@ -1192,11 +1268,10 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, katom_idx0->event_code = BASE_JD_EVENT_STOPPED; kbase_jm_return_atom_to_js(kbdev, katom_idx0); - } else if (katom_idx1 && - katom_idx1->kctx == katom->kctx && - HAS_DEP(katom_idx1) && - katom_idx1->gpu_rb_state != - KBASE_ATOM_GPU_RB_SUBMITTED) { + } else if (katom_idx1 && kbase_rb_atom_might_depend( + katom, katom_idx1) && + katom_idx1->gpu_rb_state != + KBASE_ATOM_GPU_RB_SUBMITTED) { /* Can not dequeue this atom yet - will be * dequeued when atom at idx0 completes */ @@ -1369,17 +1444,63 @@ void kbase_backend_reset(struct kbase_device *kbdev, ktime_t *end_timestamp) kbase_pm_protected_override_disable(kbdev); } +/** + * should_stop_next_atom - given a soft/hard stop action, determine if the next + * atom on a slot should be stopped + * @kbdev: kbase devices + * @head_katom: atom currently in the JSn_HEAD + * @next_katom: atom currently in the JSn_HEAD_NEXT + * @action: JS_COMMAND_<...> action for soft/hard-stop + * + * This is used in cases where @head_katom is the target of the soft/hard-stop. + * It only makes sense to call this when @head_katom and @next_katom are from + * the same slot. 
+ * + * Return: true if @next_katom should also be stopped with the given action, + * false otherwise + */ +static bool should_stop_next_atom(struct kbase_device *kbdev, + const struct kbase_jd_atom *head_katom, + const struct kbase_jd_atom *next_katom, + u32 action) +{ + bool ret = false; + u32 hw_action = action & JS_COMMAND_MASK; + + switch (hw_action) { + case JS_COMMAND_SOFT_STOP: + ret = kbase_js_atom_runs_before(kbdev, head_katom, next_katom, + 0u); + break; + case JS_COMMAND_HARD_STOP: + /* Unlike soft-stop, a hard-stop targeting a particular atom + * should not cause atoms from unrelated contexts to be + * removed + */ + ret = (head_katom->kctx == next_katom->kctx); + break; + default: + /* Other stop actions are possible, but the driver should not + * be generating them at this point in the call chain + */ + WARN(1, "Unexpected stop action: 0x%.8x", hw_action); + break; + } + return ret; +} + static inline void kbase_gpu_stop_atom(struct kbase_device *kbdev, int js, struct kbase_jd_atom *katom, u32 action) { + struct kbase_context *kctx = katom->kctx; u32 hw_action = action & JS_COMMAND_MASK; kbase_job_check_enter_disjoint(kbdev, action, katom->core_req, katom); kbasep_job_slot_soft_or_hard_stop_do_action(kbdev, js, hw_action, katom->core_req, katom); - katom->kctx->blocked_js[js][katom->sched_priority] = true; + kbase_jsctx_slot_prio_blocked_set(kctx, js, katom->sched_priority); } static inline void kbase_gpu_remove_atom(struct kbase_device *kbdev, @@ -1387,11 +1508,14 @@ static inline void kbase_gpu_remove_atom(struct kbase_device *kbdev, u32 action, bool disjoint) { + struct kbase_context *kctx = katom->kctx; + lockdep_assert_held(&kbdev->hwaccess_lock); katom->event_code = BASE_JD_EVENT_REMOVED_FROM_NEXT; kbase_gpu_mark_atom_for_return(kbdev, katom); - katom->kctx->blocked_js[katom->slot_nr][katom->sched_priority] = true; + kbase_jsctx_slot_prio_blocked_set(kctx, katom->slot_nr, + katom->sched_priority); if (disjoint) kbase_job_check_enter_disjoint(kbdev, action, katom->core_req, @@ -1419,7 +1543,9 @@ bool kbase_backend_soft_hard_stop_slot(struct kbase_device *kbdev, u32 action) { struct kbase_jd_atom *katom_idx0; + struct kbase_context *kctx_idx0 = NULL; struct kbase_jd_atom *katom_idx1; + struct kbase_context *kctx_idx1 = NULL; bool katom_idx0_valid, katom_idx1_valid; @@ -1433,30 +1559,32 @@ bool kbase_backend_soft_hard_stop_slot(struct kbase_device *kbdev, katom_idx0 = kbase_gpu_inspect(kbdev, js, 0); katom_idx1 = kbase_gpu_inspect(kbdev, js, 1); - if (katom_idx0) + if (katom_idx0) { + kctx_idx0 = katom_idx0->kctx; prio_idx0 = katom_idx0->sched_priority; - if (katom_idx1) + } + if (katom_idx1) { + kctx_idx1 = katom_idx1->kctx; prio_idx1 = katom_idx1->sched_priority; + } if (katom) { katom_idx0_valid = (katom_idx0 == katom); - /* If idx0 is to be removed and idx1 is on the same context, - * then idx1 must also be removed otherwise the atoms might be - * returned out of order - */ if (katom_idx1) - katom_idx1_valid = (katom_idx1 == katom) || - (katom_idx0_valid && - (katom_idx0->kctx == - katom_idx1->kctx)); + katom_idx1_valid = (katom_idx1 == katom); else katom_idx1_valid = false; } else { - katom_idx0_valid = - (katom_idx0 && (!kctx || katom_idx0->kctx == kctx)); - katom_idx1_valid = - (katom_idx1 && (!kctx || katom_idx1->kctx == kctx)); + katom_idx0_valid = (katom_idx0 && (!kctx || kctx_idx0 == kctx)); + katom_idx1_valid = (katom_idx1 && (!kctx || kctx_idx1 == kctx)); } + /* If there's an atom in JSn_HEAD_NEXT that we haven't already decided + * to stop, but we're 
stopping the JSn_HEAD atom, see if they are + * related/ordered in some way that would require the same stop action + */ + if (!katom_idx1_valid && katom_idx0_valid && katom_idx1) + katom_idx1_valid = should_stop_next_atom(kbdev, katom_idx0, + katom_idx1, action); if (katom_idx0_valid) stop_x_dep_idx0 = should_stop_x_dep_slot(katom_idx0); @@ -1472,14 +1600,15 @@ bool kbase_backend_soft_hard_stop_slot(struct kbase_device *kbdev, katom_idx1->event_code = BASE_JD_EVENT_REMOVED_FROM_NEXT; kbase_jm_return_atom_to_js(kbdev, katom_idx1); - katom_idx1->kctx->blocked_js[js][prio_idx1] = - true; + kbase_jsctx_slot_prio_blocked_set(kctx_idx1, js, + prio_idx1); } katom_idx0->event_code = BASE_JD_EVENT_REMOVED_FROM_NEXT; kbase_jm_return_atom_to_js(kbdev, katom_idx0); - katom_idx0->kctx->blocked_js[js][prio_idx0] = true; + kbase_jsctx_slot_prio_blocked_set(kctx_idx0, js, + prio_idx0); } else { /* katom_idx0 is on GPU */ if (katom_idx1_valid && katom_idx1->gpu_rb_state == diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c index cc791df..5df7f67 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c @@ -32,6 +32,9 @@ #include <mali_kbase_hwaccess_jm.h> #include <backend/gpu/mali_kbase_js_internal.h> #include <backend/gpu/mali_kbase_jm_internal.h> +#else +#include <linux/pm_runtime.h> +#include <mali_kbase_reset_gpu.h> #endif /* !MALI_USE_CSF */ #include <mali_kbase_hwcnt_context.h> #include <backend/gpu/mali_kbase_pm_internal.h> @@ -69,6 +72,10 @@ int kbase_pm_runtime_init(struct kbase_device *kbdev) callbacks->power_runtime_idle_callback; kbdev->pm.backend.callback_soft_reset = callbacks->soft_reset_callback; + kbdev->pm.backend.callback_power_runtime_gpu_idle = + callbacks->power_runtime_gpu_idle_callback; + kbdev->pm.backend.callback_power_runtime_gpu_active = + callbacks->power_runtime_gpu_active_callback; if (callbacks->power_runtime_init_callback) return callbacks->power_runtime_init_callback(kbdev); @@ -86,6 +93,8 @@ int kbase_pm_runtime_init(struct kbase_device *kbdev) kbdev->pm.backend.callback_power_runtime_off = NULL; kbdev->pm.backend.callback_power_runtime_idle = NULL; kbdev->pm.backend.callback_soft_reset = NULL; + kbdev->pm.backend.callback_power_runtime_gpu_idle = NULL; + kbdev->pm.backend.callback_power_runtime_gpu_active = NULL; return 0; } @@ -120,10 +129,10 @@ void kbase_pm_register_access_disable(struct kbase_device *kbdev) callbacks = (struct kbase_pm_callback_conf *)POWER_MANAGEMENT_CALLBACKS; + kbdev->pm.backend.gpu_powered = false; + if (callbacks) callbacks->power_off_callback(kbdev); - - kbdev->pm.backend.gpu_powered = false; } int kbase_hwaccess_pm_init(struct kbase_device *kbdev) @@ -193,6 +202,7 @@ int kbase_hwaccess_pm_init(struct kbase_device *kbdev) kbase_pm_hwcnt_disable_worker); kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx); + if (IS_ENABLED(CONFIG_MALI_HW_ERRATA_1485982_NOT_AFFECTED)) { kbdev->pm.backend.l2_always_on = false; kbdev->pm.backend.gpu_clock_slow_down_wa = false; @@ -263,6 +273,76 @@ void kbase_pm_do_poweron(struct kbase_device *kbdev, bool is_resume) */ } +static void pm_handle_power_off(struct kbase_device *kbdev) +{ + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; +#if MALI_USE_CSF + enum kbase_mcu_state mcu_state; +#endif + unsigned long flags; + + lockdep_assert_held(&kbdev->pm.lock); + + if (backend->poweron_required) + return; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); +#if MALI_USE_CSF && 
defined(KBASE_PM_RUNTIME) + if (kbdev->pm.backend.gpu_wakeup_override ) { + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + return; + } +#endif + WARN_ON(backend->shaders_state != + KBASE_SHADERS_OFF_CORESTACK_OFF || + backend->l2_state != KBASE_L2_OFF); +#if MALI_USE_CSF + mcu_state = backend->mcu_state; + WARN_ON(!kbase_pm_is_mcu_inactive(kbdev, mcu_state)); +#endif + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + if (backend->callback_power_runtime_gpu_idle) { + WARN_ON(backend->gpu_idled); + backend->callback_power_runtime_gpu_idle(kbdev); + backend->gpu_idled = true; + return; + } +#endif + + /* Disable interrupts and turn the clock off */ + if (!kbase_pm_clock_off(kbdev)) { + /* + * Page/bus faults are pending, must drop locks to + * process. Interrupts are disabled so no more faults + * should be generated at this point. + */ + kbase_pm_unlock(kbdev); + kbase_flush_mmu_wqs(kbdev); + kbase_pm_lock(kbdev); + +#ifdef CONFIG_MALI_ARBITER_SUPPORT + /* poweron_required may have changed while pm lock + * was released. + */ + if (kbase_pm_is_gpu_lost(kbdev)) + backend->poweron_required = false; +#endif + + /* Turn off clock now that fault have been handled. We + * dropped locks so poweron_required may have changed - + * power back on if this is the case (effectively only + * re-enabling of the interrupts would be done in this + * case, as the clocks to GPU were not withdrawn yet). + */ + if (backend->poweron_required) + kbase_pm_clock_on(kbdev, false); + else + WARN_ON(!kbase_pm_clock_off(kbdev)); + } +} + static void kbase_pm_gpu_poweroff_wait_wq(struct work_struct *data) { struct kbase_device *kbdev = container_of(data, struct kbase_device, @@ -271,6 +351,8 @@ static void kbase_pm_gpu_poweroff_wait_wq(struct work_struct *data) struct kbase_pm_backend_data *backend = &pm->backend; unsigned long flags; + KBASE_KTRACE_ADD(kbdev, PM_POWEROFF_WAIT_WQ, NULL, 0); + #if !MALI_USE_CSF /* Wait for power transitions to complete. We do this with no locks held * so that we don't deadlock with any pending workqueues. @@ -285,46 +367,7 @@ static void kbase_pm_gpu_poweroff_wait_wq(struct work_struct *data) backend->poweron_required = false; #endif - if (!backend->poweron_required) { - unsigned long flags; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - WARN_ON(backend->shaders_state != - KBASE_SHADERS_OFF_CORESTACK_OFF || - backend->l2_state != KBASE_L2_OFF); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - /* Disable interrupts and turn the clock off */ - if (!kbase_pm_clock_off(kbdev)) { - /* - * Page/bus faults are pending, must drop locks to - * process. Interrupts are disabled so no more faults - * should be generated at this point. - */ - kbase_pm_unlock(kbdev); - kbase_flush_mmu_wqs(kbdev); - kbase_pm_lock(kbdev); - -#ifdef CONFIG_MALI_ARBITER_SUPPORT - /* poweron_required may have changed while pm lock - * was released. - */ - if (kbase_pm_is_gpu_lost(kbdev)) - backend->poweron_required = false; -#endif - - /* Turn off clock now that fault have been handled. We - * dropped locks so poweron_required may have changed - - * power back on if this is the case (effectively only - * re-enabling of the interrupts would be done in this - * case, as the clocks to GPU were not withdrawn yet). 
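Aside: the pm_handle_power_off() path above uses a classic drop-lock / flush / re-acquire / re-check pattern when the clock cannot be turned off because faults are pending. The sketch below models only that shape with a pthread mutex standing in for the kbase pm lock and a stub standing in for kbase_flush_mmu_wqs(); it is illustrative, not the driver code.

#include <pthread.h>
#include <stdbool.h>

struct dev {
	pthread_mutex_t lock;
	bool poweron_required;   /* may change while the lock is dropped */
};

static bool try_clock_off(struct dev *d) { (void)d; return false; /* faults pending in this toy */ }
static void clock_on(struct dev *d)      { (void)d; }
static void flush_fault_work(struct dev *d) { (void)d; }

static void handle_power_off(struct dev *d)   /* called with d->lock held */
{
	if (try_clock_off(d))
		return;

	/* Faults are pending: drop the lock so the fault workers can complete. */
	pthread_mutex_unlock(&d->lock);
	flush_fault_work(d);
	pthread_mutex_lock(&d->lock);

	/* Re-evaluate: the decision to power off may have been overtaken. */
	if (d->poweron_required)
		clock_on(d);
	else
		(void)try_clock_off(d);
}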
- */ - if (backend->poweron_required) - kbase_pm_clock_on(kbdev, false); - else - WARN_ON(!kbase_pm_clock_off(kbdev)); - } - } + pm_handle_power_off(kbdev); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); backend->poweroff_wait_in_progress = false; @@ -512,6 +555,74 @@ static void kbase_pm_hwcnt_disable_worker(struct work_struct *data) spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) +/** + * kbase_pm_do_poweroff_sync - Do the synchronous power down of GPU + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function is called at the time of system suspend or device unload + * to power down the GPU synchronously. This is needed as the power down of GPU + * would usually happen from the runtime suspend callback function (if gpu_active + * and gpu_idle callbacks are used) and runtime suspend operation is disabled + * when system suspend takes place. + * The function first waits for the @gpu_poweroff_wait_work to complete, which + * could have been enqueued after the last PM reference was released. + */ +static void kbase_pm_do_poweroff_sync(struct kbase_device *kbdev) +{ + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; + unsigned long flags; + + WARN_ON(kbdev->pm.active_count); + + kbase_pm_wait_for_poweroff_work_complete(kbdev); + + kbase_pm_lock(kbdev); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(backend->poweroff_wait_in_progress); + if (backend->gpu_powered) { + int ret; + + backend->mcu_desired = false; + backend->l2_desired = false; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + ret = kbase_pm_wait_for_desired_state(kbdev); + if (ret) { + dev_warn(kbdev->dev, "Wait failed on synchronous power off"); + kbase_pm_unlock(kbdev); + /* Wait for the completion of reset, triggered due to + * the previous failure. + */ + kbase_reset_gpu_wait(kbdev); + /* Wait again for the poweroff work which could have + * been enqueued by the GPU reset worker. + */ + kbase_pm_wait_for_poweroff_work_complete(kbdev); + kbase_pm_lock(kbdev); + } + + /* Due to the power policy, GPU could have been kept active + * throughout and so need to invoke the idle callback before + * the power down. 
+ */ + if (backend->callback_power_runtime_gpu_idle && + !backend->gpu_idled) { + backend->callback_power_runtime_gpu_idle(kbdev); + backend->gpu_idled = true; + } + + kbase_pm_clock_off(kbdev); + } else { + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } + + kbase_pm_unlock(kbdev); +} +#endif + void kbase_pm_do_poweroff(struct kbase_device *kbdev) { unsigned long flags; @@ -561,12 +672,31 @@ static bool is_poweroff_in_progress(struct kbase_device *kbdev) return ret; } -void kbase_pm_wait_for_poweroff_complete(struct kbase_device *kbdev) +void kbase_pm_wait_for_poweroff_work_complete(struct kbase_device *kbdev) { wait_event_killable(kbdev->pm.backend.poweroff_wait, is_poweroff_in_progress(kbdev)); } -KBASE_EXPORT_TEST_API(kbase_pm_wait_for_poweroff_complete); +KBASE_EXPORT_TEST_API(kbase_pm_wait_for_poweroff_work_complete); + +static bool is_gpu_powered_down(struct kbase_device *kbdev) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + ret = !kbdev->pm.backend.gpu_powered; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return ret; +} + +void kbase_pm_wait_for_gpu_power_down(struct kbase_device *kbdev) +{ + wait_event_killable(kbdev->pm.backend.poweroff_wait, + is_gpu_powered_down(kbdev)); +} +KBASE_EXPORT_TEST_API(kbase_pm_wait_for_gpu_power_down); int kbase_hwaccess_pm_powerup(struct kbase_device *kbdev, unsigned int flags) @@ -612,6 +742,15 @@ int kbase_hwaccess_pm_powerup(struct kbase_device *kbdev, * cores off */ kbdev->pm.active_count = 1; +#if MALI_USE_CSF && KBASE_PM_RUNTIME + if (kbdev->pm.backend.callback_power_runtime_gpu_active) { + /* Take the RPM reference count to match with the internal + * PM reference count + */ + kbdev->pm.backend.callback_power_runtime_gpu_active(kbdev); + WARN_ON(kbdev->pm.backend.gpu_idled); + } +#endif spin_lock_irqsave(&kbdev->pm.backend.gpu_cycle_counter_requests_lock, irq_flags); @@ -653,11 +792,15 @@ void kbase_hwaccess_pm_halt(struct kbase_device *kbdev) { KBASE_DEBUG_ASSERT(kbdev != NULL); +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + kbase_pm_do_poweroff_sync(kbdev); +#else mutex_lock(&kbdev->pm.lock); kbase_pm_do_poweroff(kbdev); mutex_unlock(&kbdev->pm.lock); - kbase_pm_wait_for_poweroff_complete(kbdev); + kbase_pm_wait_for_poweroff_work_complete(kbdev); +#endif } KBASE_EXPORT_TEST_API(kbase_hwaccess_pm_halt); @@ -761,6 +904,9 @@ void kbase_hwaccess_pm_gpu_idle(struct kbase_device *kbdev) void kbase_hwaccess_pm_suspend(struct kbase_device *kbdev) { +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + kbase_pm_do_poweroff_sync(kbdev); +#else /* Force power off the GPU and all cores (regardless of policy), only * after the PM active count reaches zero (otherwise, we risk turning it * off prematurely) @@ -775,7 +921,11 @@ void kbase_hwaccess_pm_suspend(struct kbase_device *kbdev) kbase_pm_unlock(kbdev); - kbase_pm_wait_for_poweroff_complete(kbdev); + kbase_pm_wait_for_poweroff_work_complete(kbdev); +#endif + + WARN_ON(kbdev->pm.backend.gpu_powered); + WARN_ON(atomic_read(&kbdev->faults_pending)); if (kbdev->pm.backend.callback_power_suspend) kbdev->pm.backend.callback_power_suspend(kbdev); @@ -844,9 +994,12 @@ void kbase_pm_handle_gpu_lost(struct kbase_device *kbdev) /* Cancel any pending HWC dumps */ spin_lock_irqsave(&kbdev->hwcnt.lock, flags); - kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_IDLE; - kbdev->hwcnt.backend.triggered = 1; - wake_up(&kbdev->hwcnt.backend.wait); + if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_DUMPING || + kbdev->hwcnt.backend.state == 
KBASE_INSTR_STATE_FAULT) { + kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_FAULT; + kbdev->hwcnt.backend.triggered = 1; + wake_up(&kbdev->hwcnt.backend.wait); + } spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); } mutex_unlock(&arb_vm_state->vm_state_lock); @@ -854,3 +1007,208 @@ void kbase_pm_handle_gpu_lost(struct kbase_device *kbdev) } #endif /* CONFIG_MALI_ARBITER_SUPPORT */ + +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) +int kbase_pm_force_mcu_wakeup_after_sleep(struct kbase_device *kbdev) +{ + unsigned long flags; + + lockdep_assert_held(&kbdev->pm.lock); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + /* Set the override flag to force the power up of L2 cache */ + kbdev->pm.backend.gpu_wakeup_override = true; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return kbase_pm_wait_for_desired_state(kbdev); +} + +static int pm_handle_mcu_sleep_on_runtime_suspend(struct kbase_device *kbdev) +{ + unsigned long flags; + int ret; + + lockdep_assert_held(&kbdev->csf.scheduler.lock); + lockdep_assert_held(&kbdev->pm.lock); + + /* In case of no active CSG on slot, powering up L2 could be skipped and + * proceed directly to suspend GPU. + * ToDo: firmware has to be reloaded after wake-up as no halt command + * has been sent when GPU was put to sleep mode. + */ + if (!kbase_csf_scheduler_get_nr_active_csgs(kbdev)) + dev_info( + kbdev->dev, + "No active CSGs. Can skip the power up of L2 and go for suspension directly"); + + ret = kbase_pm_force_mcu_wakeup_after_sleep(kbdev); + if (ret) { + dev_warn(kbdev->dev, "Wait for MCU wake up failed on runtime suspend"); + return ret; + } + + /* Check if a Doorbell mirror interrupt occurred meanwhile */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + if (kbdev->pm.backend.gpu_sleep_mode_active && + kbdev->pm.backend.exit_gpu_sleep_mode) { + dev_dbg(kbdev->dev, "DB mirror interrupt occurred during runtime suspend after L2 power up"); + kbdev->pm.backend.gpu_wakeup_override = false; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + return -EBUSY; + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + /* Need to release the kbdev->pm.lock to avoid lock ordering issue + * with kctx->reg.lock, which is taken if the sync wait condition is + * evaluated after the CSG suspend operation. + */ + kbase_pm_unlock(kbdev); + ret = kbase_csf_scheduler_handle_runtime_suspend(kbdev); + kbase_pm_lock(kbdev); + + /* Power down L2 cache */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbdev->pm.backend.gpu_wakeup_override = false; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + /* After re-acquiring the kbdev->pm.lock, check if the device + * became active (or active then idle) meanwhile. + */ + if (kbdev->pm.active_count || + kbdev->pm.backend.poweroff_wait_in_progress) { + dev_dbg(kbdev->dev, + "Device became active on runtime suspend after suspending Scheduler"); + ret = -EBUSY; + } + + if (ret) + return ret; + + ret = kbase_pm_wait_for_desired_state(kbdev); + if (ret) + dev_warn(kbdev->dev, "Wait for power down failed on runtime suspend"); + + return ret; +} + +int kbase_pm_handle_runtime_suspend(struct kbase_device *kbdev) +{ + enum kbase_mcu_state mcu_state; + bool exit_early = false; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + /* This check is needed for the case where Kbase had invoked the + * @power_off_callback directly. 
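Aside: kbase_pm_force_mcu_wakeup_after_sleep() above follows a "set the override under the spinlock, kick the state machine, then wait outside the lock" sequence. The sketch below restates that shape with a plain mutex/condvar in place of hwaccess_lock, kbase_pm_update_state() and kbase_pm_wait_for_desired_state(); it is an illustration, not the kbase implementation.

#include <pthread.h>
#include <stdbool.h>

struct pm {
	pthread_mutex_t lock;
	pthread_cond_t  reached;
	bool wakeup_override;    /* forces L2/MCU back on */
	bool in_desired_state;   /* set by the (not shown) state machine */
};

static int force_wakeup_and_wait(struct pm *pm)
{
	pthread_mutex_lock(&pm->lock);
	pm->wakeup_override = true;          /* state machine will now target ON */
	/* ... kick the state machine here ... */
	while (!pm->in_desired_state)        /* kbase waits via kbase_pm_wait_for_desired_state() */
		pthread_cond_wait(&pm->reached, &pm->lock);
	pthread_mutex_unlock(&pm->lock);
	return 0;
}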
+ */ + if (!kbdev->pm.backend.gpu_powered) { + dev_dbg(kbdev->dev, "GPU already powered down on runtime suspend"); + exit_early = true; + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (exit_early) + goto out; + + ret = kbase_reset_gpu_try_prevent(kbdev); + if (ret == -ENOMEM) { + dev_dbg(kbdev->dev, "Quit runtime suspend as GPU is in bad state"); + /* Finish the runtime suspend, no point in trying again as GPU is + * in irrecoverable bad state. + */ + goto out; + } else if (ret) { + dev_dbg(kbdev->dev, "Quit runtime suspend for failing to prevent gpu reset"); + ret = -EBUSY; + goto out; + } + + kbase_csf_scheduler_lock(kbdev); + kbase_pm_lock(kbdev); + + /* + * This is to handle the case where GPU device becomes active and idle + * very quickly whilst the runtime suspend callback is executing. + * This is useful for the following scenario :- + * - GPU goes idle and pm_callback_runtime_gpu_idle() is called. + * - Auto-suspend timer expires and kbase_device_runtime_suspend() + * is called. + * - GPU becomes active and pm_callback_runtime_gpu_active() calls + * pm_runtime_get(). + * - Shortly after that GPU becomes idle again. + * - kbase_pm_handle_runtime_suspend() gets called. + * - pm_callback_runtime_gpu_idle() is called. + * + * We do not want to power down the GPU immediately after it goes idle. + * So if we notice that GPU had become active when the runtime suspend + * had already kicked in, we abort the runtime suspend. + * By aborting the runtime suspend, we defer the power down of GPU. + * + * This check also helps prevent warnings regarding L2 and MCU states + * inside the pm_handle_power_off() function. The warning stems from + * the fact that pm.lock is released before invoking Scheduler function + * to suspend the CSGs. + */ + if (kbdev->pm.active_count || + kbdev->pm.backend.poweroff_wait_in_progress) { + dev_dbg(kbdev->dev, "Device became active on runtime suspend"); + ret = -EBUSY; + goto unlock; + } + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + if (kbdev->pm.backend.gpu_sleep_mode_active && + kbdev->pm.backend.exit_gpu_sleep_mode) { + dev_dbg(kbdev->dev, "DB mirror interrupt occurred during runtime suspend before L2 power up"); + ret = -EBUSY; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + goto unlock; + } + + mcu_state = kbdev->pm.backend.mcu_state; + WARN_ON(!kbase_pm_is_mcu_inactive(kbdev, mcu_state)); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (mcu_state == KBASE_MCU_IN_SLEEP) { + ret = pm_handle_mcu_sleep_on_runtime_suspend(kbdev); + if (ret) + goto unlock; + } + + /* Disable interrupts and turn off the GPU clocks */ + if (!kbase_pm_clock_off(kbdev)) { + dev_warn(kbdev->dev, "Failed to turn off GPU clocks on runtime suspend, MMU faults pending"); + + WARN_ON(!kbdev->poweroff_pending); + /* Previous call to kbase_pm_clock_off() would have disabled + * the interrupts and also synchronized with the interrupt + * handlers, so more fault work items can't be enqueued. + * + * Can't wait for the completion of MMU fault work items as + * there is a possibility of a deadlock since the fault work + * items would do the group termination which requires the + * Scheduler lock. 
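Aside: kbase_pm_handle_runtime_suspend() above repeatedly bails out with -EBUSY when the device raced back to active, deferring the power down to a later runtime-suspend attempt. The skeleton below shows that generic driver-side pattern; it is not the kbase code, and the device_is_busy() helper is a hypothetical placeholder for whatever "became active again" check a driver performs.

#include <linux/pm_runtime.h>

static bool device_is_busy(struct device *dev)
{
	(void)dev;
	return false;   /* placeholder policy for the sketch */
}

static int example_runtime_suspend(struct device *dev)
{
	if (device_is_busy(dev)) {
		/* -EBUSY tells the PM core the device is still in use, so it
		 * stays powered and suspend can be retried later.
		 */
		pm_runtime_mark_last_busy(dev);
		return -EBUSY;
	}

	/* ... save state and power the device down ... */
	return 0;
}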
+ */ + ret = -EBUSY; + goto unlock; + } + + wake_up(&kbdev->pm.backend.poweroff_wait); + WARN_ON(kbdev->pm.backend.gpu_powered); + dev_dbg(kbdev->dev, "GPU power down complete"); + +unlock: + kbase_pm_unlock(kbdev); + kbase_csf_scheduler_unlock(kbdev); + kbase_reset_gpu_allow(kbdev); +out: + if (ret) { + ret = -EBUSY; + pm_runtime_mark_last_busy(kbdev->dev); + } + + return ret; +} +#endif diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_defs.h b/mali_kbase/backend/gpu/mali_kbase_pm_defs.h index d9d3aa3..52877f5 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_defs.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_defs.h @@ -29,6 +29,10 @@ #include "mali_kbase_pm_always_on.h" #include "mali_kbase_pm_coarse_demand.h" +#if defined(CONFIG_PM_RUNTIME) || defined(CONFIG_PM) +#define KBASE_PM_RUNTIME 1 +#endif + /* Forward definition - see mali_kbase.h */ struct kbase_device; struct kbase_jd_atom; @@ -271,10 +275,18 @@ union kbase_pm_policy_data { * &struct kbase_pm_callback_conf * @callback_power_runtime_off: Callback when the GPU may be turned off. See * &struct kbase_pm_callback_conf - * @callback_power_runtime_idle: Optional callback when the GPU may be idle. See - * &struct kbase_pm_callback_conf + * @callback_power_runtime_idle: Optional callback invoked by runtime PM core + * when the GPU may be idle. See + * &struct kbase_pm_callback_conf * @callback_soft_reset: Optional callback to software reset the GPU. See * &struct kbase_pm_callback_conf + * @callback_power_runtime_gpu_idle: Callback invoked by Kbase when GPU has + * become idle. + * See &struct kbase_pm_callback_conf. + * @callback_power_runtime_gpu_active: Callback when GPU has become active and + * @callback_power_runtime_gpu_idle was + * called previously. + * See &struct kbase_pm_callback_conf. * @ca_cores_enabled: Cores that are currently available * @mcu_state: The current state of the micro-control unit, only applicable * to GPUs that have such a component @@ -312,6 +324,34 @@ union kbase_pm_policy_data { * @policy_change_lock: Used to serialize the policy change calls. In CSF case, * the change of policy may involve the scheduler to * suspend running CSGs and then reconfigure the MCU. + * @gpu_sleep_supported: Flag to indicate that if GPU sleep feature can be + * supported by the kernel driver or not. If this + * flag is not set, then HW state is directly saved + * when GPU idle notification is received. + * @gpu_sleep_mode_active: Flag to indicate that the GPU needs to be in sleep + * mode. It is set when the GPU idle notification is + * received and is cleared when HW state has been + * saved in the runtime suspend callback function or + * when the GPU power down is aborted if GPU became + * active whilst it was in sleep mode. The flag is + * guarded with hwaccess_lock spinlock. + * @exit_gpu_sleep_mode: Flag to indicate the GPU can now exit the sleep + * mode due to the submission of work from Userspace. + * The flag is guarded with hwaccess_lock spinlock. + * The @gpu_sleep_mode_active flag is not immediately + * reset when this flag is set, this is to ensure that + * MCU doesn't gets disabled undesirably without the + * suspend of CSGs. That could happen when + * scheduler_pm_active() and scheduler_pm_idle() gets + * called before the Scheduler gets reactivated. + * @gpu_idled: Flag to ensure that the gpu_idle & gpu_active callbacks are + * always called in pair. The flag is guarded with pm.lock mutex. + * @gpu_wakeup_override: Flag to force the power up of L2 cache & reactivation + * of MCU. 
This is set during the runtime suspend + * callback function, when GPU needs to exit the sleep + * mode for the saving the HW state before power down. + * @db_mirror_interrupt_enabled: Flag tracking if the Doorbell mirror interrupt + * is enabled or not. * @in_reset: True if a GPU is resetting and normal power manager operation is * suspended * @partial_shaderoff: True if we want to partial power off shader cores, @@ -398,6 +438,8 @@ struct kbase_pm_backend_data { void (*callback_power_runtime_off)(struct kbase_device *kbdev); int (*callback_power_runtime_idle)(struct kbase_device *kbdev); int (*callback_soft_reset)(struct kbase_device *kbdev); + void (*callback_power_runtime_gpu_idle)(struct kbase_device *kbdev); + void (*callback_power_runtime_gpu_active)(struct kbase_device *kbdev); u64 ca_cores_enabled; @@ -413,6 +455,15 @@ struct kbase_pm_backend_data { bool policy_change_clamp_state_to_off; unsigned int csf_pm_sched_flags; struct mutex policy_change_lock; + +#ifdef KBASE_PM_RUNTIME + bool gpu_sleep_supported; + bool gpu_sleep_mode_active; + bool exit_gpu_sleep_mode; + bool gpu_idled; + bool gpu_wakeup_override; + bool db_mirror_interrupt_enabled; +#endif #endif bool l2_desired; bool l2_always_on; @@ -420,11 +471,13 @@ struct kbase_pm_backend_data { bool in_reset; +#if !MALI_USE_CSF bool partial_shaderoff; bool protected_entry_transition_override; bool protected_transition_override; int protected_l2_override; +#endif bool hwcnt_desired; bool hwcnt_disabled; diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c index bcada93..d65c684 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c @@ -40,6 +40,7 @@ #include <mali_kbase_reset_gpu.h> #include <mali_kbase_ctx_sched.h> #include <mali_kbase_hwcnt_context.h> +#include <mali_kbase_pbha.h> #include <backend/gpu/mali_kbase_cache_policy_backend.h> #include <device/mali_kbase_device.h> #include <backend/gpu/mali_kbase_irq_internal.h> @@ -104,9 +105,15 @@ bool kbase_pm_is_mcu_desired(struct kbase_device *kbdev) if (unlikely(!kbdev->csf.firmware_inited)) return false; - if (kbdev->csf.scheduler.pm_active_count) + if (kbdev->csf.scheduler.pm_active_count && + kbdev->pm.backend.mcu_desired) return true; +#ifdef KBASE_PM_RUNTIME + if (kbdev->pm.backend.gpu_wakeup_override) + return true; +#endif + /* MCU is supposed to be ON, only when scheduler.pm_active_count is * non zero. But for always_on policy, the MCU needs to be kept on, * unless policy changing transition needs it off. 
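Editor's note on the runtime-PM additions above: the new backend fields cooperate with the GPU sleep support introduced elsewhere in this patch. gpu_sleep_mode_active is set when the GPU idle notification arrives, exit_gpu_sleep_mode is set when new work is submitted (or a reset starts), and gpu_wakeup_override briefly forces the L2/MCU back on so the HW state can be saved before a real power down. The sketch below of the idle-path decision is illustrative only; the helper name is hypothetical, and only the fields and functions named in the patch itself are real.

static void example_handle_gpu_idle_notification(struct kbase_device *kbdev)
{
	unsigned long flags;

	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
	if (kbase_pm_gpu_sleep_allowed(kbdev)) {
		/* GPU stays powered; the MCU state machine will move
		 * through ON_SLEEP_INITIATE and ON_PEND_SLEEP to IN_SLEEP
		 * once the hardware counters have been disabled.
		 */
		kbdev->pm.backend.gpu_sleep_mode_active = true;
		kbase_pm_update_state(kbdev);
	}
	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
	/* When sleep is not allowed, the pre-existing idle handling applies:
	 * the CSGs are suspended and the MCU is halted before power off.
	 */
}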
@@ -120,6 +127,7 @@ bool kbase_pm_is_mcu_desired(struct kbase_device *kbdev) bool kbase_pm_is_l2_desired(struct kbase_device *kbdev) { +#if !MALI_USE_CSF if (kbdev->pm.backend.protected_entry_transition_override) return false; @@ -130,15 +138,19 @@ bool kbase_pm_is_l2_desired(struct kbase_device *kbdev) if (kbdev->pm.backend.protected_transition_override && !kbdev->pm.backend.shaders_desired) return false; - -#if MALI_USE_CSF - if (kbdev->pm.backend.policy_change_clamp_state_to_off) +#else + if (unlikely(kbdev->pm.backend.policy_change_clamp_state_to_off)) return false; + + /* Power up the L2 cache only when MCU is desired */ + if (likely(kbdev->csf.firmware_inited)) + return kbase_pm_is_mcu_desired(kbdev); #endif return kbdev->pm.backend.l2_desired; } +#if !MALI_USE_CSF void kbase_pm_protected_override_enable(struct kbase_device *kbdev) { lockdep_assert_held(&kbdev->hwaccess_lock); @@ -204,6 +216,7 @@ void kbase_pm_protected_l2_override(struct kbase_device *kbdev, bool override) kbase_pm_update_state(kbdev); } +#endif /** * core_type_to_reg - Decode a core type and action to a register. @@ -259,9 +272,8 @@ static void mali_cci_flush_l2(struct kbase_device *kbdev) * to be called from. */ - kbase_reg_write(kbdev, - GPU_CONTROL_REG(GPU_COMMAND), - GPU_COMMAND_CLEAN_INV_CACHES); + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), + GPU_COMMAND_CACHE_CLN_INV_L2); raw = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_RAWSTAT)); @@ -610,6 +622,35 @@ static inline bool kbase_pm_handle_mcu_core_attr_update(struct kbase_device *kbd return (core_mask_update || timer_update); } +bool kbase_pm_is_mcu_inactive(struct kbase_device *kbdev, + enum kbase_mcu_state state) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + return ((state == KBASE_MCU_OFF) || (state == KBASE_MCU_IN_SLEEP)); +} + +#ifdef KBASE_PM_RUNTIME +/** + * kbase_pm_enable_mcu_db_notification - Enable the Doorbell notification on + * MCU side + * + * @kbdev: Pointer to the device. + * + * This function is called to re-enable the Doorbell notification on MCU side + * when MCU needs to beome active again. + */ +static void kbase_pm_enable_mcu_db_notification(struct kbase_device *kbdev) +{ + u32 val = kbase_reg_read(kbdev, GPU_CONTROL_REG(MCU_CONTROL)); + + lockdep_assert_held(&kbdev->hwaccess_lock); + + val &= ~MCU_CNTRL_DOORBELL_DISABLE_MASK; + kbase_reg_write(kbdev, GPU_CONTROL_REG(MCU_CONTROL), val); +} +#endif + static int kbase_pm_mcu_update_state(struct kbase_device *kbdev) { struct kbase_pm_backend_data *backend = &kbdev->pm.backend; @@ -618,12 +659,12 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->hwaccess_lock); /* - * Initial load of firmare should have been done to + * Initial load of firmware should have been done to * exercise the MCU state machine. 
*/ if (unlikely(!kbdev->csf.firmware_inited)) { WARN_ON(backend->mcu_state != KBASE_MCU_OFF); - return -EIO; + return 0; } do { @@ -770,8 +811,15 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev) if (!backend->hwcnt_disabled) kbase_pm_trigger_hwcnt_disable(kbdev); - if (backend->hwcnt_disabled) - backend->mcu_state = KBASE_MCU_ON_HALT; + + if (backend->hwcnt_disabled) { +#ifdef KBASE_PM_RUNTIME + if (backend->gpu_sleep_mode_active) + backend->mcu_state = KBASE_MCU_ON_SLEEP_INITIATE; + else +#endif + backend->mcu_state = KBASE_MCU_ON_HALT; + } break; case KBASE_MCU_ON_HALT: @@ -816,7 +864,32 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev) kbase_csf_firmware_disable_mcu_wait(kbdev); backend->mcu_state = KBASE_MCU_OFF; break; +#ifdef KBASE_PM_RUNTIME + case KBASE_MCU_ON_SLEEP_INITIATE: + if (!kbase_pm_is_mcu_desired(kbdev)) { + kbase_csf_firmware_trigger_mcu_sleep(kbdev); + backend->mcu_state = KBASE_MCU_ON_PEND_SLEEP; + } else + backend->mcu_state = KBASE_MCU_ON_HWCNT_ENABLE; + break; + + case KBASE_MCU_ON_PEND_SLEEP: + if (kbase_csf_firmware_is_mcu_in_sleep(kbdev)) { + backend->mcu_state = KBASE_MCU_IN_SLEEP; + kbase_pm_enable_db_mirror_interrupt(kbdev); + kbase_csf_scheduler_reval_idleness_post_sleep(kbdev); + } + break; + case KBASE_MCU_IN_SLEEP: + if (kbase_pm_is_mcu_desired(kbdev) && + backend->l2_state == KBASE_L2_ON) { + kbase_pm_enable_mcu_db_notification(kbdev); + kbase_pm_disable_db_mirror_interrupt(kbdev); + backend->mcu_state = KBASE_MCU_ON_HWCNT_ENABLE; + } + break; +#endif case KBASE_MCU_RESET_WAIT: /* Reset complete */ if (!backend->in_reset) @@ -889,8 +962,24 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) #endif backend->shaders_state = KBASE_SHADERS_OFF_CORESTACK_OFF; - backend->l2_state = KBASE_L2_OFF; - dev_dbg(kbdev->dev, "GPU lost has occurred - L2 off\n"); + backend->hwcnt_desired = false; + if (!backend->hwcnt_disabled) { + /* Don't progress until hw counters are disabled + * This may involve waiting for a worker to complete. + * The HW counters backend disable code checks for the + * GPU removed case and will error out without touching + * the hardware. This step is needed to keep the HW + * counters in a consistent state after a GPU lost. + */ + backend->l2_state = + KBASE_L2_ON_HWCNT_DISABLE; + kbase_pm_trigger_hwcnt_disable(kbdev); + } + + if (backend->hwcnt_disabled) { + backend->l2_state = KBASE_L2_OFF; + dev_dbg(kbdev->dev, "GPU lost has occurred - L2 off\n"); + } break; } @@ -911,6 +1000,7 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) * powering it on */ kbase_pm_l2_config_override(kbdev); + kbase_pbha_write_settings(kbdev); #if !MALI_USE_CSF /* L2 is required, power on. Powering on the * tiler will also power the first L2 cache. 
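Editor's summary of the MCU sleep transitions added in the hunk above, derived from the new case statements for readability (not part of the patch):

    ON_HWCNT_DISABLE  -> ON_SLEEP_INITIATE  (gpu_sleep_mode_active set, hw counters disabled)
    ON_SLEEP_INITIATE -> ON_PEND_SLEEP      (MCU no longer desired; sleep request sent to firmware)
    ON_PEND_SLEEP     -> IN_SLEEP           (firmware reports MCU asleep; doorbell mirror interrupt enabled)
    IN_SLEEP          -> ON_HWCNT_ENABLE    (MCU desired again and L2 is ON; doorbell notification re-enabled)

While the MCU is IN_SLEEP the L2 is also allowed to power down (see the kbase_pm_l2_update_state change below), which is what lets the runtime suspend path complete a full power off after the HW state has been saved.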
@@ -1027,7 +1117,8 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) break; #else /* Do not power off L2 until the MCU has been stopped */ - if (backend->mcu_state != KBASE_MCU_OFF) + if ((backend->mcu_state != KBASE_MCU_OFF) && + (backend->mcu_state != KBASE_MCU_IN_SLEEP)) break; #endif @@ -1608,7 +1699,7 @@ static int kbase_pm_shaders_update_state(struct kbase_device *kbdev) return 0; } -#endif +#endif /* !MALI_USE_CSF */ static bool kbase_pm_is_in_desired_state_nolock(struct kbase_device *kbdev) { @@ -1635,7 +1726,8 @@ static bool kbase_pm_is_in_desired_state_nolock(struct kbase_device *kbdev) kbdev->pm.backend.mcu_state != KBASE_MCU_ON) in_desired_state = false; else if (!kbase_pm_is_mcu_desired(kbdev) && - kbdev->pm.backend.mcu_state != KBASE_MCU_OFF) + (kbdev->pm.backend.mcu_state != KBASE_MCU_OFF) && + (kbdev->pm.backend.mcu_state != KBASE_MCU_IN_SLEEP)) in_desired_state = false; #endif @@ -1734,8 +1826,8 @@ void kbase_pm_update_state(struct kbase_device *kbdev) if (kbase_pm_mcu_update_state(kbdev)) return; - if (prev_mcu_state != KBASE_MCU_OFF && - kbdev->pm.backend.mcu_state == KBASE_MCU_OFF) { + if (!kbase_pm_is_mcu_inactive(kbdev, prev_mcu_state) && + kbase_pm_is_mcu_inactive(kbdev, kbdev->pm.backend.mcu_state)) { if (kbase_pm_l2_update_state(kbdev)) return; } @@ -1828,6 +1920,9 @@ void kbase_pm_reset_start_locked(struct kbase_device *kbdev) */ if (likely(kbdev->csf.firmware_inited)) { backend->mcu_state = KBASE_MCU_RESET_WAIT; +#ifdef KBASE_PM_RUNTIME + backend->exit_gpu_sleep_mode = true; +#endif kbdev->csf.firmware_reload_needed = true; } else { WARN_ON(backend->mcu_state != KBASE_MCU_OFF); @@ -1865,6 +1960,9 @@ void kbase_pm_reset_complete(struct kbase_device *kbdev) */ kbase_gpu_cache_clean_wait_complete(kbdev); backend->in_reset = false; +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + backend->gpu_wakeup_override = false; +#endif kbase_pm_update_state(kbdev); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); @@ -2098,6 +2196,7 @@ static void update_user_reg_page_mapping(struct kbase_device *kbdev) */ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) { + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; bool reset_required = is_resume; unsigned long flags; @@ -2115,7 +2214,13 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) } #endif - if (kbdev->pm.backend.gpu_powered) { + if (backend->gpu_powered) { +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + if (backend->gpu_idled) { + backend->callback_power_runtime_gpu_active(kbdev); + backend->gpu_idled = false; + } +#endif /* Already turned on */ if (kbdev->poweroff_pending) kbase_pm_enable_interrupts(kbdev); @@ -2128,15 +2233,15 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) KBASE_KTRACE_ADD(kbdev, PM_GPU_ON, NULL, 0u); - if (is_resume && kbdev->pm.backend.callback_power_resume) { - kbdev->pm.backend.callback_power_resume(kbdev); + if (is_resume && backend->callback_power_resume) { + backend->callback_power_resume(kbdev); return; - } else if (kbdev->pm.backend.callback_power_on) { - reset_required = kbdev->pm.backend.callback_power_on(kbdev); + } else if (backend->callback_power_on) { + reset_required = backend->callback_power_on(kbdev); } spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbdev->pm.backend.gpu_powered = true; + backend->gpu_powered = true; spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); #if MALI_USE_CSF @@ -2194,8 +2299,8 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) /* Turn on the L2 
caches */ spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbdev->pm.backend.gpu_ready = true; - kbdev->pm.backend.l2_desired = true; + backend->gpu_ready = true; + backend->l2_desired = true; #if MALI_USE_CSF if (reset_required) { /* GPU reset was done after the power on, so send the post @@ -2209,6 +2314,17 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) #endif kbase_pm_update_state(kbdev); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + /* GPU is now powered up. Invoke the GPU active callback as GPU idle + * callback would have been invoked before the power down. + */ + if (backend->gpu_idled) { + backend->callback_power_runtime_gpu_active(kbdev); + backend->gpu_idled = false; + } +#endif + } KBASE_EXPORT_TEST_API(kbase_pm_clock_on); @@ -2252,19 +2368,22 @@ bool kbase_pm_clock_off(struct kbase_device *kbdev) kbase_ipa_control_handle_gpu_power_off(kbdev); #endif - kbdev->pm.backend.gpu_ready = false; - - /* The GPU power may be turned off from this point */ - kbdev->pm.backend.gpu_powered = false; - + if (kbase_is_gpu_removed(kbdev) #ifdef CONFIG_MALI_ARBITER_SUPPORT - if (kbase_pm_is_gpu_lost(kbdev)) { + || kbase_pm_is_gpu_lost(kbdev)) { +#else + ) { +#endif /* Ensure we unblock any threads that are stuck waiting * for the GPU */ kbase_gpu_cache_clean_wait_complete(kbdev); } -#endif + + kbdev->pm.backend.gpu_ready = false; + + /* The GPU power may be turned off from this point */ + kbdev->pm.backend.gpu_powered = false; spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_internal.h b/mali_kbase/backend/gpu/mali_kbase_pm_internal.h index 70d009e..ef26c16 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_internal.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_internal.h @@ -137,6 +137,10 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume); * off. It should be modified during integration to perform the necessary * actions to turn the clock off (if this is possible in the integration). * + * If runtime PM is enabled and @power_runtime_gpu_idle_callback is used + * then this function would usually be invoked from the runtime suspend + * callback function. + * * @kbdev: The kbase device structure for the device (must be a valid * pointer) * @@ -242,7 +246,7 @@ int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev); * NOTE: This may not wait until the correct state is reached if there is a * power off in progress. To correctly wait for the desired state the caller * must ensure that this is not the case by, for example, calling - * kbase_pm_wait_for_poweroff_complete() + * kbase_pm_wait_for_poweroff_work_complete() * * @kbdev: The kbase device structure for the device (must be a valid pointer) * @@ -432,12 +436,25 @@ void kbase_pm_release_gpu_cycle_counter(struct kbase_device *kbdev); void kbase_pm_release_gpu_cycle_counter_nolock(struct kbase_device *kbdev); /** - * kbase_pm_wait_for_poweroff_complete - Wait for the poweroff workqueue to - * complete + * kbase_pm_wait_for_poweroff_work_complete - Wait for the poweroff workqueue to + * complete * * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function effectively just waits for the @gpu_poweroff_wait_work work + * item to complete, if it was enqueued. GPU may not have been powered down + * before this function returns. 
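+ *
+ * (Editor's cross-reference, not part of the patch: callers that must
+ * observe the GPU actually powered down should instead use the new
+ * kbase_pm_wait_for_gpu_power_down() declared below.)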
*/ -void kbase_pm_wait_for_poweroff_complete(struct kbase_device *kbdev); +void kbase_pm_wait_for_poweroff_work_complete(struct kbase_device *kbdev); + +/** + * kbase_pm_wait_for_gpu_power_down - Wait for the GPU power down to complete + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function waits for the actual gpu power down to complete. + */ +void kbase_pm_wait_for_gpu_power_down(struct kbase_device *kbdev); /** * kbase_pm_runtime_init - Initialize runtime-pm for Mali GPU platform device @@ -635,6 +652,7 @@ void kbase_pm_reset_start_locked(struct kbase_device *kbdev); */ void kbase_pm_reset_complete(struct kbase_device *kbdev); +#if !MALI_USE_CSF /** * kbase_pm_protected_override_enable - Enable the protected mode override * @kbdev: Device pointer @@ -707,6 +725,7 @@ int kbase_pm_protected_entry_override_enable(struct kbase_device *kbdev); * to enter protected mode. */ void kbase_pm_protected_entry_override_disable(struct kbase_device *kbdev); +#endif /* If true, the driver should explicitly control corestack power management, * instead of relying on the Power Domain Controller. @@ -737,6 +756,21 @@ bool kbase_pm_is_l2_desired(struct kbase_device *kbdev); bool kbase_pm_is_mcu_desired(struct kbase_device *kbdev); /** + * kbase_pm_is_mcu_inactive - Check if the MCU is inactive (i.e. either + * it is disabled or it is in sleep) + * + * @kbdev: kbase device + * @state: state of the MCU state machine. + * + * This function must be called with hwaccess_lock held. + * L2 cache can be turned off if this function returns true. + * + * Return: true if MCU is inactive + */ +bool kbase_pm_is_mcu_inactive(struct kbase_device *kbdev, + enum kbase_mcu_state state); + +/** * kbase_pm_idle_groups_sched_suspendable - Check whether the scheduler can be * suspended to low power state when all * the CSGs are idle @@ -818,4 +852,83 @@ static inline void kbase_pm_unlock(struct kbase_device *kbdev) #endif /* !MALI_USE_CSF */ } +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) +/** + * kbase_pm_gpu_sleep_allowed - Check if the GPU is allowed to be put in sleep + * + * @kbdev: Device pointer + * + * This function is called on GPU idle notification and if it returns false then + * GPU power down will be triggered by suspending the CSGs and halting the MCU. + * + * Return: true if the GPU is allowed to be in the sleep state. + */ +static inline bool kbase_pm_gpu_sleep_allowed(struct kbase_device *kbdev) +{ + /* If the autosuspend_delay has been set to 0 then it doesn't make + * sense to first put GPU to sleep state and then power it down, + * instead would be better to power it down right away. + * Also need to do the same when autosuspend_delay is set to a negative + * value, which implies that runtime pm is effectively disabled by the + * kernel. + * A high positive value of autosuspend_delay can be used to keep the + * GPU in sleep state for a long time. + */ + if (unlikely(!kbdev->dev->power.autosuspend_delay || + (kbdev->dev->power.autosuspend_delay < 0))) + return false; + + return kbdev->pm.backend.gpu_sleep_supported; +} + +/** + * kbase_pm_enable_db_mirror_interrupt - Enable the doorbell mirror interrupt to + * detect the User doorbell rings. + * + * @kbdev: Device pointer + * + * This function is called just before sending the sleep request to MCU firmware + * so that User doorbell rings can be detected whilst GPU remains in the sleep + * state. 
+ * + */ +static inline void kbase_pm_enable_db_mirror_interrupt(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (!kbdev->pm.backend.db_mirror_interrupt_enabled) { + u32 irq_mask = kbase_reg_read(kbdev, + GPU_CONTROL_REG(GPU_IRQ_MASK)); + + WARN_ON(irq_mask & DOORBELL_MIRROR); + + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), + irq_mask | DOORBELL_MIRROR); + kbdev->pm.backend.db_mirror_interrupt_enabled = true; + } +} + +/** + * kbase_pm_disable_db_mirror_interrupt - Disable the doorbell mirror interrupt. + * + * @kbdev: Device pointer + * + * This function is called when doorbell mirror interrupt is received or MCU + * needs to be reactivated by enabling the doorbell notification. + */ +static inline void kbase_pm_disable_db_mirror_interrupt(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (kbdev->pm.backend.db_mirror_interrupt_enabled) { + u32 irq_mask = kbase_reg_read(kbdev, + GPU_CONTROL_REG(GPU_IRQ_MASK)); + + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), + irq_mask & ~DOORBELL_MIRROR); + kbdev->pm.backend.db_mirror_interrupt_enabled = false; + } +} +#endif + #endif /* _KBASE_BACKEND_PM_INTERNAL_H_ */ diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h b/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h index 4e99928..96f196f 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h @@ -42,6 +42,20 @@ * @POWER_DOWN: MCU halted operations, pending being disabled. * @PEND_OFF: MCU is being disabled, pending on powering off. * @RESET_WAIT: The GPU is resetting, MCU state is unknown. + * @HCTL_SHADERS_PEND_ON: Global configuration requests sent to the firmware + * have completed and shaders have been requested to + * power on. + * @HCTL_CORES_NOTIFY_PEND: Shader cores have powered up and firmware is being + * notified of the mask of enabled shader cores. + * @HCTL_MCU_ON_RECHECK: MCU is on and hwcnt disabling is triggered + * and checks are done to increase the number of + * enabled cores. + * @HCTL_SHADERS_READY_OFF: MCU has halted and cores need to be powered down + * @HCTL_SHADERS_PEND_OFF: Cores are transitioning to power down. + * @ON_SLEEP_INITIATE: MCU is on and hwcnt has been disabled and MCU + * is being put to sleep. + * @ON_PEND_SLEEP: MCU sleep is in progress. + * @IN_SLEEP: Sleep request is completed and MCU has halted. 
*/ KBASEP_MCU_STATE(OFF) KBASEP_MCU_STATE(PEND_ON_RELOAD) @@ -61,3 +75,7 @@ KBASEP_MCU_STATE(HCTL_CORES_NOTIFY_PEND) KBASEP_MCU_STATE(HCTL_MCU_ON_RECHECK) KBASEP_MCU_STATE(HCTL_SHADERS_READY_OFF) KBASEP_MCU_STATE(HCTL_SHADERS_PEND_OFF) +/* Additional MCU states to support GPU sleep feature */ +KBASEP_MCU_STATE(ON_SLEEP_INITIATE) +KBASEP_MCU_STATE(ON_PEND_SLEEP) +KBASEP_MCU_STATE(IN_SLEEP) diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_policy.c b/mali_kbase/backend/gpu/mali_kbase_pm_policy.c index cf61ef8..7b126a1 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_policy.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_policy.c @@ -183,7 +183,7 @@ void kbase_pm_update_dynamic_cores_onoff(struct kbase_device *kbdev) void kbase_pm_update_cores_state_nolock(struct kbase_device *kbdev) { - bool shaders_desired; + bool shaders_desired = false; lockdep_assert_held(&kbdev->hwaccess_lock); @@ -192,6 +192,7 @@ void kbase_pm_update_cores_state_nolock(struct kbase_device *kbdev) if (kbdev->pm.backend.poweroff_wait_in_progress) return; +#if !MALI_USE_CSF if (kbdev->pm.backend.protected_transition_override) /* We are trying to change in/out of protected mode - force all * cores off so that the L2 powers down @@ -199,15 +200,8 @@ void kbase_pm_update_cores_state_nolock(struct kbase_device *kbdev) shaders_desired = false; else shaders_desired = kbdev->pm.backend.pm_current_policy->shaders_needed(kbdev); - -#if MALI_USE_CSF - /* On CSF GPUs, Host driver isn't supposed to do the power management - * for shader cores. CSF firmware will power up the cores appropriately - * and so from Driver's standpoint 'shaders_desired' flag shall always - * remain 0. - */ - shaders_desired = false; #endif + if (kbdev->pm.backend.shaders_desired != shaders_desired) { KBASE_KTRACE_ADD(kbdev, PM_CORES_CHANGE_DESIRED, NULL, kbdev->pm.backend.shaders_desired); diff --git a/mali_kbase/backend/gpu/mali_kbase_time.c b/mali_kbase/backend/gpu/mali_kbase_time.c index d10e404..92a366b 100644 --- a/mali_kbase/backend/gpu/mali_kbase_time.c +++ b/mali_kbase/backend/gpu/mali_kbase_time.c @@ -23,6 +23,7 @@ #include <mali_kbase_hwaccess_time.h> #include <device/mali_kbase_device.h> #include <backend/gpu/mali_kbase_pm_internal.h> +#include <mali_kbase_config_defaults.h> void kbase_backend_get_gpu_time_norequest(struct kbase_device *kbdev, u64 *cycle_counter, @@ -31,18 +32,8 @@ void kbase_backend_get_gpu_time_norequest(struct kbase_device *kbdev, { u32 hi1, hi2; - if (cycle_counter) { - /* Read hi, lo, hi to ensure a coherent u64 */ - do { - hi1 = kbase_reg_read(kbdev, - GPU_CONTROL_REG(CYCLE_COUNT_HI)); - *cycle_counter = kbase_reg_read(kbdev, - GPU_CONTROL_REG(CYCLE_COUNT_LO)); - hi2 = kbase_reg_read(kbdev, - GPU_CONTROL_REG(CYCLE_COUNT_HI)); - } while (hi1 != hi2); - *cycle_counter |= (((u64) hi1) << 32); - } + if (cycle_counter) + *cycle_counter = kbase_backend_get_cycle_cnt(kbdev); if (system_time) { /* Read hi, lo, hi to ensure a coherent u64 */ @@ -107,3 +98,66 @@ void kbase_backend_get_gpu_time(struct kbase_device *kbdev, u64 *cycle_counter, kbase_pm_release_gpu_cycle_counter(kbdev); #endif } + +unsigned int kbase_get_timeout_ms(struct kbase_device *kbdev, + enum kbase_timeout_selector selector) +{ + /* Timeout calculation: + * dividing number of cycles by freq in KHz automatically gives value + * in milliseconds. nr_cycles will have to be multiplied by 1e3 to + * get result in microseconds, and 1e6 to get result in nanoseconds. 
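+ *
+ * Worked example (editor's illustration, not part of the original
+ * patch): with nr_cycles = 100000000 and freq_khz = 100000 (i.e. a
+ * 100 MHz lowest GPU frequency), div_u64(nr_cycles, freq_khz) = 1000,
+ * giving a 1000 ms timeout.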
+ */ + + u64 timeout, nr_cycles = 0; + u64 freq_khz = kbdev->lowest_gpu_freq_khz; + + WARN_ON(!freq_khz); + + switch (selector) { + /* use Firmware timeout if invalid selection */ + default: +#if !MALI_USE_CSF + WARN(1, "Invalid timeout selector used! Using default value"); + timeout = JM_DEFAULT_TIMEOUT_CYCLES; + CSTD_UNUSED(nr_cycles); +#else + WARN(1, + "Invalid timeout selector used! Using CSF Firmware timeout"); + fallthrough; + case CSF_FIRMWARE_TIMEOUT: + nr_cycles = CSF_FIRMWARE_TIMEOUT_CYCLES; + timeout = div_u64(nr_cycles, freq_khz); + /* cap CSF FW timeout to FIRMWARE_PING_INTERVAL_MS + * if calculated timeout exceeds it. This should be adapted to a + * direct timeout comparison once the FIRMWARE_PING_INTERVAL_MS + * option is added to this timeout function. A compile-time check + * such as BUILD_BUG_ON can also be done once the firmware ping + * interval in cycles becomes available as a macro. + */ + if (timeout > FIRMWARE_PING_INTERVAL_MS) { + dev_dbg(kbdev->dev, "Capped CSF_FIRMWARE_TIMEOUT %llu to %d", + timeout, FIRMWARE_PING_INTERVAL_MS); + timeout = FIRMWARE_PING_INTERVAL_MS; + } +#endif + break; + } + return (unsigned int)timeout; +} + +u64 kbase_backend_get_cycle_cnt(struct kbase_device *kbdev) +{ + u32 hi1, hi2, lo; + + /* Read hi, lo, hi to ensure a coherent u64 */ + do { + hi1 = kbase_reg_read(kbdev, + GPU_CONTROL_REG(CYCLE_COUNT_HI)); + lo = kbase_reg_read(kbdev, + GPU_CONTROL_REG(CYCLE_COUNT_LO)); + hi2 = kbase_reg_read(kbdev, + GPU_CONTROL_REG(CYCLE_COUNT_HI)); + } while (hi1 != hi2); + + return lo | (((u64) hi1) << 32); +} diff --git a/mali_kbase/build.bp b/mali_kbase/build.bp index 979e06f..030af9d 100644 --- a/mali_kbase/build.bp +++ b/mali_kbase/build.bp @@ -154,7 +154,9 @@ bob_defaults { // (catch-all for experimental CS code without separating it into // different features). 
"MALI_INCREMENTAL_RENDERING={{.incremental_rendering}}", - "GPU_TIMESTAMP_CORRECTION={{.gpu_timestamp_correction}}", + "MALI_GPU_TIMESTAMP_CORRECTION={{.gpu_timestamp_correction}}", + "MALI_BASE_CSF_PERFORMANCE_TESTS={{.base_csf_performance_tests}}", + "MALI_GPU_TIMESTAMP_INTERPOLATION={{.gpu_timestamp_interpolation}}", ], } diff --git a/mali_kbase/context/mali_kbase_context.c b/mali_kbase/context/mali_kbase_context.c index b2e7025..85f4c0a 100644 --- a/mali_kbase/context/mali_kbase_context.c +++ b/mali_kbase/context/mali_kbase_context.c @@ -283,7 +283,7 @@ int kbase_context_mmu_init(struct kbase_context *kctx) { return kbase_mmu_init( kctx->kbdev, &kctx->mmu, kctx, - base_context_mmu_group_id_get(kctx->create_flags)); + kbase_context_mmu_group_id_get(kctx->create_flags)); } void kbase_context_mmu_term(struct kbase_context *kctx) diff --git a/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c b/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c index a62cafa..ce6d546 100644 --- a/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c +++ b/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c @@ -253,7 +253,7 @@ static inline void calc_prfcnt_delta(struct kbase_device *kbdev, if (!WARN_ON_ONCE(kbdev->csf.ipa_control.cur_gpu_rate == 0)) if (prfcnt->gpu_norm) - delta_value /= kbdev->csf.ipa_control.cur_gpu_rate; + delta_value = div_u64(delta_value, kbdev->csf.ipa_control.cur_gpu_rate); prfcnt->latest_raw_value = raw_value; @@ -300,17 +300,20 @@ kbase_ipa_control_rate_change_notify(struct kbase_clk_rate_listener *listener, /* Interrupts are already disabled and interrupt state is also saved */ spin_lock(&ipa_ctrl->lock); - for (i = 0; i < ipa_ctrl->num_active_sessions; i++) { - size_t j; + for (i = 0; i < KBASE_IPA_CONTROL_MAX_SESSIONS; i++) { struct kbase_ipa_control_session *session = &ipa_ctrl->sessions[i]; - for (j = 0; j < session->num_prfcnts; j++) { - struct kbase_ipa_control_prfcnt *prfcnt = - &session->prfcnts[j]; + if (session->active) { + size_t j; + + for (j = 0; j < session->num_prfcnts; j++) { + struct kbase_ipa_control_prfcnt *prfcnt = + &session->prfcnts[j]; - if (prfcnt->gpu_norm) - calc_prfcnt_delta(kbdev, prfcnt, true); - } + if (prfcnt->gpu_norm) + calc_prfcnt_delta(kbdev, prfcnt, true); + } + } } ipa_ctrl->cur_gpu_rate = clk_rate_hz; @@ -480,16 +483,21 @@ static int session_gpu_start(struct kbase_device *kbdev, */ if (!ret) { if (session) { + /* On starting a session, value read is required for + * IPA power model's calculation initialization. 
+ */ session_read_raw_values(kbdev, session); } else { size_t session_idx; for (session_idx = 0; - session_idx < ipa_ctrl->num_active_sessions; - session_idx++) - session_read_raw_values( - kbdev, - &ipa_ctrl->sessions[session_idx]); + session_idx < KBASE_IPA_CONTROL_MAX_SESSIONS; + session_idx++) { + struct kbase_ipa_control_session *session_to_check = &ipa_ctrl->sessions[session_idx]; + + if (session_to_check->active) + session_read_raw_values(kbdev, session_to_check); + } } } @@ -783,6 +791,12 @@ int kbase_ipa_control_query(struct kbase_device *kbdev, const void *client, ipa_ctrl = &kbdev->csf.ipa_control; session = (struct kbase_ipa_control_session *)client; + if (WARN_ON(!session->active)) { + dev_err(kbdev->dev, + "%s: attempt to query inactive session", __func__); + return -EINVAL; + } + if (WARN_ON(num_values < session->num_prfcnts)) { dev_err(kbdev->dev, "%s: not enough space (%zu) to return all counter values (%zu)", @@ -860,20 +874,23 @@ void kbase_ipa_control_handle_gpu_power_off(struct kbase_device *kbdev) ret); } - for (session_idx = 0; session_idx < ipa_ctrl->num_active_sessions; + for (session_idx = 0; session_idx < KBASE_IPA_CONTROL_MAX_SESSIONS; session_idx++) { + struct kbase_ipa_control_session *session = &ipa_ctrl->sessions[session_idx]; - size_t i; - for (i = 0; i < session->num_prfcnts; i++) { - struct kbase_ipa_control_prfcnt *prfcnt = - &session->prfcnts[i]; + if (session->active) { + size_t i; - calc_prfcnt_delta(kbdev, prfcnt, true); + for (i = 0; i < session->num_prfcnts; i++) { + struct kbase_ipa_control_prfcnt *prfcnt = + &session->prfcnts[i]; + + calc_prfcnt_delta(kbdev, prfcnt, true); + } } } - spin_unlock(&ipa_ctrl->lock); } @@ -975,13 +992,17 @@ void kbase_ipa_control_protm_exited(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->hwaccess_lock); - for (i = 0; i < ipa_ctrl->num_active_sessions; i++) { + for (i = 0; i < KBASE_IPA_CONTROL_MAX_SESSIONS; i++) { + struct kbase_ipa_control_session *session = &ipa_ctrl->sessions[i]; - u64 protm_time = time_now - MAX(session->last_query_time, - ipa_ctrl->protm_start); - session->protm_time += protm_time; + if (session->active) { + u64 protm_time = time_now - MAX(session->last_query_time, + ipa_ctrl->protm_start); + + session->protm_time += protm_time; + } } /* Acknowledge the protected_mode bit in the IPA_CONTROL STATUS diff --git a/mali_kbase/csf/mali_kbase_csf.c b/mali_kbase/csf/mali_kbase_csf.c index d49e343..142e5a8 100644 --- a/mali_kbase/csf/mali_kbase_csf.c +++ b/mali_kbase/csf/mali_kbase_csf.c @@ -32,6 +32,7 @@ #include <mmu/mali_kbase_mmu.h> #include "mali_kbase_csf_timeout.h" #include <csf/ipa_control/mali_kbase_csf_ipa_control.h> +#include <mali_kbase_hwaccess_time.h> #define CS_REQ_EXCEPTION_MASK (CS_REQ_FAULT_MASK | CS_REQ_FATAL_MASK) #define CS_ACK_EXCEPTION_MASK (CS_ACK_FAULT_MASK | CS_ACK_FATAL_MASK) @@ -140,7 +141,7 @@ static void gpu_munmap_user_io_pages(struct kbase_context *kctx, WARN_ON(reg->flags & KBASE_REG_FREE); mutex_lock(&kctx->kbdev->csf.reg_lock); - kbase_remove_va_region(reg); + kbase_remove_va_region(kctx->kbdev, reg); mutex_unlock(&kctx->kbdev->csf.reg_lock); } @@ -171,6 +172,11 @@ static int gpu_mmap_user_io_pages(struct kbase_device *kbdev, const size_t num_pages = 2; int ret; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + #if ((KERNEL_VERSION(4, 4, 147) >= LINUX_VERSION_CODE) || \ ((KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE) && \ (KERNEL_VERSION(4, 5, 0) <= LINUX_VERSION_CODE))) @@ -195,19 +201,18 @@ static int gpu_mmap_user_io_pages(struct kbase_device *kbdev, return ret; /* Map input page */ - ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, - reg->start_pfn, &phys[0], - 1, mem_flags, MCU_AS_NR, - KBASE_MEM_GROUP_CSF_IO); + ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, reg->start_pfn, + &phys[0], 1, mem_flags, MCU_AS_NR, + KBASE_MEM_GROUP_CSF_IO, mmu_sync_info); if (ret) goto bad_insert; /* Map output page, it needs rw access */ mem_flags |= KBASE_REG_GPU_WR; ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, - reg->start_pfn + 1, &phys[1], - 1, mem_flags, MCU_AS_NR, - KBASE_MEM_GROUP_CSF_IO); + reg->start_pfn + 1, &phys[1], 1, mem_flags, + MCU_AS_NR, KBASE_MEM_GROUP_CSF_IO, + mmu_sync_info); if (ret) goto bad_insert_output_page; @@ -218,7 +223,7 @@ bad_insert_output_page: reg->start_pfn, 1, MCU_AS_NR); bad_insert: mutex_lock(&kbdev->csf.reg_lock); - kbase_remove_va_region(reg); + kbase_remove_va_region(kbdev, reg); mutex_unlock(&kbdev->csf.reg_lock); return ret; @@ -475,7 +480,7 @@ static int csf_queue_register_internal(struct kbase_context *kctx, /* Only one pointer expected, otherwise coding error */ if ((reg == NULL && reg_ex == NULL) || (reg && reg_ex)) { - dev_err(kctx->kbdev->dev, + dev_dbg(kctx->kbdev->dev, "Error, one and only one param-ptr expected!"); return -EINVAL; } @@ -1053,6 +1058,11 @@ static int create_normal_suspend_buffer(struct kbase_context *const kctx, PFN_UP(kctx->kbdev->csf.global_iface.groups[0].suspend_size); int err = 0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + lockdep_assert_held(&kctx->csf.lock); /* Allocate and initialize Region Object */ @@ -1090,9 +1100,9 @@ static int create_normal_suspend_buffer(struct kbase_context *const kctx, /* Update MMU table */ err = kbase_mmu_insert_pages(kctx->kbdev, &kctx->kbdev->csf.mcu_mmu, - reg->start_pfn, &s_buf->phy[0], - nr_pages, mem_flags, - MCU_AS_NR, KBASE_MEM_GROUP_CSF_FW); + reg->start_pfn, &s_buf->phy[0], nr_pages, + mem_flags, MCU_AS_NR, + KBASE_MEM_GROUP_CSF_FW, mmu_sync_info); if (err) goto mmu_insert_failed; @@ -1102,7 +1112,7 @@ static int create_normal_suspend_buffer(struct kbase_context *const kctx, mmu_insert_failed: mutex_lock(&kctx->kbdev->csf.reg_lock); - WARN_ON(kbase_remove_va_region(reg)); + kbase_remove_va_region(kctx->kbdev, reg); mutex_unlock(&kctx->kbdev->csf.reg_lock); add_va_region_failed: @@ -1138,6 +1148,11 @@ static int create_protected_suspend_buffer(struct kbase_device *const kbdev, PFN_UP(kbdev->csf.global_iface.groups[0].suspend_size); int err = 0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + /* Allocate and initialize Region Object */ reg = kbase_alloc_free_region(&kbdev->csf.shared_reg_rbtree, 0, nr_pages, KBASE_REG_ZONE_MCU_SHARED); @@ -1170,10 +1185,9 @@ static int create_protected_suspend_buffer(struct kbase_device *const kbdev, goto add_va_region_failed; /* Update MMU table */ - err = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, - reg->start_pfn, phys, - nr_pages, mem_flags, MCU_AS_NR, - KBASE_MEM_GROUP_CSF_FW); + err = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, reg->start_pfn, + phys, nr_pages, mem_flags, MCU_AS_NR, + KBASE_MEM_GROUP_CSF_FW, mmu_sync_info); if (err) goto mmu_insert_failed; @@ -1183,7 +1197,7 @@ static int create_protected_suspend_buffer(struct kbase_device *const kbdev, mmu_insert_failed: mutex_lock(&kbdev->csf.reg_lock); - WARN_ON(kbase_remove_va_region(reg)); + kbase_remove_va_region(kbdev, reg); mutex_unlock(&kbdev->csf.reg_lock); add_va_region_failed: @@ -1244,16 +1258,9 @@ static int create_suspend_buffers(struct kbase_context *const kctx, */ static u32 generate_group_uid(void) { - /* use first KBase device to store max UID */ - struct kbase_device *kbdev = kbase_find_device(-1); - u32 uid = 1; - - if (kbdev) - uid = (u32) atomic_inc_return(&kbdev->group_max_uid_in_devices); - else - WARN(1, "NULL kbase device pointer in group UID generation"); + static atomic_t global_csg_uid = ATOMIC_INIT(0); - return uid; + return (u32)atomic_inc_return(&global_csg_uid); } /** @@ -1272,8 +1279,8 @@ static int create_queue_group(struct kbase_context *const kctx, int group_handle = find_free_group_handle(kctx); if (group_handle < 0) { - dev_err(kctx->kbdev->dev, - "All queue group handles are already in use\n"); + dev_dbg(kctx->kbdev->dev, + "All queue group handles are already in use"); } else { struct kbase_queue_group * const group = kmalloc(sizeof(struct kbase_queue_group), @@ -1349,16 +1356,16 @@ int kbase_csf_queue_group_create(struct kbase_context *const kctx, if ((create->in.tiler_max > tiler_count) || (create->in.fragment_max > fragment_count) || (create->in.compute_max > compute_count)) { - dev_err(kctx->kbdev->dev, - "Invalid maximum number of endpoints for a queue group\n"); + dev_dbg(kctx->kbdev->dev, + "Invalid maximum number of endpoints for a queue group"); err = -EINVAL; } else if (create->in.priority >= BASE_QUEUE_GROUP_PRIORITY_COUNT) { - dev_err(kctx->kbdev->dev, "Invalid queue group priority %u\n", + dev_dbg(kctx->kbdev->dev, "Invalid queue group priority %u", (unsigned int)create->in.priority); err = -EINVAL; } else if (!iface_has_enough_streams(kctx->kbdev, create->in.cs_min)) { - dev_err(kctx->kbdev->dev, - "No CSG has at least %d CSs\n", + dev_dbg(kctx->kbdev->dev, + "No CSG has at least %d CSs", create->in.cs_min); err = -EINVAL; } else { @@ -1403,7 +1410,7 @@ static void term_normal_suspend_buffer(struct kbase_context *const kctx, WARN_ON(s_buf->reg->flags & KBASE_REG_FREE); mutex_lock(&kctx->kbdev->csf.reg_lock); - WARN_ON(kbase_remove_va_region(s_buf->reg)); + kbase_remove_va_region(kctx->kbdev, s_buf->reg); mutex_unlock(&kctx->kbdev->csf.reg_lock); kbase_mem_pool_free_pages( @@ -1436,7 +1443,7 @@ static void term_protected_suspend_buffer(struct kbase_device *const kbdev, WARN_ON(s_buf->reg->flags & KBASE_REG_FREE); mutex_lock(&kbdev->csf.reg_lock); - WARN_ON(kbase_remove_va_region(s_buf->reg)); + kbase_remove_va_region(kbdev, s_buf->reg); mutex_unlock(&kbdev->csf.reg_lock); kbase_csf_protected_memory_free(kbdev, s_buf->pma, nr_pages); @@ 
-1994,6 +2001,26 @@ bool kbase_csf_error_pending(struct kbase_context *kctx) return event_pended; } +static void sync_update_notify_gpu(struct kbase_context *kctx) +{ + bool can_notify_gpu; + unsigned long flags; + + spin_lock_irqsave(&kctx->kbdev->hwaccess_lock, flags); + can_notify_gpu = kctx->kbdev->pm.backend.gpu_powered; +#ifdef KBASE_PM_RUNTIME + if (kctx->kbdev->pm.backend.gpu_sleep_mode_active) + can_notify_gpu = false; +#endif + + if (can_notify_gpu) { + kbase_csf_ring_doorbell(kctx->kbdev, CSF_KERNEL_DOORBELL_NR); + KBASE_KTRACE_ADD(kctx->kbdev, SYNC_UPDATE_EVENT_NOTIFY_GPU, kctx, 0u); + } + + spin_unlock_irqrestore(&kctx->kbdev->hwaccess_lock, flags); +} + void kbase_csf_event_signal(struct kbase_context *kctx, bool notify_gpu) { struct kbase_csf_event *event, *next_event; @@ -2014,13 +2041,8 @@ void kbase_csf_event_signal(struct kbase_context *kctx, bool notify_gpu) * synch object wait operations are re-evaluated on a write to any * CS_DOORBELL/GLB_DOORBELL register. */ - if (notify_gpu) { - spin_lock_irqsave(&kctx->kbdev->hwaccess_lock, flags); - if (kctx->kbdev->pm.backend.gpu_powered) - kbase_csf_ring_doorbell(kctx->kbdev, CSF_KERNEL_DOORBELL_NR); - KBASE_KTRACE_ADD(kctx->kbdev, SYNC_UPDATE_EVENT_NOTIFY_GPU, kctx, 0u); - spin_unlock_irqrestore(&kctx->kbdev->hwaccess_lock, flags); - } + if (notify_gpu) + sync_update_notify_gpu(kctx); /* Now invoke the callbacks registered on backend side. * Allow item removal inside the loop, if requested by the callback. @@ -2364,31 +2386,6 @@ static void protm_event_worker(struct work_struct *data) group, 0u); } -static void report_queue_fatal_error(struct kbase_queue *const queue, - u32 cs_fatal, u64 cs_fatal_info, - u8 group_handle) -{ - struct base_csf_notification error = - { .type = BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, - .payload = { - .csg_error = { - .handle = group_handle, - .error = { - .error_type = - BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, - .payload = { - .fatal_queue = { - .sideband = - cs_fatal_info, - .status = cs_fatal, - .csi_index = - queue->csi_index, - } } } } } }; - - add_error(queue->kctx, &queue->error, &error); - kbase_event_wakeup(queue->kctx); -} - /** * handle_fault_event - Handler for CS fault. 
* @@ -2429,10 +2426,34 @@ handle_fault_event(struct kbase_queue *const queue, kbase_gpu_exception_name(cs_fault_exception_type), cs_fault_exception_data, cs_fault_info_exception_data); - if (cs_fault_exception_type == - CS_FAULT_EXCEPTION_TYPE_RESOURCE_EVICTION_TIMEOUT) - report_queue_fatal_error(queue, GPU_EXCEPTION_TYPE_SW_FAULT_2, - 0, queue->group->handle); +} + +static void report_queue_fatal_error(struct kbase_queue *const queue, + u32 cs_fatal, u64 cs_fatal_info, + u8 group_handle) +{ + struct base_csf_notification error = { + .type = BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + .payload = { + .csg_error = { + .handle = group_handle, + .error = { + .error_type = + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + .payload = { + .fatal_queue = { + .sideband = cs_fatal_info, + .status = cs_fatal, + .csi_index = queue->csi_index, + } + } + } + } + } + }; + + add_error(queue->kctx, &queue->error, &error); + kbase_event_wakeup(queue->kctx); } /** @@ -2531,6 +2552,7 @@ handle_fatal_event(struct kbase_queue *const queue, if (!queue_work(queue->kctx->csf.wq, &queue->fatal_event_work)) release_queue(queue); } + } /** @@ -2757,9 +2779,14 @@ static void process_csg_interrupts(struct kbase_device *const kbdev, group->handle, csg_nr); /* Check if the scheduling tick can be advanced */ - if (kbase_csf_scheduler_all_csgs_idle(kbdev) && - !scheduler->gpu_idle_fw_timer_enabled) { - kbase_csf_scheduler_advance_tick_nolock(kbdev); + if (kbase_csf_scheduler_all_csgs_idle(kbdev)) { + if (!scheduler->gpu_idle_fw_timer_enabled) + kbase_csf_scheduler_advance_tick_nolock(kbdev); + } else if (atomic_read(&scheduler->non_idle_offslot_grps)) { + /* If there are non-idle CSGs waiting for a slot, fire + * a tock for a replacement. + */ + mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0); } } @@ -2770,7 +2797,8 @@ static void process_csg_interrupts(struct kbase_device *const kbdev, KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_PROGRESS_TIMER_INTERRUPT, group, req ^ ack); dev_info(kbdev->dev, - "Timeout notification received for group %u of ctx %d_%d on slot %d\n", + "[%llu] Iterator PROGRESS_TIMER timeout notification received for group %u of ctx %d_%d on slot %d\n", + kbase_backend_get_cycle_cnt(kbdev), group->handle, group->kctx->tgid, group->kctx->id, csg_nr); handle_progress_timer_event(group); @@ -2868,6 +2896,79 @@ static void process_prfcnt_interrupts(struct kbase_device *kbdev, u32 glb_req, } } +/** + * check_protm_enter_req_complete - Check if PROTM_ENTER request completed + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * @glb_req: Global request register value. + * @glb_ack: Global acknowledge register value. + * + * This function checks if the PROTM_ENTER Global request had completed and + * appropriately sends notification about the protected mode entry to components + * like IPA, HWC, IPA_CONTROL. 
+ */ +static inline void check_protm_enter_req_complete(struct kbase_device *kbdev, + u32 glb_req, u32 glb_ack) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + kbase_csf_scheduler_spin_lock_assert_held(kbdev); + + if (likely(!kbdev->csf.scheduler.active_protm_grp)) + return; + + if (kbdev->protected_mode) + return; + + if ((glb_req & GLB_REQ_PROTM_ENTER_MASK) != + (glb_ack & GLB_REQ_PROTM_ENTER_MASK)) + return; + + dev_dbg(kbdev->dev, "Protected mode entry interrupt received"); + + kbdev->protected_mode = true; + kbase_ipa_protection_mode_switch_event(kbdev); + kbase_ipa_control_protm_entered(kbdev); + kbase_hwcnt_backend_csf_protm_entered(&kbdev->hwcnt_gpu_iface); +} + +/** + * process_protm_exit - Handle the protected mode exit interrupt + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * @glb_ack: Global acknowledge register value. + * + * This function handles the PROTM_EXIT interrupt and sends notification + * about the protected mode exit to components like HWC, IPA_CONTROL. + */ +static inline void process_protm_exit(struct kbase_device *kbdev, u32 glb_ack) +{ + const struct kbase_csf_global_iface *const global_iface = + &kbdev->csf.global_iface; + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + + lockdep_assert_held(&kbdev->hwaccess_lock); + kbase_csf_scheduler_spin_lock_assert_held(kbdev); + + dev_dbg(kbdev->dev, "Protected mode exit interrupt received"); + + kbase_csf_firmware_global_input_mask(global_iface, GLB_REQ, glb_ack, + GLB_REQ_PROTM_EXIT_MASK); + + if (likely(scheduler->active_protm_grp)) { + KBASE_KTRACE_ADD_CSF_GRP(kbdev, SCHEDULER_EXIT_PROTM, + scheduler->active_protm_grp, 0u); + scheduler->active_protm_grp = NULL; + } else { + dev_warn(kbdev->dev, "PROTM_EXIT interrupt after no pmode group"); + } + + if (!WARN_ON(!kbdev->protected_mode)) { + kbdev->protected_mode = false; + kbase_ipa_control_protm_exited(kbdev); + kbase_hwcnt_backend_csf_protm_exited(&kbdev->hwcnt_gpu_iface); + } +} + void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val) { unsigned long flags; @@ -2898,19 +2999,10 @@ void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val) global_iface, GLB_ACK); KBASE_KTRACE_ADD(kbdev, GLB_REQ_ACQ, NULL, glb_req ^ glb_ack); - if ((glb_req ^ glb_ack) & GLB_REQ_PROTM_EXIT_MASK) { - dev_dbg(kbdev->dev, "Protected mode exit interrupt received"); - kbase_csf_firmware_global_input_mask( - global_iface, GLB_REQ, glb_ack, - GLB_REQ_PROTM_EXIT_MASK); - WARN_ON(!kbase_csf_scheduler_protected_mode_in_use(kbdev)); - KBASE_KTRACE_ADD_CSF_GRP(kbdev, SCHEDULER_EXIT_PROTM, scheduler->active_protm_grp, 0u); - scheduler->active_protm_grp = NULL; - kbdev->protected_mode = false; - kbase_ipa_control_protm_exited(kbdev); - kbase_hwcnt_backend_csf_protm_exited( - &kbdev->hwcnt_gpu_iface); - } + check_protm_enter_req_complete(kbdev, glb_req, glb_ack); + + if ((glb_req ^ glb_ack) & GLB_REQ_PROTM_EXIT_MASK) + process_protm_exit(kbdev, glb_ack); /* Handle IDLE Hysteresis notification event */ if ((glb_req ^ glb_ack) & GLB_REQ_IDLE_EVENT_MASK) { @@ -3066,4 +3158,3 @@ u8 kbase_csf_priority_check(struct kbase_device *kbdev, u8 req_priority) return out_priority; } - diff --git a/mali_kbase/csf/mali_kbase_csf.h b/mali_kbase/csf/mali_kbase_csf.h index e3bd436..640d2ed 100644 --- a/mali_kbase/csf/mali_kbase_csf.h +++ b/mali_kbase/csf/mali_kbase_csf.h @@ -39,10 +39,13 @@ */ #define KBASEP_USER_DB_NR_INVALID ((s8)-1) -#define FIRMWARE_PING_INTERVAL_MS (4000) /* 4 seconds */ +#define FIRMWARE_PING_INTERVAL_MS (8000) /* 8 seconds */ 
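Editor's note: the doubling of FIRMWARE_PING_INTERVAL_MS from 4000 ms to 8000 ms also raises the upper bound applied to the CSF_FIRMWARE_TIMEOUT value in kbase_get_timeout_ms() (see the backend/gpu/mali_kbase_time.c hunk earlier in this diff), since the computed timeout is capped to this constant.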
#define FIRMWARE_IDLE_HYSTERESIS_TIME_MS (10) /* Default 10 milliseconds */ +/* Idle hysteresis time can be scaled down when GPU sleep feature is used */ +#define FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER (5) + /** * enum kbase_csf_event_callback_action - return type for CSF event callbacks. * diff --git a/mali_kbase/csf/mali_kbase_csf_csg_debugfs.c b/mali_kbase/csf/mali_kbase_csf_csg_debugfs.c index 14deb98..40bee79 100644 --- a/mali_kbase/csf/mali_kbase_csf_csg_debugfs.c +++ b/mali_kbase/csf/mali_kbase_csf_csg_debugfs.c @@ -24,10 +24,32 @@ #include <linux/seq_file.h> #include <linux/delay.h> #include <csf/mali_kbase_csf_trace_buffer.h> +#include <backend/gpu/mali_kbase_pm_internal.h> #if IS_ENABLED(CONFIG_DEBUG_FS) #include "mali_kbase_csf_tl_reader.h" +#define MAX_SCHED_STATE_STRING_LEN (16) +static const char *scheduler_state_to_string(struct kbase_device *kbdev, + enum kbase_csf_scheduler_state sched_state) +{ + switch (sched_state) { + case SCHED_BUSY: + return "BUSY"; + case SCHED_INACTIVE: + return "INACTIVE"; + case SCHED_SUSPENDED: + return "SUSPENDED"; +#ifdef KBASE_PM_RUNTIME + case SCHED_SLEEPING: + return "SLEEPING"; +#endif + default: + dev_warn(kbdev->dev, "Unknown Scheduler state %d", sched_state); + return NULL; + } +} + /** * blocked_reason_to_string() - Convert blocking reason id to a string * @@ -142,10 +164,6 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file, !queue->group)) return; - /* Ring the doorbell to have firmware update CS_EXTRACT */ - kbase_csf_ring_cs_user_doorbell(queue->kctx->kbdev, queue); - msleep(100); - addr = (u32 *)queue->user_io_addr; cs_insert = addr[CS_INSERT_LO/4] | ((u64)addr[CS_INSERT_HI/4] << 32); @@ -253,32 +271,68 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file, /* Waiting timeout for STATUS_UPDATE acknowledgment, in milliseconds */ #define CSF_STATUS_UPDATE_TO_MS (100) +static void update_active_group_status(struct seq_file *file, + struct kbase_queue_group *const group) +{ + struct kbase_device *const kbdev = group->kctx->kbdev; + struct kbase_csf_cmd_stream_group_info const *const ginfo = + &kbdev->csf.global_iface.groups[group->csg_nr]; + long remaining = + kbase_csf_timeout_in_jiffies(CSF_STATUS_UPDATE_TO_MS); + unsigned long flags; + + /* Global doorbell ring for CSG STATUS_UPDATE request or User doorbell + * ring for Extract offset update, shall not be made when MCU has been + * put to sleep otherwise it will undesirably make MCU exit the sleep + * state. Also it isn't really needed as FW will implicitly update the + * status of all on-slot groups when MCU sleep request is sent to it. + */ + if (kbdev->csf.scheduler.state == SCHED_SLEEPING) + return; + + /* Ring the User doobell shared between the queues bound to this + * group, to have FW update the CS_EXTRACT for all the queues + * bound to the group. Ring early so that FW gets adequate time + * for the handling. 
+ */ + kbase_csf_ring_doorbell(kbdev, group->doorbell_nr); + + kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, + ~kbase_csf_firmware_csg_output(ginfo, CSG_ACK), + CSG_REQ_STATUS_UPDATE_MASK); + kbase_csf_scheduler_spin_unlock(kbdev, flags); + kbase_csf_ring_csg_doorbell(kbdev, group->csg_nr); + + remaining = wait_event_timeout(kbdev->csf.event_wait, + !((kbase_csf_firmware_csg_input_read(ginfo, CSG_REQ) ^ + kbase_csf_firmware_csg_output(ginfo, CSG_ACK)) & + CSG_REQ_STATUS_UPDATE_MASK), remaining); + + if (!remaining) { + dev_err(kbdev->dev, + "Timed out for STATUS_UPDATE on group %d on slot %d", + group->handle, group->csg_nr); + + seq_printf(file, "*** Warn: Timed out for STATUS_UPDATE on slot %d\n", + group->csg_nr); + seq_puts(file, "*** The following group-record is likely stale\n"); + } +} + static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file, struct kbase_queue_group *const group) { if (kbase_csf_scheduler_group_get_slot(group) >= 0) { struct kbase_device *const kbdev = group->kctx->kbdev; - unsigned long flags; u32 ep_c, ep_r; char exclusive; struct kbase_csf_cmd_stream_group_info const *const ginfo = &kbdev->csf.global_iface.groups[group->csg_nr]; - long remaining = - kbase_csf_timeout_in_jiffies(CSF_STATUS_UPDATE_TO_MS); u8 slot_priority = kbdev->csf.scheduler.csg_slots[group->csg_nr].priority; - kbase_csf_scheduler_spin_lock(kbdev, &flags); - kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, - ~kbase_csf_firmware_csg_output(ginfo, CSG_ACK), - CSG_REQ_STATUS_UPDATE_MASK); - kbase_csf_scheduler_spin_unlock(kbdev, flags); - kbase_csf_ring_csg_doorbell(kbdev, group->csg_nr); - - remaining = wait_event_timeout(kbdev->csf.event_wait, - !((kbase_csf_firmware_csg_input_read(ginfo, CSG_REQ) ^ - kbase_csf_firmware_csg_output(ginfo, CSG_ACK)) & - CSG_REQ_STATUS_UPDATE_MASK), remaining); + update_active_group_status(file, group); ep_c = kbase_csf_firmware_csg_output(ginfo, CSG_STATUS_EP_CURRENT); @@ -291,16 +345,6 @@ static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file, else exclusive = '0'; - if (!remaining) { - dev_err(kbdev->dev, - "Timed out for STATUS_UPDATE on group %d on slot %d", - group->handle, group->csg_nr); - - seq_printf(file, "*** Warn: Timed out for STATUS_UPDATE on slot %d\n", - group->csg_nr); - seq_printf(file, "*** The following group-record is likely stale\n"); - } - seq_puts(file, "GroupID, CSG NR, CSG Prio, Run State, Priority, C_EP(Alloc/Req), F_EP(Alloc/Req), T_EP(Alloc/Req), Exclusive\n"); seq_printf(file, "%7d, %6d, %8d, %9d, %8d, %11d/%3d, %11d/%3d, %11d/%3d, %9c\n", group->handle, @@ -315,6 +359,10 @@ static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file, CSG_STATUS_EP_CURRENT_TILER_EP_GET(ep_c), CSG_STATUS_EP_REQ_TILER_EP_GET(ep_r), exclusive); + + /* Wait for the User doobell ring to take effect */ + if (kbdev->csf.scheduler.state != SCHED_SLEEPING) + msleep(100); } else { seq_puts(file, "GroupID, CSG NR, Run State, Priority\n"); seq_printf(file, "%7d, %6d, %9d, %8d\n", @@ -362,6 +410,12 @@ static int kbasep_csf_queue_group_debugfs_show(struct seq_file *file, mutex_lock(&kctx->csf.lock); kbase_csf_scheduler_lock(kbdev); + if (kbdev->csf.scheduler.state == SCHED_SLEEPING) { + /* Wait for the MCU sleep request to complete. Please refer the + * update_active_group_status() function for the explanation. 
+ */ + kbase_pm_wait_for_desired_state(kbdev); + } for (gr = 0; gr < MAX_QUEUE_GROUP_NUM; gr++) { struct kbase_queue_group *const group = kctx->csf.queue_groups[gr]; @@ -395,6 +449,12 @@ static int kbasep_csf_scheduler_dump_active_groups(struct seq_file *file, MALI_CSF_CSG_DEBUGFS_VERSION); kbase_csf_scheduler_lock(kbdev); + if (kbdev->csf.scheduler.state == SCHED_SLEEPING) { + /* Wait for the MCU sleep request to complete. Please refer the + * update_active_group_status() function for the explanation. + */ + kbase_pm_wait_for_desired_state(kbdev); + } for (csg_nr = 0; csg_nr < num_groups; csg_nr++) { struct kbase_queue_group *const group = kbdev->csf.scheduler.csg_slots[csg_nr].resident_group; @@ -502,59 +562,93 @@ DEFINE_SIMPLE_ATTRIBUTE(kbasep_csf_debugfs_scheduling_timer_kick_fops, "%llu\n"); /** - * kbase_csf_debugfs_scheduler_suspend_get() - get if the scheduler is suspended. + * kbase_csf_debugfs_scheduler_state_get() - Get the state of scheduler. * - * @data: The debugfs dentry private data, a pointer to kbase_device - * @val: The debugfs output value, boolean: 1 suspended, 0 otherwise + * @file: Object of the file that is being read. + * @user_buf: User buffer that contains the string. + * @count: Length of user buffer + * @ppos: Offset within file object * - * Return: 0 + * This function will return the current Scheduler state to Userspace + * Scheduler may exit that state by the time the state string is received + * by the Userspace. + * + * Return: 0 if Scheduler was found in an unexpected state, or the + * size of the state string if it was copied successfully to the + * User buffer or a negative value in case of an error. */ -static int kbase_csf_debugfs_scheduler_suspend_get( - void *data, u64 *val) +static ssize_t kbase_csf_debugfs_scheduler_state_get(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) { - struct kbase_device *kbdev = data; + struct kbase_device *kbdev = file->private_data; struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + const char *state_string; kbase_csf_scheduler_lock(kbdev); - *val = (scheduler->state == SCHED_SUSPENDED); + state_string = scheduler_state_to_string(kbdev, scheduler->state); kbase_csf_scheduler_unlock(kbdev); - return 0; + if (!state_string) + count = 0; + + return simple_read_from_buffer(user_buf, count, ppos, + state_string, strlen(state_string)); } /** - * kbase_csf_debugfs_scheduler_suspend_set() - set the scheduler to suspended. + * kbase_csf_debugfs_scheduler_state_set() - Set the state of scheduler. * - * @data: The debugfs dentry private data, a pointer to kbase_device - * @val: The debugfs input value, boolean: 1 suspend, 0 otherwise + * @file: Object of the file that is being written to. + * @ubuf: User buffer that contains the string. + * @count: Length of user buffer + * @ppos: Offset within file object * - * Return: Negative value if already in requested state, 0 otherwise. + * This function will update the Scheduler state as per the state string + * passed by the Userspace. Scheduler may or may not remain in new state + * for long. + * + * Return: Negative value if the string doesn't correspond to a valid Scheduler + * state or if copy from user buffer failed, otherwise the length of + * the User buffer. 
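+ *
+ * Editor's usage illustration (not part of the patch; the debugfs
+ * directory name "mali0" is an assumption and may differ per device):
+ *
+ *   echo SUSPENDED > /sys/kernel/debug/mali0/scheduler_state
+ *   cat /sys/kernel/debug/mali0/scheduler_state
+ *
+ * Accepted strings are SUSPENDED, SLEEPING (only when KBASE_PM_RUNTIME
+ * is defined) and INACTIVE, matching the sysfs_streq() checks below.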
*/ -static int kbase_csf_debugfs_scheduler_suspend_set( - void *data, u64 val) +static ssize_t kbase_csf_debugfs_scheduler_state_set(struct file *file, + const char __user *ubuf, size_t count, loff_t *ppos) { - struct kbase_device *kbdev = data; - struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; - enum kbase_csf_scheduler_state state; + struct kbase_device *kbdev = file->private_data; + char buf[MAX_SCHED_STATE_STRING_LEN]; + ssize_t ret = count; - kbase_csf_scheduler_lock(kbdev); - state = scheduler->state; - kbase_csf_scheduler_unlock(kbdev); + CSTD_UNUSED(ppos); + + count = min_t(size_t, sizeof(buf) - 1, count); + if (copy_from_user(buf, ubuf, count)) + return -EFAULT; + + buf[count] = 0; - if (val && (state != SCHED_SUSPENDED)) + if (sysfs_streq(buf, "SUSPENDED")) kbase_csf_scheduler_pm_suspend(kbdev); - else if (!val && (state == SCHED_SUSPENDED)) - kbase_csf_scheduler_pm_resume(kbdev); - else - return -1; +#ifdef KBASE_PM_RUNTIME + else if (sysfs_streq(buf, "SLEEPING")) + kbase_csf_scheduler_force_sleep(kbdev); +#endif + else if (sysfs_streq(buf, "INACTIVE")) + kbase_csf_scheduler_force_wakeup(kbdev); + else { + dev_dbg(kbdev->dev, "Bad scheduler state %s", buf); + ret = -EINVAL; + } - return 0; + return ret; } -DEFINE_SIMPLE_ATTRIBUTE(kbasep_csf_debugfs_scheduler_suspend_fops, - &kbase_csf_debugfs_scheduler_suspend_get, - &kbase_csf_debugfs_scheduler_suspend_set, - "%llu\n"); +static const struct file_operations kbasep_csf_debugfs_scheduler_state_fops = { + .owner = THIS_MODULE, + .read = kbase_csf_debugfs_scheduler_state_get, + .write = kbase_csf_debugfs_scheduler_state_set, + .open = simple_open, + .llseek = default_llseek, +}; void kbase_csf_debugfs_init(struct kbase_device *kbdev) { @@ -568,9 +662,9 @@ void kbase_csf_debugfs_init(struct kbase_device *kbdev) debugfs_create_file("scheduling_timer_kick", 0200, kbdev->mali_debugfs_directory, kbdev, &kbasep_csf_debugfs_scheduling_timer_kick_fops); - debugfs_create_file("scheduler_suspend", 0644, + debugfs_create_file("scheduler_state", 0644, kbdev->mali_debugfs_directory, kbdev, - &kbasep_csf_debugfs_scheduler_suspend_fops); + &kbasep_csf_debugfs_scheduler_state_fops); kbase_csf_tl_reader_debugfs_init(kbdev); kbase_csf_firmware_trace_buffer_debugfs_init(kbdev); diff --git a/mali_kbase/csf/mali_kbase_csf_defs.h b/mali_kbase/csf/mali_kbase_csf_defs.h index 53526ce..de471eb 100644 --- a/mali_kbase/csf/mali_kbase_csf_defs.h +++ b/mali_kbase/csf/mali_kbase_csf_defs.h @@ -219,11 +219,19 @@ enum kbase_csf_csg_slot_state { * management reference. This can happen if the GPU * becomes idle for a duration exceeding a threshold, * or due to a system triggered suspend action. + * @SCHED_SLEEPING: The scheduler is in low-power mode with scheduling + * operations suspended and is not holding the power + * management reference. This state is set, only for the + * GPUs that supports the sleep feature, when GPU idle + * notification is received. The state is changed to + * @SCHED_SUSPENDED from the runtime suspend callback + * function after the suspend of CSGs. */ enum kbase_csf_scheduler_state { SCHED_BUSY, SCHED_INACTIVE, SCHED_SUSPENDED, + SCHED_SLEEPING, }; /** @@ -561,7 +569,9 @@ struct kbase_csf_heap_context_allocator { * @kbase_context. It is not the same as a heap context structure allocated by * the kernel for use by the firmware. * - * @lock: Lock preventing concurrent access to the tiler heaps. 
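Illustration only: the hunks above replace the boolean scheduler_suspend debugfs file with a string-based scheduler_state file. Below is a minimal user-space sketch of how the new file could be exercised; the path is an assumption (debugfs mounted at /sys/kernel/debug, device directory named mali0), and the accepted strings SUSPENDED, SLEEPING and INACTIVE are taken from kbase_csf_debugfs_scheduler_state_set() above.

/* Hypothetical user-space helper; the debugfs path is an assumption. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/mali0/scheduler_state";
	char state[32] = { 0 };
	int fd = open(path, O_RDWR);

	if (fd < 0) {
		perror("open scheduler_state");
		return 1;
	}

	/* Read the state string currently reported by the driver. */
	if (read(fd, state, sizeof(state) - 1) > 0)
		printf("scheduler state: %s\n", state);

	/* Request suspension; as the kernel-doc above notes, the scheduler
	 * may leave this state again at any time.
	 */
	if (write(fd, "SUSPENDED", strlen("SUSPENDED")) < 0)
		perror("write scheduler_state");

	close(fd);
	return 0;
}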
+ * @lock: Lock to prevent concurrent access to the tiler heaps (after
+ * initialization), as a tiler heap can be terminated whilst an OoM
+ * event is being handled for it.
 * @list: List of tiler heaps.
 * @ctx_alloc: Allocator for heap context structures.
 * @nr_of_heaps: Total number of tiler heaps that were added during the
@@ -802,6 +812,11 @@ struct kbase_csf_csg_slot {
 * @active_protm_grp: Indicates if firmware has been permitted to let GPU
 * enter protected mode with the given group. On exit
 * from protected mode the pointer is reset to NULL.
+ * This pointer is set and PROTM_ENTER request is sent
+ * atomically with @interrupt_lock held.
+ * This pointer being set doesn't necessarily indicate
+ * that the GPU is in protected mode; kbdev->protected_mode
+ * needs to be checked for that.
 * @gpu_idle_fw_timer_enabled: Whether the CSF scheduler has activated the
 * firmware idle hysteresis timer for preparing a
 * GPU suspend on idle.
diff --git a/mali_kbase/csf/mali_kbase_csf_firmware.c b/mali_kbase/csf/mali_kbase_csf_firmware.c
index 1b31122..785555c 100644
--- a/mali_kbase/csf/mali_kbase_csf_firmware.c
+++ b/mali_kbase/csf/mali_kbase_csf_firmware.c
@@ -27,12 +27,14 @@
 #include "mali_kbase_reset_gpu.h"
 #include "mali_kbase_ctx_sched.h"
 #include "mali_kbase_csf_scheduler.h"
+#include <mali_kbase_hwaccess_time.h>
 #include "device/mali_kbase_device.h"
 #include "backend/gpu/mali_kbase_pm_internal.h"
 #include "tl/mali_kbase_timeline_priv.h"
 #include "mali_kbase_csf_tl_reader.h"
 #include "backend/gpu/mali_kbase_clk_rate_trace_mgr.h"
 #include <csf/ipa_control/mali_kbase_csf_ipa_control.h>
+#include <uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h>

 #include <linux/list.h>
 #include <linux/slab.h>
@@ -47,7 +49,7 @@
 #include <asm/arch_timer.h>

 #define MALI_MAX_FIRMWARE_NAME_LEN ((size_t)20)
-
+#define ACK_TIMEOUT_MILLISECONDS 1000

 static char fw_name[MALI_MAX_FIRMWARE_NAME_LEN] = "mali_csffw.bin";
 module_param_string(fw_name, fw_name, sizeof(fw_name), 0644);
@@ -190,8 +192,10 @@ static int setup_shared_iface_static_region(struct kbase_device *kbdev)
 reg = kbase_alloc_free_region(&kbdev->csf.shared_reg_rbtree, 0,
 interface->num_pages, KBASE_REG_ZONE_MCU_SHARED);
 if (reg) {
+ mutex_lock(&kbdev->csf.reg_lock);
 ret = kbase_add_va_region_rbtree(kbdev, reg,
 interface->virtual, interface->num_pages, 1);
+ mutex_unlock(&kbdev->csf.reg_lock);
 if (ret)
 kfree(reg);
 else
@@ -1305,9 +1309,12 @@ static int wait_for_global_request(struct kbase_device *const kbdev,
 wait_timeout);

 if (!remaining) {
- dev_warn(kbdev->dev, "Timed out waiting for global request %x to complete",
+ dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for global request %x to complete",
+ kbase_backend_get_cycle_cnt(kbdev),
+ kbdev->csf.fw_timeout_ms,
 req_mask);
 err = -ETIMEDOUT;
+
 }

 return err;
@@ -1388,11 +1395,6 @@ static void global_init(struct kbase_device *const kbdev, u64 core_mask)

 kbase_csf_scheduler_spin_lock(kbdev, &flags);

- /* Set the coherency mode for protected mode execution */
- WARN_ON(kbdev->system_coherency == COHERENCY_ACE);
- kbase_csf_firmware_global_input(global_iface, GLB_PROTM_COHERENCY,
- kbdev->system_coherency);
-
 /* Update shader core allocation enable mask */
 enable_endpoints_global(global_iface, core_mask);
 enable_shader_poweroff_timer(kbdev, global_iface);
@@ -1675,12 +1677,75 @@ u32 kbase_csf_firmware_set_mcu_core_pwroff_time(struct kbase_device *kbdev, u32
 return pwroff;
 }

+/**
+ * kbase_device_csf_iterator_trace_init - Send request to enable iterator
+ * trace port.
+ * @kbdev: Kernel base device pointer + * + * Return: 0 on success (or if enable request is not sent), or error + * code -EINVAL on failure of GPU to acknowledge enable request. + */ +static int kbase_device_csf_iterator_trace_init(struct kbase_device *kbdev) +{ + /* Enable the iterator trace port if supported by the GPU. + * It requires the GPU to have a nonzero "iter_trace_enable" + * property in the device tree, and the FW must advertise + * this feature in GLB_FEATURES. + */ + if (kbdev->pm.backend.gpu_powered) { + /* check device tree for iterator trace enable property */ + const void *iter_trace_param = of_get_property( + kbdev->dev->of_node, + "iter_trace_enable", NULL); + + const struct kbase_csf_global_iface *iface = + &kbdev->csf.global_iface; + + if (iter_trace_param) { + u32 iter_trace_value = be32_to_cpup(iter_trace_param); + + if ((iface->features & + GLB_FEATURES_ITER_TRACE_SUPPORTED_MASK) && + iter_trace_value) { + long ack_timeout; + + ack_timeout = kbase_csf_timeout_in_jiffies( + ACK_TIMEOUT_MILLISECONDS); + + /* write enable request to global input */ + kbase_csf_firmware_global_input_mask( + iface, GLB_REQ, + GLB_REQ_ITER_TRACE_ENABLE_MASK, + GLB_REQ_ITER_TRACE_ENABLE_MASK); + /* Ring global doorbell */ + kbase_csf_ring_doorbell(kbdev, + CSF_KERNEL_DOORBELL_NR); + + ack_timeout = wait_event_timeout( + kbdev->csf.event_wait, + !((kbase_csf_firmware_global_input_read( + iface, GLB_REQ) ^ + kbase_csf_firmware_global_output( + iface, GLB_ACK)) & + GLB_REQ_ITER_TRACE_ENABLE_MASK), + ack_timeout); + + return ack_timeout ? 0 : -EINVAL; + + } + } + + } + return 0; +} int kbase_csf_firmware_early_init(struct kbase_device *kbdev) { init_waitqueue_head(&kbdev->csf.event_wait); kbdev->csf.interrupt_received = false; - kbdev->csf.fw_timeout_ms = CSF_FIRMWARE_TIMEOUT_MS; + + kbdev->csf.fw_timeout_ms = + kbase_get_timeout_ms(kbdev, CSF_FIRMWARE_TIMEOUT); INIT_LIST_HEAD(&kbdev->csf.firmware_interfaces); INIT_LIST_HEAD(&kbdev->csf.firmware_config); @@ -1721,8 +1786,14 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) } kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS; +#ifdef KBASE_PM_RUNTIME + if (kbase_pm_gpu_sleep_allowed(kbdev)) + kbdev->csf.gpu_idle_hysteresis_ms /= + FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER; +#endif + WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms); kbdev->csf.gpu_idle_dur_count = convert_dur_to_idle_count( - kbdev, FIRMWARE_IDLE_HYSTERESIS_TIME_MS); + kbdev, kbdev->csf.gpu_idle_hysteresis_ms); kbdev->csf.mcu_core_pwroff_dur_us = DEFAULT_GLB_PWROFF_TIMEOUT_US; kbdev->csf.mcu_core_pwroff_dur_count = convert_dur_to_core_pwroff_count( @@ -1851,6 +1922,9 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) if (ret != 0) goto error; + ret = kbase_device_csf_iterator_trace_init(kbdev); + if (ret != 0) + goto error; /* Firmware loaded successfully */ release_firmware(firmware); @@ -2048,30 +2122,20 @@ int kbase_csf_firmware_set_timeout(struct kbase_device *const kbdev, void kbase_csf_enter_protected_mode(struct kbase_device *kbdev) { struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; - unsigned long flags; - int err; - kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbase_csf_scheduler_spin_lock_assert_held(kbdev); set_global_request(global_iface, GLB_REQ_PROTM_ENTER_MASK); dev_dbg(kbdev->dev, "Sending request to enter protected mode"); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); - kbase_csf_scheduler_spin_unlock(kbdev, flags); - - err = wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); - - if 
(!err) { - unsigned long irq_flags; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbdev->protected_mode = true; - kbase_ipa_protection_mode_switch_event(kbdev); - kbase_ipa_control_protm_entered(kbdev); +} - kbase_csf_scheduler_spin_lock(kbdev, &irq_flags); - kbase_hwcnt_backend_csf_protm_entered(&kbdev->hwcnt_gpu_iface); - kbase_csf_scheduler_spin_unlock(kbdev, irq_flags); +void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) +{ + int err = wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + if (err) { + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) + kbase_reset_gpu(kbdev); } } @@ -2081,12 +2145,38 @@ void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev) unsigned long flags; kbase_csf_scheduler_spin_lock(kbdev, &flags); + /* Validate there are no on-slot groups when sending the + * halt request to firmware. + */ + WARN_ON(kbase_csf_scheduler_get_nr_active_csgs_locked(kbdev)); set_global_request(global_iface, GLB_REQ_HALT_MASK); dev_dbg(kbdev->dev, "Sending request to HALT MCU"); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); kbase_csf_scheduler_spin_unlock(kbdev, flags); } +#ifdef KBASE_PM_RUNTIME +void kbase_csf_firmware_trigger_mcu_sleep(struct kbase_device *kbdev) +{ + struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; + unsigned long flags; + + kbase_csf_scheduler_spin_lock(kbdev, &flags); + set_global_request(global_iface, GLB_REQ_SLEEP_MASK); + dev_dbg(kbdev->dev, "Sending sleep request to MCU"); + kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); + kbase_csf_scheduler_spin_unlock(kbdev, flags); +} + +bool kbase_csf_firmware_is_mcu_in_sleep(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + return (global_request_complete(kbdev, GLB_REQ_SLEEP_MASK) && + kbase_csf_firmware_mcu_halted(kbdev)); +} +#endif + int kbase_csf_trigger_firmware_config_update(struct kbase_device *kbdev) { struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; @@ -2095,6 +2185,7 @@ int kbase_csf_trigger_firmware_config_update(struct kbase_device *kbdev) /* Ensure GPU is powered-up until we complete config update.*/ kbase_csf_scheduler_pm_active(kbdev); + kbase_csf_scheduler_wait_mcu_active(kbdev); /* The 'reg_lock' is also taken and is held till the update is * complete, to ensure the config update gets serialized. @@ -2288,7 +2379,7 @@ int kbase_csf_firmware_mcu_shared_mapping_init( mmu_insert_pages_error: mutex_lock(&kbdev->csf.reg_lock); - kbase_remove_va_region(va_reg); + kbase_remove_va_region(kbdev, va_reg); va_region_add_error: kbase_free_alloced_region(va_reg); mutex_unlock(&kbdev->csf.reg_lock); @@ -2320,7 +2411,7 @@ void kbase_csf_firmware_mcu_shared_mapping_term( { if (csf_mapping->va_reg) { mutex_lock(&kbdev->csf.reg_lock); - kbase_remove_va_region(csf_mapping->va_reg); + kbase_remove_va_region(kbdev, csf_mapping->va_reg); kbase_free_alloced_region(csf_mapping->va_reg); mutex_unlock(&kbdev->csf.reg_lock); } diff --git a/mali_kbase/csf/mali_kbase_csf_firmware.h b/mali_kbase/csf/mali_kbase_csf_firmware.h index 60d7065..0edcc30 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware.h +++ b/mali_kbase/csf/mali_kbase_csf_firmware.h @@ -78,9 +78,6 @@ /* Maximum CSs per csg. 
 */
 #define MAX_SUPPORTED_STREAMS_PER_GROUP 32

-/* Waiting timeout for status change acknowledgment, in milliseconds */
-#define CSF_FIRMWARE_TIMEOUT_MS (3000) /* Relaxed to 3000ms from 800ms due to Android */
-
 struct kbase_device;

@@ -442,13 +439,27 @@ int kbase_csf_firmware_set_timeout(struct kbase_device *kbdev, u64 timeout);

 /**
 * kbase_csf_enter_protected_mode - Send the Global request to firmware to
- * enter protected mode and wait for its
- * completion.
+ * enter protected mode.
 *
 * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
+ * The function must be called with kbdev->csf.scheduler.interrupt_lock held
+ * and it does not wait for the protected mode entry to complete.
 */
 void kbase_csf_enter_protected_mode(struct kbase_device *kbdev);

+/**
+ * kbase_csf_wait_protected_mode_enter - Wait for the completion of PROTM_ENTER
+ * Global request sent to firmware.
+ *
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
+ * This function needs to be called after kbase_csf_enter_protected_mode()
+ * to wait for the protected mode entry to complete. GPU reset is triggered if
+ * the wait is unsuccessful.
+ */
+void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev);
+
 static inline bool kbase_csf_firmware_mcu_halted(struct kbase_device *kbdev)
 {
 return (kbase_reg_read(kbdev, GPU_CONTROL_REG(MCU_STATUS)) ==
@@ -497,6 +508,26 @@ static inline void kbase_csf_firmware_disable_mcu(struct kbase_device *kbdev)
 */
 void kbase_csf_firmware_disable_mcu_wait(struct kbase_device *kbdev);

+#ifdef KBASE_PM_RUNTIME
+/**
+ * kbase_csf_firmware_trigger_mcu_sleep - Send the command to put MCU in sleep
+ * state.
+ *
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ */
+void kbase_csf_firmware_trigger_mcu_sleep(struct kbase_device *kbdev);
+
+/**
+ * kbase_csf_firmware_is_mcu_in_sleep - Check if sleep request has completed
+ * and MCU has halted.
+ *
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
+ * Return: true if sleep request has completed, otherwise false.
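Illustration only: the kernel-doc above splits protected-mode entry into a request and a separate wait. A hedged sketch of the implied calling sequence (simplified, not the actual scheduler code) is shown below; it uses only the two functions declared in this header.

/* Sketch of the usage implied by the kernel-doc above; error handling and
 * the surrounding scheduler logic are omitted.
 */
static void example_request_protected_mode(struct kbase_device *kbdev)
{
	unsigned long flags;

	/* Issue the PROTM_ENTER request with the scheduler interrupt_lock
	 * held; this only rings the doorbell and does not wait.
	 */
	kbase_csf_scheduler_spin_lock(kbdev, &flags);
	kbase_csf_enter_protected_mode(kbdev);
	kbase_csf_scheduler_spin_unlock(kbdev, flags);

	/* Wait for the firmware acknowledgment outside the spinlock; a GPU
	 * reset is triggered internally if the wait times out.
	 */
	kbase_csf_wait_protected_mode_enter(kbdev);
}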
+ */ +bool kbase_csf_firmware_is_mcu_in_sleep(struct kbase_device *kbdev); +#endif + /** * kbase_trigger_firmware_reload - Trigger the reboot of MCU firmware, for the * cold boot case firmware image would be diff --git a/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c b/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c index 33ae3f7..e99c968 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c +++ b/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c @@ -27,6 +27,7 @@ #include "mali_kbase_reset_gpu.h" #include "mali_kbase_ctx_sched.h" #include "device/mali_kbase_device.h" +#include <mali_kbase_hwaccess_time.h> #include "backend/gpu/mali_kbase_pm_internal.h" #include "mali_kbase_csf_scheduler.h" #include "mmu/mali_kbase_mmu.h" @@ -551,6 +552,8 @@ static int wait_for_global_request(struct kbase_device *const kbdev, dev_warn(kbdev->dev, "Timed out waiting for global request %x to complete", req_mask); err = -ETIMEDOUT; + + } return err; @@ -886,7 +889,9 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev) { init_waitqueue_head(&kbdev->csf.event_wait); kbdev->csf.interrupt_received = false; - kbdev->csf.fw_timeout_ms = CSF_FIRMWARE_TIMEOUT_MS; + + kbdev->csf.fw_timeout_ms = + kbase_get_timeout_ms(kbdev, CSF_FIRMWARE_TIMEOUT); INIT_LIST_HEAD(&kbdev->csf.firmware_interfaces); INIT_LIST_HEAD(&kbdev->csf.firmware_config); @@ -920,8 +925,14 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) } kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS; +#ifdef KBASE_PM_RUNTIME + if (kbase_pm_gpu_sleep_allowed(kbdev)) + kbdev->csf.gpu_idle_hysteresis_ms /= + FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER; +#endif + WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms); kbdev->csf.gpu_idle_dur_count = convert_dur_to_idle_count( - kbdev, FIRMWARE_IDLE_HYSTERESIS_TIME_MS); + kbdev, kbdev->csf.gpu_idle_hysteresis_ms); ret = kbase_mcu_shared_interface_region_tracker_init(kbdev); if (ret != 0) { @@ -1110,15 +1121,21 @@ int kbase_csf_firmware_set_timeout(struct kbase_device *const kbdev, void kbase_csf_enter_protected_mode(struct kbase_device *kbdev) { struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; - unsigned long flags; - kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbase_csf_scheduler_spin_lock_assert_held(kbdev); set_global_request(global_iface, GLB_REQ_PROTM_ENTER_MASK); dev_dbg(kbdev->dev, "Sending request to enter protected mode"); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); - kbase_csf_scheduler_spin_unlock(kbdev, flags); +} + +void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) +{ + int err = wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); - wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); + if (err) { + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) + kbase_reset_gpu(kbdev); + } } void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev) @@ -1127,12 +1144,38 @@ void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev) unsigned long flags; kbase_csf_scheduler_spin_lock(kbdev, &flags); + /* Validate there are no on-slot groups when sending the + * halt request to firmware. 
+ */ + WARN_ON(kbase_csf_scheduler_get_nr_active_csgs_locked(kbdev)); set_global_request(global_iface, GLB_REQ_HALT_MASK); dev_dbg(kbdev->dev, "Sending request to HALT MCU"); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); kbase_csf_scheduler_spin_unlock(kbdev, flags); } +#ifdef KBASE_PM_RUNTIME +void kbase_csf_firmware_trigger_mcu_sleep(struct kbase_device *kbdev) +{ + struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; + unsigned long flags; + + kbase_csf_scheduler_spin_lock(kbdev, &flags); + set_global_request(global_iface, GLB_REQ_SLEEP_MASK); + dev_dbg(kbdev->dev, "Sending sleep request to MCU"); + kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); + kbase_csf_scheduler_spin_unlock(kbdev, flags); +} + +bool kbase_csf_firmware_is_mcu_in_sleep(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + return (global_request_complete(kbdev, GLB_REQ_SLEEP_MASK) && + kbase_csf_firmware_mcu_halted(kbdev)); +} +#endif + int kbase_csf_trigger_firmware_config_update(struct kbase_device *kbdev) { struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; @@ -1331,7 +1374,7 @@ int kbase_csf_firmware_mcu_shared_mapping_init( mmu_insert_pages_error: mutex_lock(&kbdev->csf.reg_lock); - kbase_remove_va_region(va_reg); + kbase_remove_va_region(kbdev, va_reg); va_region_add_error: kbase_free_alloced_region(va_reg); mutex_unlock(&kbdev->csf.reg_lock); @@ -1363,7 +1406,7 @@ void kbase_csf_firmware_mcu_shared_mapping_term( { if (csf_mapping->va_reg) { mutex_lock(&kbdev->csf.reg_lock); - kbase_remove_va_region(csf_mapping->va_reg); + kbase_remove_va_region(kbdev, csf_mapping->va_reg); kbase_free_alloced_region(csf_mapping->va_reg); mutex_unlock(&kbdev->csf.reg_lock); } diff --git a/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c b/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c index 96746c6..1815a26 100644 --- a/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c +++ b/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c @@ -50,8 +50,8 @@ static u64 sub_alloc(struct kbase_csf_heap_context_allocator *const ctx_alloc) MAX_TILER_HEAPS); if (unlikely(heap_nr >= MAX_TILER_HEAPS)) { - dev_err(kctx->kbdev->dev, - "No free tiler heap contexts in the pool\n"); + dev_dbg(kctx->kbdev->dev, + "No free tiler heap contexts in the pool"); return 0; } @@ -159,6 +159,11 @@ u64 kbase_csf_heap_context_allocator_alloc( u64 nr_pages = PFN_UP(HEAP_CTX_REGION_SIZE); u64 heap_gpu_va = 0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + #ifdef CONFIG_MALI_VECTOR_DUMP flags |= BASE_MEM_PROT_CPU_RD; #endif @@ -169,13 +174,14 @@ u64 kbase_csf_heap_context_allocator_alloc( * allocate it. */ if (!ctx_alloc->region) { - ctx_alloc->region = kbase_mem_alloc(kctx, nr_pages, nr_pages, - 0, &flags, &ctx_alloc->gpu_va); + ctx_alloc->region = + kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, + &ctx_alloc->gpu_va, mmu_sync_info); } /* If the pool still isn't allocated then an error occurred. 
*/ if (unlikely(!ctx_alloc->region)) { - dev_err(kctx->kbdev->dev, "Failed to allocate a pool of tiler heap contexts\n"); + dev_dbg(kctx->kbdev->dev, "Failed to allocate a pool of tiler heap contexts"); } else { heap_gpu_va = sub_alloc(ctx_alloc); } diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.c b/mali_kbase/csf/mali_kbase_csf_kcpu.c index 4e26a49..8729307 100644 --- a/mali_kbase/csf/mali_kbase_csf_kcpu.c +++ b/mali_kbase/csf/mali_kbase_csf_kcpu.c @@ -34,7 +34,7 @@ static DEFINE_SPINLOCK(kbase_csf_fence_lock); #endif static void kcpu_queue_process(struct kbase_kcpu_command_queue *kcpu_queue, - bool ignore_waits); + bool drain_queue); static void kcpu_queue_process_worker(struct work_struct *data); @@ -220,7 +220,7 @@ static int kbase_kcpu_jit_allocate_process( for (i = 0; i < count; i++, info++) { /* The JIT ID is still in use so fail the allocation */ if (kctx->jit_alloc[info->id]) { - dev_warn(kctx->kbdev->dev, "JIT ID still in use\n"); + dev_dbg(kctx->kbdev->dev, "JIT ID still in use"); return -EINVAL; } } @@ -458,7 +458,7 @@ static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue, int item_err = 0; if (!kctx->jit_alloc[ids[i]]) { - dev_warn(kctx->kbdev->dev, "invalid JIT free ID\n"); + dev_dbg(kctx->kbdev->dev, "invalid JIT free ID"); rc = -EINVAL; item_err = rc; } else { @@ -964,7 +964,7 @@ static int kbase_kcpu_cqs_wait_operation_process(struct kbase_device *kbdev, sig_set = *evt > cqs_wait_operation->objs[i].val; break; default: - dev_warn(kbdev->dev, + dev_dbg(kbdev->dev, "Unsupported CQS wait operation %d", cqs_wait_operation->objs[i].operation); kbase_phy_alloc_mapping_put(queue->kctx, mapping); @@ -976,8 +976,9 @@ static int kbase_kcpu_cqs_wait_operation_process(struct kbase_device *kbdev, /* Increment evt up to the error_state value depending on the CQS data type */ switch (cqs_wait_operation->objs[i].data_type) { default: - dev_warn(kbdev->dev, "Unreachable data_type=%d", cqs_wait_operation->objs[i].data_type); - /* Fallthrough - hint to compiler that there's really only 2 options at present */ + dev_dbg(kbdev->dev, "Unreachable data_type=%d", cqs_wait_operation->objs[i].data_type); + /* Fallthrough - hint to compiler that there's really only 2 options at present */ + fallthrough; case BASEP_CQS_DATA_TYPE_U32: evt = (u64 *)((u8 *)evt + sizeof(u32)); break; @@ -1100,7 +1101,7 @@ static void kbase_kcpu_cqs_set_operation_process( *evt = cqs_set_operation->objs[i].val; break; default: - dev_warn(kbdev->dev, + dev_dbg(kbdev->dev, "Unsupported CQS set operation %d", cqs_set_operation->objs[i].operation); queue->has_error = true; break; @@ -1109,8 +1110,9 @@ static void kbase_kcpu_cqs_set_operation_process( /* Increment evt up to the error_state value depending on the CQS data type */ switch (cqs_set_operation->objs[i].data_type) { default: - dev_warn(kbdev->dev, "Unreachable data_type=%d", cqs_set_operation->objs[i].data_type); - /* Fallthrough - hint to compiler that there's really only 2 options at present */ + dev_dbg(kbdev->dev, "Unreachable data_type=%d", cqs_set_operation->objs[i].data_type); + /* Fallthrough - hint to compiler that there's really only 2 options at present */ + fallthrough; case BASEP_CQS_DATA_TYPE_U32: evt = (u64 *)((u8 *)evt + sizeof(u32)); break; @@ -1465,8 +1467,8 @@ static int delete_queue(struct kbase_context *kctx, u32 id) kfree(queue); } else { - dev_warn(kctx->kbdev->dev, - "Attempt to delete a non-existent KCPU queue\n"); + dev_dbg(kctx->kbdev->dev, + "Attempt to delete a non-existent KCPU queue"); 
mutex_unlock(&kctx->csf.kcpu_queues.lock); err = -EINVAL; } @@ -1525,7 +1527,7 @@ static void KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_FREE_END( } static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, - bool ignore_waits) + bool drain_queue) { struct kbase_device *kbdev = queue->kctx->kbdev; bool process_next = true; @@ -1548,7 +1550,7 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, status = 0; #if IS_ENABLED(CONFIG_SYNC_FILE) - if (ignore_waits) { + if (drain_queue) { kbase_kcpu_fence_wait_cancel(queue, &cmd->info.fence); } else { @@ -1601,7 +1603,7 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, status = kbase_kcpu_cqs_wait_process(kbdev, queue, &cmd->info.cqs_wait); - if (!status && !ignore_waits) { + if (!status && !drain_queue) { process_next = false; } else { /* Either all CQS objects were signaled or @@ -1623,7 +1625,7 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, status = kbase_kcpu_cqs_wait_operation_process(kbdev, queue, &cmd->info.cqs_wait_operation); - if (!status && !ignore_waits) { + if (!status && !drain_queue) { process_next = false; } else { /* Either all CQS objects were signaled or @@ -1651,22 +1653,25 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, case BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: { struct kbase_ctx_ext_res_meta *meta = NULL; - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_MAP_IMPORT_START( - kbdev, queue); + if (!drain_queue) { + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_MAP_IMPORT_START( + kbdev, queue); - kbase_gpu_vm_lock(queue->kctx); - meta = kbase_sticky_resource_acquire( - queue->kctx, cmd->info.import.gpu_va); - kbase_gpu_vm_unlock(queue->kctx); + kbase_gpu_vm_lock(queue->kctx); + meta = kbase_sticky_resource_acquire( + queue->kctx, cmd->info.import.gpu_va); + kbase_gpu_vm_unlock(queue->kctx); - if (meta == NULL) { - queue->has_error = true; - dev_warn(kbdev->dev, - "failed to map an external resource\n"); - } + if (meta == NULL) { + queue->has_error = true; + dev_dbg( + kbdev->dev, + "failed to map an external resource"); + } - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_MAP_IMPORT_END( - kbdev, queue, meta ? 0 : 1); + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_MAP_IMPORT_END( + kbdev, queue, meta ? 0 : 1); + } break; } case BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: { @@ -1682,8 +1687,8 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, if (!ret) { queue->has_error = true; - dev_warn(kbdev->dev, - "failed to release the reference. resource not found\n"); + dev_dbg(kbdev->dev, + "failed to release the reference. resource not found"); } KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_UNMAP_IMPORT_END( @@ -1703,8 +1708,8 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, if (!ret) { queue->has_error = true; - dev_warn(kbdev->dev, - "failed to release the reference. resource not found\n"); + dev_dbg(kbdev->dev, + "failed to release the reference. 
resource not found"); } KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_UNMAP_IMPORT_FORCE_END( @@ -1713,24 +1718,32 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, } case BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: { - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_START( - kbdev, queue); - - status = kbase_kcpu_jit_allocate_process(queue, cmd); - if (status == -EAGAIN) { - process_next = false; + if (drain_queue) { + /* We still need to call this function to clean the JIT alloc info up */ + kbase_kcpu_jit_allocate_finish(queue, cmd); } else { - if (status != 0) - queue->has_error = true; + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_START( + kbdev, queue); - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_INFO( - kbdev, queue, &cmd->info.jit_alloc, - status); + status = kbase_kcpu_jit_allocate_process(queue, + cmd); + if (status == -EAGAIN) { + process_next = false; + } else { + if (status != 0) + queue->has_error = true; - kbase_kcpu_jit_allocate_finish(queue, cmd); - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_END( + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_INFO( + kbdev, queue, + &cmd->info.jit_alloc, status); + + kbase_kcpu_jit_allocate_finish(queue, + cmd); + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_END( kbdev, queue); + } } + break; } case BASE_KCPU_COMMAND_TYPE_JIT_FREE: @@ -1748,56 +1761,39 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, struct kbase_suspend_copy_buffer *sus_buf = cmd->info.suspend_buf_copy.sus_buf; - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_START( - kbdev, queue); + if (!drain_queue) { + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_START( + kbdev, queue); - status = kbase_csf_queue_group_suspend_process( + status = kbase_csf_queue_group_suspend_process( queue->kctx, sus_buf, cmd->info.suspend_buf_copy.group_handle); - if (status) - queue->has_error = true; + if (status) + queue->has_error = true; - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_END( - kbdev, queue, status); + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_END( + kbdev, queue, status); - if (!sus_buf->cpu_alloc) { - int i; + if (!sus_buf->cpu_alloc) { + int i; - for (i = 0; i < sus_buf->nr_pages; i++) - put_page(sus_buf->pages[i]); - } else { - kbase_mem_phy_alloc_kernel_unmapped( - sus_buf->cpu_alloc); - kbase_mem_phy_alloc_put(sus_buf->cpu_alloc); + for (i = 0; i < sus_buf->nr_pages; i++) + put_page(sus_buf->pages[i]); + } else { + kbase_mem_phy_alloc_kernel_unmapped( + sus_buf->cpu_alloc); + kbase_mem_phy_alloc_put( + sus_buf->cpu_alloc); + } } kfree(sus_buf->pages); kfree(sus_buf); break; } -#if MALI_UNIT_TEST - case BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME: { - u64 time = ktime_get_raw_ns(); - void *target_page = kmap(*cmd->info.sample_time.page); - - if (target_page) { - memcpy(target_page + - cmd->info.sample_time.page_offset, - &time, sizeof(time)); - kunmap(*cmd->info.sample_time.page); - } else { - dev_warn(kbdev->dev, - "Could not kmap target page\n"); - queue->has_error = true; - } - put_page(*cmd->info.sample_time.page); - kfree(cmd->info.sample_time.page); - break; - } -#endif /* MALI_UNIT_TEST */ default: - dev_warn(kbdev->dev, - "Unrecognized command type\n"); + dev_dbg(kbdev->dev, + "Unrecognized command type"); break; } /* switch */ @@ -1933,14 +1929,6 @@ static void KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_COMMAND( kbdev, queue, cmd->info.suspend_buf_copy.sus_buf, cmd->info.suspend_buf_copy.group_handle); break; -#if MALI_UNIT_TEST - case 
BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME: - /* - * This is test-only KCPU command, no need to have a timeline - * entry - */ - break; -#endif /* MALI_UNIT_TEST */ } } @@ -1966,8 +1954,8 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, * in the set. */ if (enq->nr_commands != 1) { - dev_err(kctx->kbdev->dev, - "More than one commands enqueued\n"); + dev_dbg(kctx->kbdev->dev, + "More than one commands enqueued"); return -EINVAL; } @@ -2081,40 +2069,9 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, &command.info.suspend_buf_copy, kcpu_cmd); break; -#if MALI_UNIT_TEST - case BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME: { - int const page_cnt = 1; - - kcpu_cmd->type = BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME; - kcpu_cmd->info.sample_time.page_addr = - command.info.sample_time.time & PAGE_MASK; - kcpu_cmd->info.sample_time.page_offset = - command.info.sample_time.time & ~PAGE_MASK; - kcpu_cmd->info.sample_time.page = kcalloc( - page_cnt, sizeof(struct page *), GFP_KERNEL); - if (!kcpu_cmd->info.sample_time.page) { - ret = -ENOMEM; - } else { - int pinned_pages = get_user_pages_fast( - kcpu_cmd->info.sample_time.page_addr, - page_cnt, 1, - kcpu_cmd->info.sample_time.page); - - if (pinned_pages < 0) { - ret = pinned_pages; - kfree(kcpu_cmd->info.sample_time.page); - } else if (pinned_pages != page_cnt) { - ret = -EINVAL; - kfree(kcpu_cmd->info.sample_time.page); - } - } - - break; - } -#endif /* MALI_UNIT_TEST */ default: - dev_warn(queue->kctx->kbdev->dev, - "Unknown command type %u\n", command.type); + dev_dbg(queue->kctx->kbdev->dev, + "Unknown command type %u", command.type); ret = -EINVAL; break; } diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.h b/mali_kbase/csf/mali_kbase_csf_kcpu.h index 9964f20..6300569 100644 --- a/mali_kbase/csf/mali_kbase_csf_kcpu.h +++ b/mali_kbase/csf/mali_kbase_csf_kcpu.h @@ -196,13 +196,6 @@ struct kbase_kcpu_command_group_suspend_info { u8 group_handle; }; -#if MALI_UNIT_TEST -struct kbase_kcpu_command_sample_time_info { - u64 page_addr; - u64 page_offset; - struct page **page; -}; -#endif /* MALI_UNIT_TEST */ /** * struct kbase_cpu_command - Command which is to be part of the kernel @@ -235,9 +228,6 @@ struct kbase_kcpu_command { struct kbase_kcpu_command_jit_alloc_info jit_alloc; struct kbase_kcpu_command_jit_free_info jit_free; struct kbase_kcpu_command_group_suspend_info suspend_buf_copy; -#if MALI_UNIT_TEST - struct kbase_kcpu_command_sample_time_info sample_time; -#endif /* MALI_UNIT_TEST */ } info; }; diff --git a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c index f6d61d7..7b63132 100644 --- a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c +++ b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c @@ -461,11 +461,14 @@ static void kbase_csf_reset_gpu_worker(struct work_struct *data) { struct kbase_device *kbdev = container_of(data, struct kbase_device, csf.reset.work); + bool gpu_sleep_mode_active = false; bool firmware_inited; unsigned long flags; int err = 0; const enum kbase_csf_reset_gpu_state initial_reset_state = atomic_read(&kbdev->csf.reset.state); + const bool silent = + kbase_csf_reset_state_is_silent(initial_reset_state); /* Ensure any threads (e.g. 
executing the CSF scheduler) have finished * using the HW @@ -474,14 +477,30 @@ static void kbase_csf_reset_gpu_worker(struct work_struct *data) spin_lock_irqsave(&kbdev->hwaccess_lock, flags); firmware_inited = kbdev->csf.firmware_inited; +#ifdef KBASE_PM_RUNTIME + gpu_sleep_mode_active = kbdev->pm.backend.gpu_sleep_mode_active; +#endif spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - if (!kbase_pm_context_active_handle_suspend(kbdev, - KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE)) { - bool silent = - kbase_csf_reset_state_is_silent(initial_reset_state); + if (unlikely(gpu_sleep_mode_active)) { +#ifdef KBASE_PM_RUNTIME + /* As prior to GPU reset all on-slot groups are suspended, + * need to wake up the MCU from sleep. + * No pm active reference is taken here since GPU is in sleep + * state and both runtime & system suspend synchronize with the + * GPU reset before they wake up the GPU to suspend on-slot + * groups. GPUCORE-29850 would add the proper handling. + */ + kbase_pm_lock(kbdev); + if (kbase_pm_force_mcu_wakeup_after_sleep(kbdev)) + dev_warn(kbdev->dev, "Wait for MCU wake up failed on GPU reset"); + kbase_pm_unlock(kbdev); err = kbase_csf_reset_gpu_now(kbdev, firmware_inited, silent); +#endif + } else if (!kbase_pm_context_active_handle_suspend(kbdev, + KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE)) { + err = kbase_csf_reset_gpu_now(kbdev, firmware_inited, silent); kbase_pm_context_idle(kbdev); } @@ -599,6 +618,8 @@ int kbase_reset_gpu_wait(struct kbase_device *kbdev) if (!remaining) { dev_warn(kbdev->dev, "Timed out waiting for the GPU reset to complete"); + + return -ETIMEDOUT; } else if (atomic_read(&kbdev->csf.reset.state) == KBASE_CSF_RESET_GPU_FAILED) { diff --git a/mali_kbase/csf/mali_kbase_csf_scheduler.c b/mali_kbase/csf/mali_kbase_csf_scheduler.c index 8109570..f22a5d7 100644 --- a/mali_kbase/csf/mali_kbase_csf_scheduler.c +++ b/mali_kbase/csf/mali_kbase_csf_scheduler.c @@ -30,14 +30,16 @@ #include <linux/export.h> #include <uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h> #include <uapi/gpu/arm/midgard/mali_base_kernel.h> +#include <mali_kbase_hwaccess_time.h> /* Value to indicate that a queue group is not groups_to_schedule list */ #define KBASEP_GROUP_PREPARED_SEQ_NUM_INVALID (U32_MAX) -/* Waiting timeout for scheduler state change for descheduling a CSG */ -#define CSG_SCHED_STOP_TIMEOUT_MS (50) - -#define CSG_SUSPEND_ON_RESET_WAIT_TIMEOUT_MS DEFAULT_RESET_TIMEOUT_MS +/* This decides the upper limit on the waiting time for the Scheduler + * to exit the sleep state. Usually the value of autosuspend_delay is + * expected to be around 100 milli seconds. + */ +#define MAX_AUTO_SUSPEND_DELAY_MS (5000) /* Maximum number of endpoints which may run tiler jobs. */ #define CSG_TILER_MAX ((u8)1) @@ -75,10 +77,8 @@ /* CS suspended and is wait for a CQS condition */ #define CS_WAIT_SYNC_FLAG (1 << 1) -/* 2 GPU address space slots are reserved for MCU and privileged context for HW - * counter dumping. TODO remove the slot reserved for latter in GPUCORE-26293. - */ -#define NUM_RESERVED_AS_SLOTS (2) +/* A GPU address space slot is reserved for MCU. 
*/ +#define NUM_RESERVED_AS_SLOTS (1) static int scheduler_group_schedule(struct kbase_queue_group *group); static void remove_group_from_idle_wait(struct kbase_queue_group *const group); @@ -94,14 +94,116 @@ static struct kbase_queue_group *get_tock_top_group( static void scheduler_enable_tick_timer_nolock(struct kbase_device *kbdev); static int suspend_active_queue_groups(struct kbase_device *kbdev, unsigned long *slot_mask); +static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev, + bool system_suspend); static void schedule_in_cycle(struct kbase_queue_group *group, bool force); #define kctx_as_enabled(kctx) (!kbase_ctx_flag(kctx, KCTX_AS_DISABLED_ON_FAULT)) +#ifdef KBASE_PM_RUNTIME +/** + * wait_for_scheduler_to_exit_sleep() - Wait for Scheduler to exit the + * sleeping state. + * + * @kbdev: Pointer to the device + * + * This function waits until the Scheduler has exited the sleep state and + * it is called when an on-slot group is terminated or when the suspend + * buffer of an on-slot group needs to be captured. + * + * Return: 0 when the wait is successful, otherwise an error code. + */ +static int wait_for_scheduler_to_exit_sleep(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + int autosuspend_delay = kbdev->dev->power.autosuspend_delay; + unsigned int sleep_exit_wait_time; + long remaining; + int ret = 0; + + lockdep_assert_held(&scheduler->lock); + WARN_ON(scheduler->state != SCHED_SLEEPING); + + /* No point in waiting if autosuspend_delay value is negative. + * For the negative value of autosuspend_delay Driver will directly + * go for the suspend of Scheduler, but the autosuspend_delay value + * could have been changed after the sleep was initiated. + */ + if (autosuspend_delay < 0) + return -EINVAL; + + if (autosuspend_delay > MAX_AUTO_SUSPEND_DELAY_MS) + autosuspend_delay = MAX_AUTO_SUSPEND_DELAY_MS; + + /* Usually Scheduler would remain in sleeping state until the + * auto-suspend timer expires and all active CSGs are suspended. + */ + sleep_exit_wait_time = autosuspend_delay + kbdev->reset_timeout_ms; + + remaining = kbase_csf_timeout_in_jiffies(sleep_exit_wait_time); + + while ((scheduler->state == SCHED_SLEEPING) && !ret) { + mutex_unlock(&scheduler->lock); + remaining = wait_event_timeout( + kbdev->csf.event_wait, + (scheduler->state != SCHED_SLEEPING), + remaining); + mutex_lock(&scheduler->lock); + if (!remaining && (scheduler->state == SCHED_SLEEPING)) + ret = -ETIMEDOUT; + } + + return ret; +} + +/** + * force_scheduler_to_exit_sleep() - Force scheduler to exit sleep state + * + * @kbdev: Pointer to the device + * + * This function will force the Scheduler to exit the sleep state by doing the + * wake up of MCU and suspension of on-slot groups. It is called at the time of + * system suspend. 
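Illustration only: wait_for_scheduler_to_exit_sleep() above derives its wait budget from the runtime PM autosuspend delay and the device reset timeout. A small sketch of that computation follows; the example values are assumptions, not figures from the patch.

/* Sketch of the wait budget used when waiting for the Scheduler to leave
 * the SLEEPING state; example values only.
 */
static unsigned int example_sleep_exit_wait_ms(int autosuspend_delay_ms,
					       unsigned int reset_timeout_ms)
{
	/* A negative autosuspend delay means the caller gives up instead of
	 * waiting (wait_for_scheduler_to_exit_sleep() returns -EINVAL).
	 */
	if (autosuspend_delay_ms < 0)
		return 0;

	/* Clamp to MAX_AUTO_SUSPEND_DELAY_MS (5000 ms). */
	if (autosuspend_delay_ms > 5000)
		autosuspend_delay_ms = 5000;

	/* e.g. a 100 ms autosuspend delay plus the device reset timeout. */
	return (unsigned int)autosuspend_delay_ms + reset_timeout_ms;
}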
+ */ +static void force_scheduler_to_exit_sleep(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + unsigned long flags; + int ret; + + lockdep_assert_held(&scheduler->lock); + WARN_ON(scheduler->state != SCHED_SLEEPING); + WARN_ON(!kbdev->pm.backend.gpu_sleep_mode_active); + + kbase_pm_lock(kbdev); + ret = kbase_pm_force_mcu_wakeup_after_sleep(kbdev); + if (ret) + dev_warn(kbdev->dev, "[%llu] Wait for MCU wake up failed on forced scheduler suspend", + kbase_backend_get_cycle_cnt(kbdev)); + kbase_pm_unlock(kbdev); + + suspend_active_groups_on_powerdown(kbdev, true); + + kbase_pm_lock(kbdev); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbdev->pm.backend.gpu_sleep_mode_active = false; + kbdev->pm.backend.gpu_wakeup_override = false; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + ret = kbase_pm_wait_for_desired_state(kbdev); + if (ret) + dev_warn(kbdev->dev, "[%llu] Wait for pm state change failed on forced scheduler suspend", + kbase_backend_get_cycle_cnt(kbdev)); + kbase_pm_unlock(kbdev); + + scheduler->state = SCHED_SUSPENDED; +} +#endif + /** * tick_timer_callback() - Callback function for the scheduling tick hrtimer * - * @timer: Pointer to the device + * @timer: Pointer to the scheduling tick hrtimer * * This function will enqueue the scheduling tick work item for immediate * execution, if it has not been queued already. @@ -173,14 +275,10 @@ static void cancel_tick_timer(struct kbase_device *kbdev) static void enqueue_tick_work(struct kbase_device *kbdev) { struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; - unsigned long flags; lockdep_assert_held(&scheduler->lock); - spin_lock_irqsave(&scheduler->interrupt_lock, flags); - WARN_ON(scheduler->tick_timer_active); - queue_work(scheduler->wq, &scheduler->tick_work); - spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); + kbase_csf_scheduler_invoke_tick(kbdev); } static void release_doorbell(struct kbase_device *kbdev, int doorbell_nr) @@ -288,11 +386,11 @@ static void scheduler_doorbell_init(struct kbase_device *kbdev) WARN_ON(doorbell_nr != CSF_KERNEL_DOORBELL_NR); } -static u32 get_nr_active_csgs(struct kbase_device *kbdev) +u32 kbase_csf_scheduler_get_nr_active_csgs_locked(struct kbase_device *kbdev) { u32 nr_active_csgs; - lockdep_assert_held(&kbdev->csf.scheduler.lock); + lockdep_assert_held(&kbdev->csf.scheduler.interrupt_lock); nr_active_csgs = bitmap_weight(kbdev->csf.scheduler.csg_inuse_bitmap, kbdev->csf.global_iface.group_num); @@ -300,27 +398,16 @@ static u32 get_nr_active_csgs(struct kbase_device *kbdev) return nr_active_csgs; } -/** - * csgs_active - returns true if any of CSG slots are in use - * - * @kbdev: Instance of a GPU platform device that implements a CSF interface. - * - * Return: the interface is actively engaged flag. - */ -static bool csgs_active(struct kbase_device *kbdev) +u32 kbase_csf_scheduler_get_nr_active_csgs(struct kbase_device *kbdev) { u32 nr_active_csgs; + unsigned long flags; - mutex_lock(&kbdev->csf.scheduler.lock); - nr_active_csgs = get_nr_active_csgs(kbdev); - mutex_unlock(&kbdev->csf.scheduler.lock); + spin_lock_irqsave(&kbdev->csf.scheduler.interrupt_lock, flags); + nr_active_csgs = kbase_csf_scheduler_get_nr_active_csgs_locked(kbdev); + spin_unlock_irqrestore(&kbdev->csf.scheduler.interrupt_lock, flags); - /* Right now if any of the CSG interfaces are in use - * then we need to assume that there is some work pending. 
- * In future when we have IDLE notifications from firmware implemented - * then we would have a better idea of the pending work. - */ - return (nr_active_csgs != 0); + return nr_active_csgs; } /** @@ -395,7 +482,9 @@ static void scheduler_wait_protm_quit(struct kbase_device *kbdev) !kbase_csf_scheduler_protected_mode_in_use(kbdev), wt); if (!remaining) - dev_warn(kbdev->dev, "Timeout, protm_quit wait skipped"); + dev_warn(kbdev->dev, "[%llu] Timeout (%d ms), protm_quit wait skipped", + kbase_backend_get_cycle_cnt(kbdev), + kbdev->csf.fw_timeout_ms); KBASE_KTRACE_ADD(kbdev, SCHEDULER_WAIT_PROTM_QUIT_DONE, NULL, jiffies_to_msecs(remaining)); @@ -483,20 +572,198 @@ static void disable_gpu_idle_fw_timer(struct kbase_device *kbdev) spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); } +/** + * scheduler_pm_active_handle_suspend() - Acquire the PM reference count for + * Scheduler + * + * @kbdev: Pointer to the device + * @suspend_handler: Handler code for how to handle a suspend that might occur. + * + * This function is usually called when Scheduler needs to be activated. + * The PM reference count is acquired for the Scheduler and the power on + * of GPU is initiated. + */ +static int scheduler_pm_active_handle_suspend(struct kbase_device *kbdev, + enum kbase_pm_suspend_handler suspend_handler) +{ + unsigned long flags; + u32 prev_count; + int ret = 0; + + lockdep_assert_held(&kbdev->csf.scheduler.lock); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + prev_count = kbdev->csf.scheduler.pm_active_count; + if (!WARN_ON(prev_count == U32_MAX)) + kbdev->csf.scheduler.pm_active_count++; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + /* On 0 => 1, make a pm_ctx_active request */ + if (!prev_count) { + ret = kbase_pm_context_active_handle_suspend(kbdev, + suspend_handler); + if (ret) { + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbdev->csf.scheduler.pm_active_count--; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } + } + + return ret; +} + +#ifdef KBASE_PM_RUNTIME +/** + * scheduler_pm_active_after_sleep() - Acquire the PM reference count for + * Scheduler + * + * @kbdev: Pointer to the device + * @flags: flags containing previous interrupt state + * + * This function is called when Scheduler needs to be activated from the + * sleeping state. + * The PM reference count is acquired for the Scheduler and the wake up of + * MCU is initiated. It resets the flag that indicates to the MCU state + * machine that MCU needs to be put in sleep state. + * + * Note: This function shall be called with hwaccess lock held and it will + * release that lock. + * + * Return: zero when the PM reference was taken and non-zero when the + * system is being suspending/suspended. 
+ */
+static int scheduler_pm_active_after_sleep(struct kbase_device *kbdev,
+ unsigned long flags)
+{
+ u32 prev_count;
+ int ret = 0;
+
+ lockdep_assert_held(&kbdev->csf.scheduler.lock);
+ lockdep_assert_held(&kbdev->hwaccess_lock);
+
+ prev_count = kbdev->csf.scheduler.pm_active_count;
+ if (!WARN_ON(prev_count == U32_MAX))
+ kbdev->csf.scheduler.pm_active_count++;
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+
+ /* On 0 => 1, make a pm_ctx_active request */
+ if (!prev_count) {
+ ret = kbase_pm_context_active_handle_suspend(kbdev,
+ KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE);
+
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ if (ret)
+ kbdev->csf.scheduler.pm_active_count--;
+ else
+ kbdev->pm.backend.gpu_sleep_mode_active = false;
+ kbase_pm_update_state(kbdev);
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ }
+
+ return ret;
+}
+#endif
+
+/**
+ * scheduler_pm_idle() - Release the PM reference count held by Scheduler
+ *
+ * @kbdev: Pointer to the device
+ *
+ * This function is usually called after Scheduler is suspended.
+ * The PM reference count held by the Scheduler is released to trigger the
+ * power down of GPU.
+ */
+static void scheduler_pm_idle(struct kbase_device *kbdev)
+{
+ unsigned long flags;
+ u32 prev_count;
+
+ lockdep_assert_held(&kbdev->csf.scheduler.lock);
+
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ prev_count = kbdev->csf.scheduler.pm_active_count;
+ if (!WARN_ON(prev_count == 0))
+ kbdev->csf.scheduler.pm_active_count--;
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+
+ if (prev_count == 1)
+ kbase_pm_context_idle(kbdev);
+}
+
+#ifdef KBASE_PM_RUNTIME
+/**
+ * scheduler_pm_idle_before_sleep() - Release the PM reference count and
+ * trigger the transition to sleep state.
+ *
+ * @kbdev: Pointer to the device
+ *
+ * This function is called on the GPU idle notification. It releases the
+ * Scheduler's PM reference count and sets the flag to indicate to the
+ * MCU state machine that MCU needs to be put in sleep state.
+ */ +static void scheduler_pm_idle_before_sleep(struct kbase_device *kbdev) +{ + unsigned long flags; + u32 prev_count; + + lockdep_assert_held(&kbdev->csf.scheduler.lock); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + prev_count = kbdev->csf.scheduler.pm_active_count; + if (!WARN_ON(prev_count == 0)) + kbdev->csf.scheduler.pm_active_count--; + kbdev->pm.backend.gpu_sleep_mode_active = true; + kbdev->pm.backend.exit_gpu_sleep_mode = false; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (prev_count == 1) + kbase_pm_context_idle(kbdev); +} +#endif + static void scheduler_wakeup(struct kbase_device *kbdev, bool kick) { struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + int ret; lockdep_assert_held(&scheduler->lock); + if ((scheduler->state != SCHED_SUSPENDED) && + (scheduler->state != SCHED_SLEEPING)) + return; + if (scheduler->state == SCHED_SUSPENDED) { - dev_dbg(kbdev->dev, "Re-activating the Scheduler"); - kbase_csf_scheduler_pm_active(kbdev); - scheduler->state = SCHED_INACTIVE; + dev_dbg(kbdev->dev, + "Re-activating the Scheduler after suspend"); + ret = scheduler_pm_active_handle_suspend(kbdev, + KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE); + } else { +#ifdef KBASE_PM_RUNTIME + unsigned long flags; - if (kick) - scheduler_enable_tick_timer_nolock(kbdev); + dev_dbg(kbdev->dev, + "Re-activating the Scheduler out of sleep"); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + ret = scheduler_pm_active_after_sleep(kbdev, flags); + /* hwaccess_lock is released in the previous function call. */ +#endif } + + if (ret) { + /* GPUCORE-29850 would add the handling for the case where + * Scheduler could not be activated due to system suspend. + */ + dev_info(kbdev->dev, + "Couldn't wakeup Scheduler due to system suspend"); + return; + } + + scheduler->state = SCHED_INACTIVE; + + if (kick) + scheduler_enable_tick_timer_nolock(kbdev); } static void scheduler_suspend(struct kbase_device *kbdev) @@ -507,7 +774,7 @@ static void scheduler_suspend(struct kbase_device *kbdev) if (!WARN_ON(scheduler->state == SCHED_SUSPENDED)) { dev_dbg(kbdev->dev, "Suspending the Scheduler"); - kbase_csf_scheduler_pm_idle(kbdev); + scheduler_pm_idle(kbdev); scheduler->state = SCHED_SUSPENDED; } } @@ -542,11 +809,30 @@ static void update_idle_suspended_group_state(struct kbase_queue_group *group) /* If scheduler is not suspended and the given group's * static priority (reflected by the scan_seq_num) is inside - * the current tick slot-range, schedules an async tock. + * the current tick slot-range, or there are some on_slot + * idle groups, schedule an async tock. */ - if (scheduler->state != SCHED_SUSPENDED && - group->scan_seq_num < scheduler->num_csg_slots_for_tick) - schedule_in_cycle(group, true); + if (scheduler->state != SCHED_SUSPENDED) { + unsigned long flags; + int n_idle; + int n_used; + int n_slots = + group->kctx->kbdev->csf.global_iface.group_num; + + spin_lock_irqsave(&scheduler->interrupt_lock, flags); + n_idle = bitmap_weight(scheduler->csg_slots_idle_mask, + n_slots); + n_used = bitmap_weight(scheduler->csg_inuse_bitmap, + n_slots); + spin_unlock_irqrestore(&scheduler->interrupt_lock, + flags); + + if (n_idle || + n_used < scheduler->num_csg_slots_for_tick || + group->scan_seq_num < + scheduler->num_csg_slots_for_tick) + schedule_in_cycle(group, true); + } } else return; @@ -586,6 +872,14 @@ int kbase_csf_scheduler_group_get_slot(struct kbase_queue_group *group) return slot_num; } +/* kbasep_csf_scheduler_group_is_on_slot_locked() - Check if CSG is on slot. 
+ * + * @group: GPU queue group to be checked + * + * This function needs to be called with scheduler's lock held + * + * Return: true if @group is on slot. + */ static bool kbasep_csf_scheduler_group_is_on_slot_locked( struct kbase_queue_group *group) { @@ -653,11 +947,13 @@ static int halt_stream_sync(struct kbase_queue *queue) == CS_ACK_STATE_START), remaining); if (!remaining) { - dev_warn(kbdev->dev, "Timed out waiting for queue to start on csi %d bound to group %d on slot %d", + dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for queue to start on csi %d bound to group %d on slot %d", + kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, csi_index, group->handle, group->csg_nr); if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); + return -ETIMEDOUT; } @@ -678,7 +974,8 @@ static int halt_stream_sync(struct kbase_queue *queue) == CS_ACK_STATE_STOP), remaining); if (!remaining) { - dev_warn(kbdev->dev, "Timed out waiting for queue to stop on csi %d bound to group %d on slot %d", + dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for queue to stop on csi %d bound to group %d on slot %d", + kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, queue->csi_index, group->handle, group->csg_nr); /* TODO GPUCORE-25328: The CSG can't be terminated, the GPU @@ -686,6 +983,8 @@ static int halt_stream_sync(struct kbase_queue *queue) */ if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); + + } return (remaining) ? 0 : -ETIMEDOUT; } @@ -739,6 +1038,8 @@ static int sched_halt_stream(struct kbase_queue *queue) long remaining; int slot; int err = 0; + const u32 group_schedule_timeout = + 20 * kbdev->csf.scheduler.csg_scheduling_period_ms; if (WARN_ON(!group)) return -EINVAL; @@ -782,8 +1083,7 @@ retry: */ remaining = wait_event_timeout( kbdev->csf.event_wait, can_halt_stream(kbdev, group), - kbase_csf_timeout_in_jiffies( - 20 * kbdev->csf.scheduler.csg_scheduling_period_ms)); + kbase_csf_timeout_in_jiffies(group_schedule_timeout)); mutex_lock(&scheduler->lock); @@ -845,26 +1145,62 @@ retry: kbase_csf_firmware_cs_output( stream, CS_ACK)) == CS_ACK_STATE_STOP), - kbdev->csf.fw_timeout_ms); + kbase_csf_timeout_in_jiffies(kbdev->csf.fw_timeout_ms)); if (!remaining) { dev_warn(kbdev->dev, - "Timed out waiting for queue stop ack on csi %d bound to group %d on slot %d", + "[%llu] Timeout (%d ms) waiting for queue stop ack on csi %d bound to group %d on slot %d", + kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, queue->csi_index, group->handle, group->csg_nr); + + err = -ETIMEDOUT; } } } } else if (!remaining) { - dev_warn(kbdev->dev, "Group-%d failed to get a slot for stopping the queue on csi %d", - group->handle, queue->csi_index); + dev_warn(kbdev->dev, "[%llu] Group-%d failed to get a slot for stopping the queue on csi %d (timeout %d ms)", + kbase_backend_get_cycle_cnt(kbdev), + group->handle, queue->csi_index, + group_schedule_timeout); + + err = -ETIMEDOUT; } return err; } +/** + * scheduler_activate_on_queue_stop() - Activate the Scheduler when the GPU + * queue needs to be stopped. + * + * @queue: Pointer the GPU command queue + * + * This function is called when the CSI to which GPU queue is bound needs to + * be stopped. For that the corresponding queue group needs to be resident on + * the CSG slot and MCU firmware should be running. So this function makes the + * Scheduler exit the sleeping or suspended state. 
+ */ +static void scheduler_activate_on_queue_stop(struct kbase_queue *queue) +{ + struct kbase_device *kbdev = queue->kctx->kbdev; + + scheduler_wakeup(kbdev, true); + + /* Wait for MCU firmware to start running */ + if (kbase_csf_scheduler_wait_mcu_active(kbdev)) { + dev_warn( + kbdev->dev, + "[%llu] Wait for MCU active failed for stopping queue on csi %d bound to group %d of context %d_%d on slot %d", + kbase_backend_get_cycle_cnt(kbdev), + queue->csi_index, queue->group->handle, + queue->kctx->tgid, queue->kctx->id, + queue->group->csg_nr); + } +} + int kbase_csf_scheduler_queue_stop(struct kbase_queue *queue) { struct kbase_device *kbdev = queue->kctx->kbdev; @@ -890,7 +1226,7 @@ int kbase_csf_scheduler_queue_stop(struct kbase_queue *queue) /* Since the group needs to be resumed in order to stop the queue, * check if GPU needs to be powered up. */ - scheduler_wakeup(kbdev, true); + scheduler_activate_on_queue_stop(queue); if ((slot >= 0) && (atomic_read(&csg_slot[slot].state) == CSG_SLOT_RUNNING)) @@ -1228,7 +1564,9 @@ static void halt_csg_slot(struct kbase_queue_group *group, bool suspend) csg_slot_running(kbdev, slot), remaining); if (!remaining) dev_warn(kbdev->dev, - "slot %d timed out on up-running\n", slot); + "[%llu] slot %d timeout (%d ms) on up-running\n", + kbase_backend_get_cycle_cnt(kbdev), + slot, kbdev->csf.fw_timeout_ms); } if (csg_slot_running(kbdev, slot)) { @@ -1251,6 +1589,8 @@ static void halt_csg_slot(struct kbase_queue_group *group, bool suspend) csg_slot[slot].trigger_jiffies = jiffies; KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_SLOT_STOP, group, halt_cmd); + KBASE_TLSTREAM_TL_KBASE_DEVICE_HALT_CSG( + kbdev, kbdev->gpu_props.props.raw_props.gpu_id, slot); kbase_csf_ring_csg_doorbell(kbdev, slot); } } @@ -1399,37 +1739,6 @@ bool save_slot_cs(struct kbase_csf_cmd_stream_group_info const *const ginfo, return is_waiting; } -/** - * Calculate how far in the future an event should be scheduled. - * - * The objective of this function is making sure that a minimum period of - * time is guaranteed between handling two consecutive events. - * - * This function guarantees a minimum period of time between two consecutive - * events: given the minimum period and the distance between the current time - * and the last event, the function returns the difference between the two. - * However, if more time than the minimum period has already elapsed - * since the last event, the function will return 0 to schedule work to handle - * the event with the lowest latency possible. - * - * @last_event: Timestamp of the last event, in jiffies. - * @time_now: Timestamp of the new event to handle, in jiffies. - * Must be successive to last_event. - * @period: Minimum period between two events, in jiffies. - * - * Return: Time to delay work to handle the current event, in jiffies - */ -static unsigned long get_schedule_delay(unsigned long last_event, - unsigned long time_now, - unsigned long period) -{ - const unsigned long t_distance = time_now - last_event; - const unsigned long delay_t = (t_distance < period) ? 
- (period - t_distance) : 0;
-
- return delay_t;
-}
-
static void schedule_in_cycle(struct kbase_queue_group *group, bool force)
{
struct kbase_context *kctx = group->kctx;
@@ -1446,13 +1755,10 @@ static void schedule_in_cycle(struct kbase_queue_group *group, bool force)
*/
if ((likely(scheduler_timer_is_enabled_nolock(kbdev)) || force) &&
!scheduler->tock_pending_request) {
- const unsigned long delay =
- get_schedule_delay(scheduler->last_schedule, jiffies,
- CSF_SCHEDULER_TIME_TOCK_JIFFIES);
scheduler->tock_pending_request = true;
dev_dbg(kbdev->dev, "Kicking async for group %d\n", group->handle);
- mod_delayed_work(scheduler->wq, &scheduler->tock_work, delay);
+ mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0);
}
}
@@ -1494,7 +1800,8 @@ void insert_group_to_runnable(struct kbase_csf_scheduler *const scheduler,
if (likely(scheduler_timer_is_enabled_nolock(kbdev)) &&
(scheduler->total_runnable_grps == 1 ||
- scheduler->state == SCHED_SUSPENDED)) {
+ scheduler->state == SCHED_SUSPENDED ||
+ scheduler->state == SCHED_SLEEPING)) {
dev_dbg(kbdev->dev, "Kicking scheduler on first runnable group\n");
/* Fire a scheduling to start the time-slice */
enqueue_tick_work(kbdev);
@@ -1516,6 +1823,7 @@ void remove_group_from_runnable(struct kbase_csf_scheduler *const scheduler,
struct kbase_queue_group *new_head_grp;
struct list_head *list =
&kctx->csf.sched.runnable_groups[group->priority];
+ unsigned long flags;
lockdep_assert_held(&scheduler->lock);
@@ -1524,6 +1832,30 @@ void remove_group_from_runnable(struct kbase_csf_scheduler *const scheduler,
group->run_state = run_state;
list_del_init(&group->link);
+ spin_lock_irqsave(&scheduler->interrupt_lock, flags);
+ /* The below condition will be true when the group running in protected
+ * mode is being terminated but the protected mode exit interrupt wasn't
+ * received. This can happen if the FW got stuck during protected mode
+ * for some reason (like GPU page fault or some internal error).
+ * In normal cases FW is expected to send the protected mode exit
+ * interrupt before it handles the CSG termination request.
+ */
+ if (unlikely(scheduler->active_protm_grp == group)) {
+ /* CSG slot cleanup should have happened for the pmode group */
+ WARN_ON(kbasep_csf_scheduler_group_is_on_slot_locked(group));
+ WARN_ON(group->run_state != KBASE_CSF_GROUP_INACTIVE);
+ /* Initiate a GPU reset, in case it wasn't initiated yet,
+ * in order to rectify the anomaly.
+ */
+ if (kbase_prepare_to_reset_gpu(kctx->kbdev, RESET_FLAGS_NONE))
+ kbase_reset_gpu(kctx->kbdev);
+
+ KBASE_KTRACE_ADD_CSF_GRP(kctx->kbdev, SCHEDULER_EXIT_PROTM,
+ scheduler->active_protm_grp, 0u);
+ scheduler->active_protm_grp = NULL;
+ }
+ spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
+
if (scheduler->top_grp == group) {
/*
* Note: this disables explicit rotation in the next scheduling
@@ -2025,6 +2357,9 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot,
kbase_csf_firmware_csg_input(ginfo, CSG_ALLOW_OTHER,
tiler_mask & U32_MAX);
+ /* Register group UID with firmware */
+ kbase_csf_firmware_csg_input(ginfo, CSG_ITER_TRACE_CONFIG,
+ group->group_uid);
ep_cfg = CSG_EP_REQ_COMPUTE_EP_SET(ep_cfg, compute_max);
ep_cfg = CSG_EP_REQ_FRAGMENT_EP_SET(ep_cfg, fragment_max);
@@ -2077,8 +2412,9 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot,
csg_slot->priority = prio;
/* Trace the programming of the CSG on the slot */
- KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG(kbdev,
- kbdev->gpu_props.props.raw_props.gpu_id, group->handle, slot);
+ KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG(
+ kbdev, kbdev->gpu_props.props.raw_props.gpu_id, group->kctx->id,
+ group->handle, slot);
dev_dbg(kbdev->dev, "Starting group %d of context %d_%d on slot %d with priority %u\n",
group->handle, kctx->tgid, kctx->id, slot, prio);
@@ -2175,11 +2511,14 @@ static int term_group_sync(struct kbase_queue_group *group)
csg_slot_stopped_locked(kbdev, group->csg_nr), remaining);
if (!remaining) {
- dev_warn(kbdev->dev, "term request timed out for group %d of context %d_%d on slot %d",
+ dev_warn(kbdev->dev, "[%llu] term request timeout (%d ms) for group %d of context %d_%d on slot %d",
+ kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms,
group->handle, group->kctx->tgid,
group->kctx->id, group->csg_nr);
if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE))
kbase_reset_gpu(kbdev);
+
+ err = -ETIMEDOUT;
}
@@ -2190,46 +2529,70 @@ void kbase_csf_scheduler_group_deschedule(struct kbase_queue_group *group)
{
struct kbase_device *kbdev = group->kctx->kbdev;
struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler;
- long remaining =
- kbase_csf_timeout_in_jiffies(CSG_SCHED_STOP_TIMEOUT_MS);
- bool force = false;
+ bool on_slot;
kbase_reset_gpu_assert_failed_or_prevented(kbdev);
lockdep_assert_held(&group->kctx->csf.lock);
mutex_lock(&scheduler->lock);
KBASE_KTRACE_ADD_CSF_GRP(kbdev, GROUP_DESCHEDULE, group, group->run_state);
- while (queue_group_scheduled_locked(group)) {
- u32 saved_state = scheduler->state;
-
- if (!kbasep_csf_scheduler_group_is_on_slot_locked(group)) {
- sched_evict_group(group, false, true);
- } else if (saved_state == SCHED_INACTIVE || force) {
- bool as_faulty;
-
- term_group_sync(group);
- /* Treat the csg been terminated */
- as_faulty = cleanup_csg_slot(group);
- /* remove from the scheduler list */
- sched_evict_group(group, as_faulty, false);
- }
+ if (!queue_group_scheduled_locked(group))
+ goto unlock;
- /* waiting scheduler state to change */
- if (queue_group_scheduled_locked(group)) {
- mutex_unlock(&scheduler->lock);
- remaining = wait_event_timeout(
- kbdev->csf.event_wait,
- saved_state != scheduler->state,
- remaining);
- if (!remaining) {
- dev_warn(kbdev->dev, "Scheduler state change wait timed out for group %d on slot %d",
- group->handle, group->csg_nr);
- force = true;
- }
- mutex_lock(&scheduler->lock);
+ on_slot = kbasep_csf_scheduler_group_is_on_slot_locked(group);
+
+#ifdef KBASE_PM_RUNTIME
+ /* If the queue group
is on slot and Scheduler is in SLEEPING state,
+ * then we need to wait here for Scheduler to exit the sleep state
+ * (i.e. wait for the runtime suspend or power down of GPU). This would
+ * be better than aborting the power down. The group will be suspended
+ * anyways on power down, so won't have to send the CSG termination
+ * request to FW.
+ */
+ if (on_slot && (scheduler->state == SCHED_SLEEPING)) {
+ if (wait_for_scheduler_to_exit_sleep(kbdev)) {
+ dev_warn(
+ kbdev->dev,
+ "Wait for scheduler to exit sleep state timed out when terminating group %d of context %d_%d on slot %d",
+ group->handle, group->kctx->tgid,
+ group->kctx->id, group->csg_nr);
+
+ scheduler_wakeup(kbdev, true);
+
+ /* Wait for MCU firmware to start running */
+ if (kbase_csf_scheduler_wait_mcu_active(kbdev))
+ dev_warn(
+ kbdev->dev,
+ "[%llu] Wait for MCU active failed when terminating group %d of context %d_%d on slot %d",
+ kbase_backend_get_cycle_cnt(kbdev),
+ group->handle, group->kctx->tgid,
+ group->kctx->id, group->csg_nr);
}
+
+ /* Check the group state again as scheduler lock would have been
+ * released when waiting for the exit from SLEEPING state.
+ */
+ if (!queue_group_scheduled_locked(group))
+ goto unlock;
+
+ on_slot = kbasep_csf_scheduler_group_is_on_slot_locked(group);
+ }
+#endif
+ if (!on_slot) {
+ sched_evict_group(group, false, true);
+ } else {
+ bool as_faulty;
+
+ term_group_sync(group);
+ /* Treat the csg been terminated */
+ as_faulty = cleanup_csg_slot(group);
+ /* remove from the scheduler list */
+ sched_evict_group(group, as_faulty, false);
}
+ WARN_ON(queue_group_scheduled_locked(group));
+
+unlock:
mutex_unlock(&scheduler->lock);
}
@@ -2684,9 +3047,11 @@ static void program_suspending_csg_slots(struct kbase_device *kbdev)
*/
dev_warn(
kbdev->dev,
- "Group %d of context %d_%d on slot %u failed to suspend",
+ "[%llu] Group %d of context %d_%d on slot %u failed to suspend (timeout %d ms)",
+ kbase_backend_get_cycle_cnt(kbdev),
group->handle, group->kctx->tgid,
- group->kctx->id, i);
+ group->kctx->id, i,
+ kbdev->csf.fw_timeout_ms);
/* The group has failed suspension, stop
* further examination.
@@ -2784,7 +3149,9 @@ static void wait_csg_slots_start(struct kbase_device *kbdev)
group->run_state = KBASE_CSF_GROUP_RUNNABLE;
}
}
} else {
- dev_warn(kbdev->dev, "Timed out waiting for CSG slots to start, slots: 0x%*pb\n",
+ dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for CSG slots to start, slots: 0x%*pb\n",
+ kbase_backend_get_cycle_cnt(kbdev),
+ kbdev->csf.fw_timeout_ms,
num_groups, slot_mask);
if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE))
@@ -2904,9 +3271,12 @@ static int wait_csg_slots_handshake_ack(struct kbase_device *kbdev,
if (remaining)
bitmap_andnot(slot_mask, slot_mask, dones, num_groups);
- else
+ else {
+
+ /* Timed-out on the wait */
return -ETIMEDOUT;
+ }
}
return 0;
@@ -2929,7 +3299,9 @@ static void wait_csg_slots_finish_prio_update(struct kbase_device *kbdev)
*/
dev_warn(
kbdev->dev,
- "Timeout on CSG_REQ:EP_CFG, skipping the update wait: slot mask=0x%lx",
+ "[%llu] Timeout (%d ms) on CSG_REQ:EP_CFG, skipping the update wait: slot mask=0x%lx",
+ kbase_backend_get_cycle_cnt(kbdev),
+ kbdev->csf.fw_timeout_ms,
slot_mask[0]);
}
}
@@ -3075,7 +3447,11 @@ static void scheduler_group_check_protm_enter(struct kbase_device *const kbdev,
spin_lock_irqsave(&scheduler->interrupt_lock, flags);
- protm_in_use = kbase_csf_scheduler_protected_mode_in_use(kbdev);
+ /* Check if the previous transition to enter & exit the protected
+ * mode has completed or not.
+ */
+ protm_in_use = kbase_csf_scheduler_protected_mode_in_use(kbdev) ||
+ kbdev->protected_mode;
KBASE_KTRACE_ADD_CSF_GRP(kbdev, SCHEDULER_CHECK_PROTM_ENTER, input_grp,
protm_in_use);
@@ -3123,8 +3499,10 @@ static void scheduler_group_check_protm_enter(struct kbase_device *const kbdev,
KBASE_KTRACE_ADD_CSF_GRP(kbdev, SCHEDULER_ENTER_PROTM,
input_grp, 0u);
- spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
kbase_csf_enter_protected_mode(kbdev);
+ spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
+
+ kbase_csf_wait_protected_mode_enter(kbdev);
return;
}
}
@@ -3433,7 +3811,9 @@ static void scheduler_update_idle_slots_status(struct kbase_device *kbdev,
CSG_REQ_STATUS_UPDATE_MASK, csg_bitmap, wt)) {
dev_warn(
kbdev->dev,
- "Timeout on CSG_REQ:STATUS_UPDATE, treat groups as not idle: slot mask=0x%lx",
+ "[%llu] Timeout (%d ms) on CSG_REQ:STATUS_UPDATE, treat groups as not idle: slot mask=0x%lx",
+ kbase_backend_get_cycle_cnt(kbdev),
+ kbdev->csf.fw_timeout_ms,
csg_bitmap[0]);
/* Store the bitmap of timed out slots */
@@ -3576,7 +3956,7 @@ static struct kbase_queue_group *get_tock_top_group(
}
static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev,
- bool is_suspend)
+ bool system_suspend)
{
struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
DECLARE_BITMAP(slot_mask, MAX_SUPPORTED_CSGS) = { 0 };
@@ -3587,15 +3967,19 @@ static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev,
/* The suspend of CSGs failed, trigger the GPU reset and wait
* for it to complete to be in a deterministic state.
*/
- dev_warn(kbdev->dev, "Timed out waiting for CSG slots to suspend on power down, slot_mask: 0x%*pb\n",
+ dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for CSG slots to suspend on power down, slot_mask: 0x%*pb\n",
+ kbase_backend_get_cycle_cnt(kbdev),
+ kbdev->csf.fw_timeout_ms,
kbdev->csf.global_iface.group_num, slot_mask);
if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE))
kbase_reset_gpu(kbdev);
- if (is_suspend) {
+ if (system_suspend) {
mutex_unlock(&scheduler->lock);
+ kbase_reset_gpu_allow(kbdev);
kbase_reset_gpu_wait(kbdev);
+ kbase_reset_gpu_prevent_and_wait(kbdev);
mutex_lock(&scheduler->lock);
}
return -1;
@@ -3604,7 +3988,7 @@ static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev,
/* Check if the groups became active whilst the suspend was ongoing,
* but only for the case where the system suspend is not in progress
*/
- if (!is_suspend && atomic_read(&scheduler->non_idle_offslot_grps))
+ if (!system_suspend && atomic_read(&scheduler->non_idle_offslot_grps))
return -1;
return 0;
@@ -3618,7 +4002,8 @@ static bool scheduler_idle_suspendable(struct kbase_device *kbdev)
lockdep_assert_held(&scheduler->lock);
- if (scheduler->state == SCHED_SUSPENDED)
+ if ((scheduler->state == SCHED_SUSPENDED) ||
+ (scheduler->state == SCHED_SLEEPING))
return false;
spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
@@ -3639,12 +4024,66 @@ static bool scheduler_idle_suspendable(struct kbase_device *kbdev)
return suspend;
}
+#ifdef KBASE_PM_RUNTIME
+/**
+ * scheduler_sleep_on_idle - Put the Scheduler in sleeping state on GPU
+ * becoming idle.
+ *
+ * @kbdev: Pointer to the device.
+ *
+ * This function is called on GPU idle notification to trigger the transition of
+ * GPU to sleep state, where MCU firmware pauses execution and L2 cache is
+ * turned off. Scheduler's state is changed to sleeping and all the active queue
+ * groups remain on the CSG slots.
+ */
+static void scheduler_sleep_on_idle(struct kbase_device *kbdev)
+{
+ struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
+
+ lockdep_assert_held(&scheduler->lock);
+
+ dev_dbg(kbdev->dev,
+ "Scheduler to be put to sleep on GPU becoming idle");
+ cancel_tick_timer(kbdev);
+ scheduler_pm_idle_before_sleep(kbdev);
+ scheduler->state = SCHED_SLEEPING;
+}
+#endif
+
+/**
+ * scheduler_suspend_on_idle - Put the Scheduler in suspended state on GPU
+ * becoming idle.
+ *
+ * @kbdev: Pointer to the device.
+ *
+ * This function is called on GPU idle notification to trigger the power down of
+ * GPU. Scheduler's state is changed to suspended and all the active queue
+ * groups are suspended before halting the MCU firmware.
+ */
+static bool scheduler_suspend_on_idle(struct kbase_device *kbdev)
+{
+ int ret = suspend_active_groups_on_powerdown(kbdev, false);
+
+ if (ret) {
+ dev_dbg(kbdev->dev, "Aborting suspend scheduler (grps: %d)",
+ atomic_read(
+ &kbdev->csf.scheduler.non_idle_offslot_grps));
+ /* Bring forward the next tick */
+ kbase_csf_scheduler_advance_tick(kbdev);
+ return false;
+ }
+
+ dev_dbg(kbdev->dev, "Scheduler to be suspended on GPU becoming idle");
+ scheduler_suspend(kbdev);
+ cancel_tick_timer(kbdev);
+ return true;
+}
+
static void gpu_idle_worker(struct work_struct *work)
{
struct kbase_device *kbdev = container_of(
work, struct kbase_device, csf.scheduler.gpu_idle_work);
struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
- bool reset_active = false;
bool scheduler_is_idle_suspendable = false;
bool all_groups_suspended = false;
@@ -3664,27 +4103,22 @@ static void gpu_idle_worker(struct work_struct *work)
/* Cycle completed, disable the firmware idle timer */
disable_gpu_idle_fw_timer(kbdev);
scheduler_is_idle_suspendable = scheduler_idle_suspendable(kbdev);
- reset_active = kbase_reset_gpu_is_active(kbdev);
- if (scheduler_is_idle_suspendable && !reset_active) {
- all_groups_suspended =
- !suspend_active_groups_on_powerdown(kbdev, false);
-
- if (all_groups_suspended) {
- dev_dbg(kbdev->dev, "Scheduler becomes idle suspended now");
- scheduler_suspend(kbdev);
- cancel_tick_timer(kbdev);
- } else {
- dev_dbg(kbdev->dev, "Aborting suspend scheduler (grps: %d)",
- atomic_read(&scheduler->non_idle_offslot_grps));
- /* Bring forward the next tick */
- kbase_csf_scheduler_advance_tick(kbdev);
- }
+ if (scheduler_is_idle_suspendable) {
+#ifdef KBASE_PM_RUNTIME
+ if (kbase_pm_gpu_sleep_allowed(kbdev) &&
+ scheduler->total_runnable_grps)
+ scheduler_sleep_on_idle(kbdev);
+ else
+#endif
+ all_groups_suspended = scheduler_suspend_on_idle(kbdev);
}
mutex_unlock(&scheduler->lock);
kbase_reset_gpu_allow(kbdev);
KBASE_KTRACE_ADD(kbdev, IDLE_WORKER_END, NULL,
- __ENCODE_KTRACE_INFO(reset_active, scheduler_is_idle_suspendable, all_groups_suspended));
+ __ENCODE_KTRACE_INFO(false,
+ scheduler_is_idle_suspendable,
+ all_groups_suspended));
#undef __ENCODE_KTRACE_INFO
}
@@ -3777,32 +4211,151 @@ static void scheduler_handle_idle_timer_onoff(struct kbase_device *kbdev)
enable_gpu_idle_fw_timer(kbdev);
}
-static void schedule_actions(struct kbase_device *kbdev)
+/**
+ * keep_lru_on_slots() - Check the condition for LRU is met.
+ *
+ * This function tries to maintain the Last-Recent-Use case on slots, when
+ * the scheduler has no non-idle off-slot CSGs for a replacement
+ * consideration. This effectively extends the previous scheduling results
+ * for the new one. That is, the last recent used CSGs are retained on slots
+ * for the new tick/tock action.
+ *
+ * @kbdev: Pointer to the device.
+ *
+ * Return: true for avoiding on-slot CSGs changes (i.e. keep existing LRU),
+ * otherwise false.
+ */
+static bool keep_lru_on_slots(struct kbase_device *kbdev)
+{
+ struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler;
+ bool keep_lru = false;
+ int on_slots = bitmap_weight(scheduler->csg_inuse_bitmap,
+ kbdev->csf.global_iface.group_num);
+
+ lockdep_assert_held(&scheduler->lock);
+
+ if (on_slots && !atomic_read(&scheduler->non_idle_offslot_grps)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&scheduler->interrupt_lock, flags);
+ /* All on-slots are idle, no non-idle off-slot CSGs available
+ * for considering a meaningful change. Set keep_lru.
+ */
+ keep_lru = kbase_csf_scheduler_all_csgs_idle(kbdev);
+
+ if (keep_lru && !scheduler->gpu_idle_fw_timer_enabled) {
+ scheduler->gpu_idle_fw_timer_enabled = true;
+ kbase_csf_firmware_enable_gpu_idle_timer(kbdev);
+ }
+ spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
+
+ dev_dbg(kbdev->dev, "Keep_LRU: %d, CSGs on-slots: %d\n",
+ keep_lru, on_slots);
+ }
+
+ return keep_lru;
+}
+
+/**
+ * prepare_fast_local_tock() - make preparation arrangements for exercising
+ * a fast local tock inside scheduling-actions.
+ *
+ * The function assumes that a scheduling action of firing a fast local tock
+ * call (i.e. an equivalent tock action without dropping the lock) is desired
+ * if there are idle onslot CSGs. The function updates those affected CSGs'
+ * run-state as a preparation. This should only be called from inside the
+ * schedule_actions(), where the previous idle-flags are still considered to
+ * be reflective, following its earlier idle confirmation operational call,
+ * plus some potential newly idle CSGs in the scheduling action committing
+ * steps.
+ *
+ * @kbdev: Pointer to the GPU device.
+ *
+ * Return: number of on-slots CSGs that can be considered for replacing.
+ */
+static int prepare_fast_local_tock(struct kbase_device *kbdev)
+{
+ struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler;
+ u32 num_groups = kbdev->csf.global_iface.group_num;
+ unsigned long flags, i;
+ DECLARE_BITMAP(csg_bitmap, MAX_SUPPORTED_CSGS) = { 0 };
+
+ lockdep_assert_held(&scheduler->lock);
+
+ spin_lock_irqsave(&scheduler->interrupt_lock, flags);
+ bitmap_copy(csg_bitmap, scheduler->csg_slots_idle_mask, num_groups);
+ spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
+
+ /* Mark the flagged idle CSGs' run state as IDLE, so
+ * the intended fast local tock can replace them with off-slot
+ * non-idle CSGs.
+ */ + for_each_set_bit(i, csg_bitmap, num_groups) { + struct kbase_csf_csg_slot *csg_slot = &scheduler->csg_slots[i]; + struct kbase_queue_group *group = csg_slot->resident_group; + + if (!queue_group_idle_locked(group)) + group->run_state = KBASE_CSF_GROUP_IDLE; + } + + /* Return the number of idle slots for potential replacement */ + return bitmap_weight(csg_bitmap, num_groups); +} + +static void schedule_actions(struct kbase_device *kbdev, bool is_tick) { struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; unsigned long flags; struct kbase_queue_group *protm_grp; int ret; + bool skip_scheduling_actions; bool skip_idle_slots_update; bool new_protm_top_grp = false; + int local_tock_slots = 0; kbase_reset_gpu_assert_prevented(kbdev); lockdep_assert_held(&scheduler->lock); - ret = kbase_pm_wait_for_desired_state(kbdev); + ret = kbase_csf_scheduler_wait_mcu_active(kbdev); if (ret) { - dev_err(kbdev->dev, "Wait for MCU power on failed"); + dev_err(kbdev->dev, + "Wait for MCU power on failed on scheduling tick/tock"); return; } spin_lock_irqsave(&scheduler->interrupt_lock, flags); skip_idle_slots_update = kbase_csf_scheduler_protected_mode_in_use(kbdev); + skip_scheduling_actions = + !skip_idle_slots_update && kbdev->protected_mode; spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); - /* Skip updating on-slot idle CSGs if GPU is in protected mode. */ - if (!skip_idle_slots_update) + /* Skip scheduling actions as GPU reset hasn't been performed yet to + * rectify the anomaly that happened when pmode exit interrupt wasn't + * received before the termination of group running in pmode. + */ + if (unlikely(skip_scheduling_actions)) { + dev_info(kbdev->dev, + "Scheduling actions skipped due to anomaly in pmode"); + return; + } + + if (!skip_idle_slots_update) { + /* Updating on-slot idle CSGs when not in protected mode. */ scheduler_handle_idle_slots(kbdev); + /* Determine whether the condition is met for keeping the + * Last-Recent-Use. If true, skipping the remaining action + * steps and thus extending the previous tick's arrangement, + * in particular, no alterations to on-slot CSGs. + */ + if (keep_lru_on_slots(kbdev)) + return; + } + + if (is_tick) + scheduler_rotate(kbdev); + +redo_local_tock: scheduler_prepare(kbdev); spin_lock_irqsave(&scheduler->interrupt_lock, flags); protm_grp = scheduler->active_protm_grp; @@ -3866,6 +4419,21 @@ static void schedule_actions(struct kbase_device *kbdev) if (new_protm_top_grp) { scheduler_group_check_protm_enter(kbdev, scheduler->top_grp); + } else if (!local_tock_slots && + atomic_read(&scheduler->non_idle_offslot_grps)) { + /* If during the scheduling action, we have off-slot + * non-idle CSGs in waiting, if it happens to have + * some new idle slots emerging during the committed + * action steps, trigger a one-off fast local tock. + */ + local_tock_slots = prepare_fast_local_tock(kbdev); + + if (local_tock_slots) { + dev_dbg(kbdev->dev, + "In-cycle %d idle slots available\n", + local_tock_slots); + goto redo_local_tock; + } } return; @@ -3875,13 +4443,66 @@ static void schedule_actions(struct kbase_device *kbdev) return; } +/** + * can_skip_scheduling() - Check if the scheduling actions can be skipped. + * + * @kbdev: Pointer to the device + * + * This function is called on a scheduling tick or tock to determine if the + * scheduling actions can be skipped. + * If Scheduler is in sleeping state and exit from the sleep state is allowed + * then activation of MCU will be triggered. 
The tick or tock work item could + * have been in flight when the state of Scheduler was changed to sleeping. + * + * Return: true if the scheduling actions can be skipped. + */ +static bool can_skip_scheduling(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + + lockdep_assert_held(&scheduler->lock); + + if (scheduler->state == SCHED_SUSPENDED) + return true; + +#ifdef KBASE_PM_RUNTIME + if (scheduler->state == SCHED_SLEEPING) { + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + if (kbdev->pm.backend.exit_gpu_sleep_mode) { + int ret = scheduler_pm_active_after_sleep(kbdev, flags); + /* hwaccess_lock is released in the previous function + * call. + */ + if (!ret) { + scheduler->state = SCHED_INACTIVE; + return false; + } + + dev_info(kbdev->dev, + "Skip scheduling due to system suspend"); + return true; + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + return true; + } +#endif + + return false; +} + static void schedule_on_tock(struct work_struct *work) { struct kbase_device *kbdev = container_of(work, struct kbase_device, csf.scheduler.tock_work.work); struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + int err; - int err = kbase_reset_gpu_try_prevent(kbdev); + /* Tock work item is serviced */ + scheduler->tock_pending_request = false; + + err = kbase_reset_gpu_try_prevent(kbdev); /* Regardless of whether reset failed or is currently happening, exit * early */ @@ -3889,7 +4510,7 @@ static void schedule_on_tock(struct work_struct *work) return; mutex_lock(&scheduler->lock); - if (scheduler->state == SCHED_SUSPENDED) + if (can_skip_scheduling(kbdev)) goto exit_no_schedule_unlock; WARN_ON(!(scheduler->state == SCHED_INACTIVE)); @@ -3897,15 +4518,14 @@ static void schedule_on_tock(struct work_struct *work) /* Undertaking schedule action steps */ KBASE_KTRACE_ADD(kbdev, SCHEDULER_TOCK, NULL, 0u); - schedule_actions(kbdev); + schedule_actions(kbdev, false); - /* Record time information */ + /* Record time information on a non-skipped tock */ scheduler->last_schedule = jiffies; - /* Tock is serviced */ - scheduler->tock_pending_request = false; - scheduler->state = SCHED_INACTIVE; + if (!scheduler->total_runnable_grps) + queue_work(system_wq, &scheduler->gpu_idle_work); mutex_unlock(&scheduler->lock); kbase_reset_gpu_allow(kbdev); @@ -3936,17 +4556,15 @@ static void schedule_on_tick(struct work_struct *work) mutex_lock(&scheduler->lock); WARN_ON(scheduler->tick_timer_active); - if (scheduler->state == SCHED_SUSPENDED) + if (can_skip_scheduling(kbdev)) goto exit_no_schedule_unlock; scheduler->state = SCHED_BUSY; - /* Do scheduling stuff */ - scheduler_rotate(kbdev); /* Undertaking schedule action steps */ KBASE_KTRACE_ADD(kbdev, SCHEDULER_TICK, NULL, scheduler->total_runnable_grps); - schedule_actions(kbdev); + schedule_actions(kbdev, true); /* Record time information */ scheduler->last_schedule = jiffies; @@ -3958,7 +4576,8 @@ static void schedule_on_tick(struct work_struct *work) dev_dbg(kbdev->dev, "scheduling for next tick, num_runnable_groups:%u\n", scheduler->total_runnable_grps); - } + } else if (!scheduler->total_runnable_grps) + queue_work(system_wq, &scheduler->gpu_idle_work); scheduler->state = SCHED_INACTIVE; mutex_unlock(&scheduler->lock); @@ -4024,8 +4643,11 @@ static int wait_csg_slots_suspend(struct kbase_device *kbdev, } } } else { - dev_warn(kbdev->dev, "Timed out waiting for CSG slots to suspend, slot_mask: 0x%*pb\n", + dev_warn(kbdev->dev, "[%llu] Timeout waiting for 
CSG slots to suspend, slot_mask: 0x%*pb\n", + kbase_backend_get_cycle_cnt(kbdev), num_groups, slot_mask_local); + + err = -ETIMEDOUT; } } @@ -4069,7 +4691,7 @@ static int suspend_active_queue_groups_on_reset(struct kbase_device *kbdev) ret = suspend_active_queue_groups(kbdev, slot_mask); if (ret) { - dev_warn(kbdev->dev, "Timed out waiting for CSG slots to suspend before reset, slot_mask: 0x%*pb\n", + dev_warn(kbdev->dev, "Timeout waiting for CSG slots to suspend before reset, slot_mask: 0x%*pb\n", kbdev->csf.global_iface.group_num, slot_mask); } @@ -4088,7 +4710,8 @@ static int suspend_active_queue_groups_on_reset(struct kbase_device *kbdev) ret2 = kbase_gpu_wait_cache_clean_timeout(kbdev, kbdev->reset_timeout_ms); if (ret2) { - dev_warn(kbdev->dev, "Timed out waiting for cache clean to complete before reset"); + dev_warn(kbdev->dev, "[%llu] Timeout waiting for cache clean to complete before reset", + kbase_backend_get_cycle_cnt(kbdev)); if (!ret) ret = ret2; } @@ -4125,7 +4748,8 @@ static bool scheduler_handle_reset_in_protected_mode(struct kbase_device *kbdev) struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; u32 const num_groups = kbdev->csf.global_iface.group_num; struct kbase_queue_group *protm_grp; - bool suspend_on_slot_groups; + bool suspend_on_slot_groups = true; + bool pmode_active; unsigned long flags; u32 csg_nr; @@ -4133,20 +4757,51 @@ static bool scheduler_handle_reset_in_protected_mode(struct kbase_device *kbdev) spin_lock_irqsave(&scheduler->interrupt_lock, flags); protm_grp = scheduler->active_protm_grp; + pmode_active = kbdev->protected_mode; + + if (likely(!protm_grp && !pmode_active)) { + /* Case 1: GPU is not in protected mode or it successfully + * exited protected mode. All on-slot groups can be suspended in + * the regular way before reset. + */ + suspend_on_slot_groups = true; + } else if (protm_grp && pmode_active) { + /* Case 2: GPU went successfully into protected mode and hasn't + * exited from it yet and the protected mode group is still + * active. If there was no fault for the protected mode group + * then it can be suspended in the regular way before reset. + * The other normal mode on-slot groups were already implicitly + * suspended on entry to protected mode so they can be marked as + * suspended right away. + */ + suspend_on_slot_groups = !protm_grp->faulted; + } else if (!protm_grp && pmode_active) { + /* Case 3: GPU went successfully into protected mode and hasn't + * exited from it yet but the protected mode group got deleted. + * This would have happened if the FW got stuck during protected + * mode for some reason (like GPU page fault or some internal + * error). In normal cases FW is expected to send the pmode exit + * interrupt before it handles the CSG termination request. + * The other normal mode on-slot groups would already have been + * implicitly suspended on entry to protected mode so they can be + * marked as suspended right away. + */ + suspend_on_slot_groups = false; + } else if (protm_grp && !pmode_active) { + /* Case 4: GPU couldn't successfully enter protected mode, i.e. + * PROTM_ENTER request had timed out. + * All the on-slot groups need to be suspended in the regular + * way before reset. + */ + suspend_on_slot_groups = true; + } - /* If GPU wasn't in protected mode or had exited it before the GPU reset - * then all the on-slot groups can be suspended in the regular way by - * sending CSG SUSPEND requests to FW. 
- * If there wasn't a fault for protected mode group, then it would - * also need to be suspended in the regular way before the reset. - */ - suspend_on_slot_groups = !(protm_grp && protm_grp->faulted); spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); - if (!protm_grp) + if (likely(!pmode_active)) goto unlock; - /* GPU is in protected mode, so all the on-slot groups barring the + /* GPU hasn't exited protected mode, so all the on-slot groups barring * the protected mode group can be marked as suspended right away. */ for (csg_nr = 0; csg_nr < num_groups; csg_nr++) { @@ -4174,19 +4829,25 @@ unlock: return suspend_on_slot_groups; } +static void cancel_tock_work(struct kbase_csf_scheduler *const scheduler) +{ + cancel_delayed_work_sync(&scheduler->tock_work); + scheduler->tock_pending_request = false; +} + static void scheduler_inner_reset(struct kbase_device *kbdev) { u32 const num_groups = kbdev->csf.global_iface.group_num; struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; unsigned long flags; - WARN_ON(csgs_active(kbdev)); + WARN_ON(kbase_csf_scheduler_get_nr_active_csgs(kbdev)); /* Cancel any potential queued delayed work(s) */ cancel_work_sync(&kbdev->csf.scheduler.gpu_idle_work); cancel_tick_timer(kbdev); cancel_work_sync(&scheduler->tick_work); - cancel_delayed_work_sync(&scheduler->tock_work); + cancel_tock_work(scheduler); cancel_delayed_work_sync(&scheduler->ping_work); mutex_lock(&scheduler->lock); @@ -4292,10 +4953,11 @@ static void firmware_aliveness_monitor(struct work_struct *work) } #endif - if (kbdev->csf.scheduler.state == SCHED_SUSPENDED) + if (kbdev->csf.scheduler.state == SCHED_SUSPENDED || + kbdev->csf.scheduler.state == SCHED_SLEEPING) goto exit; - if (get_nr_active_csgs(kbdev) != 1) + if (kbase_csf_scheduler_get_nr_active_csgs(kbdev) != 1) goto exit; if (kbase_csf_scheduler_protected_mode_in_use(kbdev)) @@ -4307,7 +4969,7 @@ static void firmware_aliveness_monitor(struct work_struct *work) goto exit; } - kbase_pm_wait_for_desired_state(kbdev); + kbase_csf_scheduler_wait_mcu_active(kbdev); err = kbase_csf_firmware_ping_wait(kbdev); @@ -4318,7 +4980,7 @@ static void firmware_aliveness_monitor(struct work_struct *work) if (kbase_prepare_to_reset_gpu( kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); - } else if (get_nr_active_csgs(kbdev) == 1) { + } else if (kbase_csf_scheduler_get_nr_active_csgs(kbdev) == 1) { queue_delayed_work(system_long_wq, &kbdev->csf.scheduler.ping_work, msecs_to_jiffies(FIRMWARE_PING_INTERVAL_MS)); @@ -4337,13 +4999,42 @@ int kbase_csf_scheduler_group_copy_suspend_buf(struct kbase_queue_group *group, struct kbase_context *const kctx = group->kctx; struct kbase_device *const kbdev = kctx->kbdev; struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + bool on_slot; int err = 0; kbase_reset_gpu_assert_prevented(kbdev); lockdep_assert_held(&kctx->csf.lock); mutex_lock(&scheduler->lock); - if (kbasep_csf_scheduler_group_is_on_slot_locked(group)) { + on_slot = kbasep_csf_scheduler_group_is_on_slot_locked(group); + +#ifdef KBASE_PM_RUNTIME + if (on_slot && (scheduler->state == SCHED_SLEEPING)) { + if (wait_for_scheduler_to_exit_sleep(kbdev)) { + dev_warn( + kbdev->dev, + "Wait for scheduler to exit sleep state timedout when copying suspend buffer for group %d of ctx %d_%d on slot %d", + group->handle, group->kctx->tgid, + group->kctx->id, group->csg_nr); + + scheduler_wakeup(kbdev, true); + + /* Wait for MCU firmware to start running */ + if (kbase_csf_scheduler_wait_mcu_active(kbdev)) + dev_warn( + 
kbdev->dev, + "Wait for MCU active failed when copying suspend buffer for group %d of ctx %d_%d on slot %d", + group->handle, group->kctx->tgid, + group->kctx->id, group->csg_nr); + } + + /* Check the group state again as scheduler lock would have been + * released when waiting for the exit from SLEEPING state. + */ + on_slot = kbasep_csf_scheduler_group_is_on_slot_locked(group); + } +#endif + if (on_slot) { DECLARE_BITMAP(slot_mask, MAX_SUPPORTED_CSGS) = {0}; set_bit(kbase_csf_scheduler_group_get_slot(group), slot_mask); @@ -4353,8 +5044,9 @@ int kbase_csf_scheduler_group_copy_suspend_buf(struct kbase_queue_group *group, err = wait_csg_slots_suspend(kbdev, slot_mask, kbdev->csf.fw_timeout_ms); if (err) { - dev_warn(kbdev->dev, "Timed out waiting for the group %d to suspend on slot %d", - group->handle, group->csg_nr); + dev_warn(kbdev->dev, "[%llu] Timeout waiting for the group %d to suspend on slot %d", + kbase_backend_get_cycle_cnt(kbdev), + group->handle, group->csg_nr); goto exit; } } @@ -4547,20 +5239,22 @@ void kbase_csf_scheduler_group_protm_enter(struct kbase_queue_group *group) } /** - * check_sync_update_for_idle_group_protm() - Check the sync wait condition - * for all the queues bound to - * the given group. + * check_sync_update_for_on_slot_group() - Check the sync wait condition + * for all the queues bound to + * the given on-slot group. * - * @group: Pointer to the group that requires evaluation. + * @group: Pointer to the on-slot group that requires evaluation. * * This function is called if the GPU is in protected mode and there are on - * slot idle groups with higher priority than the active protected mode group. + * slot idle groups with higher priority than the active protected mode group + * or this function is called when CQS object is signaled whilst GPU is in + * sleep state. * This function will evaluate the sync condition, if any, of all the queues * bound to the given group. * * Return true if the sync condition of at least one queue has been satisfied. */ -static bool check_sync_update_for_idle_group_protm( +static bool check_sync_update_for_on_slot_group( struct kbase_queue_group *group) { struct kbase_device *const kbdev = group->kctx->kbdev; @@ -4680,7 +5374,7 @@ static bool check_sync_update_for_idle_groups_protm(struct kbase_device *kbdev) * has a higher priority than the protm group, then we * need to exit protected mode. 
*/ - if (check_sync_update_for_idle_group_protm(group)) + if (check_sync_update_for_on_slot_group(group)) exit_protm = true; } } @@ -4688,6 +5382,28 @@ static bool check_sync_update_for_idle_groups_protm(struct kbase_device *kbdev) return exit_protm; } +static void check_sync_update_in_sleep_mode(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + u32 const num_groups = kbdev->csf.global_iface.group_num; + u32 csg_nr; + + lockdep_assert_held(&scheduler->lock); + + for (csg_nr = 0; csg_nr < num_groups; csg_nr++) { + struct kbase_queue_group *const group = + kbdev->csf.scheduler.csg_slots[csg_nr].resident_group; + + if (!group) + continue; + + if (check_sync_update_for_on_slot_group(group)) { + scheduler_wakeup(kbdev, true); + return; + } + } +} + /** * check_group_sync_update_worker() - Check the sync wait condition for all the * blocked queue groups @@ -4709,6 +5425,7 @@ static void check_group_sync_update_worker(struct work_struct *work) struct kbase_context, csf.sched.sync_update_work); struct kbase_device *const kbdev = kctx->kbdev; struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + bool sync_updated = false; mutex_lock(&scheduler->lock); @@ -4719,6 +5436,7 @@ static void check_group_sync_update_worker(struct work_struct *work) list_for_each_entry_safe(group, temp, &kctx->csf.sched.idle_wait_groups, link) { if (group_sync_updated(group)) { + sync_updated = true; /* Move this group back in to the runnable * groups list of the context. */ @@ -4730,8 +5448,17 @@ static void check_group_sync_update_worker(struct work_struct *work) WARN_ON(!list_empty(&kctx->csf.sched.idle_wait_groups)); } - if (check_sync_update_for_idle_groups_protm(kbdev)) + if (check_sync_update_for_idle_groups_protm(kbdev)) { scheduler_force_protm_exit(kbdev); + sync_updated = true; + } + + /* If scheduler is in sleep or suspended state, re-activate it + * to serve on-slot CSGs blocked on CQS which has been signaled. + */ + if (!sync_updated && (scheduler->state == SCHED_SLEEPING)) + check_sync_update_in_sleep_mode(kbdev); + KBASE_KTRACE_ADD(kbdev, GROUP_SYNC_UPDATE_WORKER_END, kctx, 0u); mutex_unlock(&scheduler->lock); @@ -4829,7 +5556,6 @@ int kbase_csf_scheduler_early_init(struct kbase_device *kbdev) INIT_DEFERRABLE_WORK(&scheduler->tock_work, schedule_on_tock); INIT_DEFERRABLE_WORK(&scheduler->ping_work, firmware_aliveness_monitor); - BUILD_BUG_ON(CSF_FIRMWARE_TIMEOUT_MS >= FIRMWARE_PING_INTERVAL_MS); mutex_init(&scheduler->lock); spin_lock_init(&scheduler->interrupt_lock); @@ -4869,16 +5595,22 @@ void kbase_csf_scheduler_term(struct kbase_device *kbdev) { if (kbdev->csf.scheduler.csg_slots) { WARN_ON(atomic_read(&kbdev->csf.scheduler.non_idle_offslot_grps)); - WARN_ON(csgs_active(kbdev)); + /* The unload of Driver can take place only when all contexts have + * been terminated. The groups that were not terminated by the User + * are terminated on context termination. So no CSGs are expected + * to be active at the time of Driver unload. 
+ */ + WARN_ON(kbase_csf_scheduler_get_nr_active_csgs(kbdev)); flush_work(&kbdev->csf.scheduler.gpu_idle_work); mutex_lock(&kbdev->csf.scheduler.lock); + if (WARN_ON(kbdev->csf.scheduler.state != SCHED_SUSPENDED)) scheduler_suspend(kbdev); mutex_unlock(&kbdev->csf.scheduler.lock); cancel_delayed_work_sync(&kbdev->csf.scheduler.ping_work); cancel_tick_timer(kbdev); cancel_work_sync(&kbdev->csf.scheduler.tick_work); - cancel_delayed_work_sync(&kbdev->csf.scheduler.tock_work); + cancel_tock_work(&kbdev->csf.scheduler); mutex_destroy(&kbdev->csf.scheduler.lock); kfree(kbdev->csf.scheduler.csg_slots); kbdev->csf.scheduler.csg_slots = NULL; @@ -4911,7 +5643,8 @@ static void scheduler_enable_tick_timer_nolock(struct kbase_device *kbdev) return; WARN_ON((scheduler->state != SCHED_INACTIVE) && - (scheduler->state != SCHED_SUSPENDED)); + (scheduler->state != SCHED_SUSPENDED) && + (scheduler->state != SCHED_SLEEPING)); if (scheduler->total_runnable_grps > 0) { enqueue_tick_work(kbdev); @@ -4953,6 +5686,7 @@ void kbase_csf_scheduler_timer_set_enabled(struct kbase_device *kbdev, scheduler->timer_enabled = false; cancel_tick_timer(kbdev); cancel_delayed_work(&scheduler->tock_work); + scheduler->tock_pending_request = false; mutex_unlock(&scheduler->lock); /* The non-sync version to cancel the normal work item is not * available, so need to drop the lock before cancellation. @@ -4990,7 +5724,7 @@ void kbase_csf_scheduler_pm_suspend(struct kbase_device *kbdev) /* Cancel any potential queued delayed work(s) */ cancel_work_sync(&scheduler->tick_work); - cancel_delayed_work_sync(&scheduler->tock_work); + cancel_tock_work(scheduler); if (kbase_reset_gpu_prevent_and_wait(kbdev)) { dev_warn(kbdev->dev, @@ -5002,6 +5736,15 @@ void kbase_csf_scheduler_pm_suspend(struct kbase_device *kbdev) disable_gpu_idle_fw_timer(kbdev); +#ifdef KBASE_PM_RUNTIME + /* If scheduler is in sleeping state, then MCU needs to be activated + * to suspend CSGs. + */ + if (scheduler->state == SCHED_SLEEPING) { + dev_info(kbdev->dev, "Activating MCU out of sleep on system suspend"); + force_scheduler_to_exit_sleep(kbdev); + } +#endif if (scheduler->state != SCHED_SUSPENDED) { suspend_active_groups_on_powerdown(kbdev, true); dev_info(kbdev->dev, "Scheduler PM suspend"); @@ -5019,9 +5762,8 @@ void kbase_csf_scheduler_pm_resume(struct kbase_device *kbdev) struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; mutex_lock(&scheduler->lock); - - if (scheduler->total_runnable_grps > 0) { - WARN_ON(scheduler->state != SCHED_SUSPENDED); + if ((scheduler->total_runnable_grps > 0) && + (scheduler->state == SCHED_SUSPENDED)) { dev_info(kbdev->dev, "Scheduler PM resume"); scheduler_wakeup(kbdev, true); } @@ -5031,33 +5773,141 @@ KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_resume); void kbase_csf_scheduler_pm_active(struct kbase_device *kbdev) { + /* Here the lock is taken to synchronize against the runtime suspend + * callback function, which may need to wake up the MCU for suspending + * the CSGs before powering down the GPU. + */ + mutex_lock(&kbdev->csf.scheduler.lock); + scheduler_pm_active_handle_suspend(kbdev, + KBASE_PM_SUSPEND_HANDLER_NOT_POSSIBLE); + mutex_unlock(&kbdev->csf.scheduler.lock); +} +KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_active); + +void kbase_csf_scheduler_pm_idle(struct kbase_device *kbdev) +{ + /* Here the lock is taken just to maintain symmetry with + * kbase_csf_scheduler_pm_active(). 
+ */ + mutex_lock(&kbdev->csf.scheduler.lock); + scheduler_pm_idle(kbdev); + mutex_unlock(&kbdev->csf.scheduler.lock); +} +KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_idle); + +int kbase_csf_scheduler_wait_mcu_active(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; unsigned long flags; - u32 prev_count; + int err; + kbase_pm_lock(kbdev); + WARN_ON(!kbdev->pm.active_count); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - prev_count = kbdev->csf.scheduler.pm_active_count++; + WARN_ON(!scheduler->pm_active_count); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + kbase_pm_unlock(kbdev); - /* On 0 => 1, make a pm_ctx_active request */ - if (!prev_count) - kbase_pm_context_active(kbdev); - else - WARN_ON(prev_count == U32_MAX); + kbase_pm_wait_for_poweroff_work_complete(kbdev); + + err = kbase_pm_wait_for_desired_state(kbdev); + if (!err) { + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(kbdev->pm.backend.mcu_state != KBASE_MCU_ON); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } + + return err; } -KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_active); +KBASE_EXPORT_TEST_API(kbase_csf_scheduler_wait_mcu_active); -void kbase_csf_scheduler_pm_idle(struct kbase_device *kbdev) +#ifdef KBASE_PM_RUNTIME +int kbase_csf_scheduler_handle_runtime_suspend(struct kbase_device *kbdev) { + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; unsigned long flags; - u32 prev_count; + int ret; + + dev_dbg(kbdev->dev, "Handling runtime suspend"); + + kbase_reset_gpu_assert_prevented(kbdev); + lockdep_assert_held(&scheduler->lock); + WARN_ON(scheduler->pm_active_count); + + if (scheduler->state == SCHED_SUSPENDED) { + WARN_ON(kbdev->pm.backend.gpu_sleep_mode_active); + return 0; + } + + ret = suspend_active_groups_on_powerdown(kbdev, false); + + if (ret) { + dev_dbg(kbdev->dev, "Aborting runtime suspend (grps: %d)", + atomic_read(&scheduler->non_idle_offslot_grps)); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbdev->pm.backend.exit_gpu_sleep_mode = true; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + kbase_csf_scheduler_invoke_tick(kbdev); + return ret; + } + + scheduler->state = SCHED_SUSPENDED; spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - prev_count = kbdev->csf.scheduler.pm_active_count--; + kbdev->pm.backend.gpu_sleep_mode_active = false; spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - if (prev_count == 1) - kbase_pm_context_idle(kbdev); - else - WARN_ON(prev_count == 0); + wake_up_all(&kbdev->csf.event_wait); + return 0; +} + +void kbase_csf_scheduler_reval_idleness_post_sleep(struct kbase_device *kbdev) +{ + u32 csg_nr; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + WARN_ON(kbdev->pm.backend.mcu_state != KBASE_MCU_IN_SLEEP); + + for (csg_nr = 0; csg_nr < kbdev->csf.global_iface.group_num; csg_nr++) { + struct kbase_csf_cmd_stream_group_info *ginfo = + &kbdev->csf.global_iface.groups[csg_nr]; + bool csg_idle; + + if (!kbdev->csf.scheduler.csg_slots[csg_nr].resident_group) + continue; + + csg_idle = + kbase_csf_firmware_csg_output(ginfo, CSG_STATUS_STATE) & + CSG_STATUS_STATE_IDLE_MASK; + if (!csg_idle) { + dev_dbg(kbdev->dev, + "Re-activate Scheduler after MCU sleep"); + kbdev->pm.backend.exit_gpu_sleep_mode = true; + kbase_csf_scheduler_invoke_tick(kbdev); + break; + } + } +} + +void kbase_csf_scheduler_force_sleep(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + + mutex_lock(&scheduler->lock); + if 
(kbase_pm_gpu_sleep_allowed(kbdev) && + (scheduler->state == SCHED_INACTIVE)) + scheduler_sleep_on_idle(kbdev); + mutex_unlock(&scheduler->lock); +} +#endif + +void kbase_csf_scheduler_force_wakeup(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + + mutex_lock(&scheduler->lock); + scheduler_wakeup(kbdev, true); + mutex_unlock(&scheduler->lock); } -KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_idle); diff --git a/mali_kbase/csf/mali_kbase_csf_scheduler.h b/mali_kbase/csf/mali_kbase_csf_scheduler.h index 428ecbe..73ebb66 100644 --- a/mali_kbase/csf/mali_kbase_csf_scheduler.h +++ b/mali_kbase/csf/mali_kbase_csf_scheduler.h @@ -374,7 +374,11 @@ static inline bool kbase_csf_scheduler_protected_mode_in_use( * kbase_csf_scheduler_pm_active - Perform scheduler power active operation * * Note: This function will increase the scheduler's internal pm_active_count - * value, ensuring that both GPU and MCU are powered for access. + * value, ensuring that both GPU and MCU are powered for access. The MCU may + * not have actually become active when this function returns, so need to + * call kbase_csf_scheduler_wait_mcu_active() for that. + * + * This function should not be called with global scheduler lock held. * * @kbdev: Instance of a GPU platform device that implements a CSF interface. */ @@ -384,13 +388,27 @@ void kbase_csf_scheduler_pm_active(struct kbase_device *kbdev); * kbase_csf_scheduler_pm_idle - Perform the scheduler power idle operation * * Note: This function will decrease the scheduler's internal pm_active_count - * value. On reaching 0, the MCU and GPU could be powered off. + * value. On reaching 0, the MCU and GPU could be powered off. This function + * should not be called with global scheduler lock held. * * @kbdev: Instance of a GPU platform device that implements a CSF interface. */ void kbase_csf_scheduler_pm_idle(struct kbase_device *kbdev); /** + * kbase_csf_scheduler_wait_mcu_active - Wait for the MCU to actually become active + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * + * This function will wait for the MCU to actually become active. It is supposed + * to be called after calling kbase_csf_scheduler_pm_active(). It is needed as + * kbase_csf_scheduler_pm_active() may not make the MCU active right away. + * + * Return: 0 if the MCU was successfully activated otherwise an error code. + */ +int kbase_csf_scheduler_wait_mcu_active(struct kbase_device *kbdev); + +/** * kbase_csf_scheduler_pm_resume - Reactivate the scheduler on system resume * * @kbdev: Instance of a GPU platform device that implements a CSF interface. @@ -472,6 +490,26 @@ static inline void kbase_csf_scheduler_advance_tick(struct kbase_device *kbdev) } /** + * kbase_csf_scheduler_invoke_tick() - Invoke the scheduling tick + * + * @kbdev: Pointer to the device + * + * This function will queue the scheduling tick work item for immediate + * execution if tick timer is not active. This can be called from interrupt + * context to resume the scheduling after GPU was put to sleep. 
+ */ +static inline void kbase_csf_scheduler_invoke_tick(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + unsigned long flags; + + spin_lock_irqsave(&scheduler->interrupt_lock, flags); + if (!scheduler->tick_timer_active) + queue_work(scheduler->wq, &scheduler->tick_work); + spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); +} + +/** * kbase_csf_scheduler_queue_has_trace() - report whether the queue has been * configured to operate with the * cs_trace feature. @@ -491,4 +529,97 @@ static inline bool kbase_csf_scheduler_queue_has_trace(struct kbase_queue *queue return (queue->trace_buffer_size && queue->trace_buffer_base); } +#ifdef KBASE_PM_RUNTIME +/** + * kbase_csf_scheduler_reval_idleness_post_sleep() - Check GPU's idleness after + * putting MCU to sleep state + * + * @kbdev: Pointer to the device + * + * This function re-evaluates the idleness of on-slot queue groups after MCU + * was put to the sleep state and invokes the scheduling tick if any of the + * on-slot queue group became non-idle. + * CSG_OUTPUT_BLOCK.CSG_STATUS_STATE.IDLE bit is checked to determine the + * idleness which is updated by MCU firmware on handling of the sleep request. + * + * This function is needed to detect if more work was flushed in the window + * between the GPU idle notification and the enabling of Doorbell mirror + * interrupt (from MCU state machine). Once Doorbell mirror interrupt is + * enabled, Host can receive the notification on User doorbell rings. + */ +void kbase_csf_scheduler_reval_idleness_post_sleep(struct kbase_device *kbdev); + +/** + * kbase_csf_scheduler_handle_runtime_suspend() - Handle runtime suspend by + * suspending CSGs. + * + * @kbdev: Pointer to the device + * + * This function is called from the runtime suspend callback function for + * suspending all the on-slot queue groups. If any of the group is found to + * be non-idle after the completion of CSG suspend operation or the CSG + * suspend operation times out, then the scheduling tick is invoked and an + * error is returned so that the GPU power down can be aborted. + * + * Return: 0 if all the CSGs were suspended, otherwise an error code. + */ +int kbase_csf_scheduler_handle_runtime_suspend(struct kbase_device *kbdev); +#endif + +/** + * kbase_csf_scheduler_get_nr_active_csgs() - Get the number of active CSGs + * + * @kbdev: Pointer to the device + * + * This function calculates the number of CSG slots that have a queue group + * resident on them. + * + * Note: This function should not be used if the interrupt_lock is held. Use + * kbase_csf_scheduler_get_nr_active_csgs_locked() instead. + * + * Return: number of active CSGs. + */ +u32 kbase_csf_scheduler_get_nr_active_csgs(struct kbase_device *kbdev); + +/** + * kbase_csf_scheduler_get_nr_active_csgs_locked() - Get the number of active + * CSGs + * + * @kbdev: Pointer to the device + * + * This function calculates the number of CSG slots that have a queue group + * resident on them. + * + * Note: This function should be called with interrupt_lock held. + * + * Return: number of active CSGs. + */ +u32 kbase_csf_scheduler_get_nr_active_csgs_locked(struct kbase_device *kbdev); + +/** + * kbase_csf_scheduler_force_wakeup() - Forcefully resume the scheduling of CSGs + * + * @kbdev: Pointer to the device + * + * This function is called to forcefully resume the scheduling of CSGs, even + * when there wasn't any work submitted for them. + * This function is only used for testing purpose. 
+ */ +void kbase_csf_scheduler_force_wakeup(struct kbase_device *kbdev); + +#ifdef KBASE_PM_RUNTIME +/** + * kbase_csf_scheduler_force_sleep() - Forcefully put the Scheduler to sleeping + * state. + * + * @kbdev: Pointer to the device + * + * This function is called to forcefully put the Scheduler to sleeping state + * and trigger the sleep of MCU. If the CSGs are not idle, then the Scheduler + * would get reactivated again immediately. + * This function is only used for testing purpose. + */ +void kbase_csf_scheduler_force_sleep(struct kbase_device *kbdev); +#endif + #endif /* _KBASE_CSF_SCHEDULER_H_ */ diff --git a/mali_kbase/csf/mali_kbase_csf_tiler_heap.c b/mali_kbase/csf/mali_kbase_csf_tiler_heap.c index 8ecf235..06a7824 100644 --- a/mali_kbase/csf/mali_kbase_csf_tiler_heap.c +++ b/mali_kbase/csf/mali_kbase_csf_tiler_heap.c @@ -66,8 +66,6 @@ static u64 encode_chunk_ptr(u32 const chunk_size, u64 const chunk_addr) static struct kbase_csf_tiler_heap_chunk *get_last_chunk( struct kbase_csf_tiler_heap *const heap) { - lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock); - if (list_empty(&heap->chunks_list)) return NULL; @@ -176,7 +174,7 @@ static int init_chunk(struct kbase_csf_tiler_heap *const heap, * Return: 0 if successful or a negative error code on failure. */ static int create_chunk(struct kbase_csf_tiler_heap *const heap, - bool link_with_prev) + bool link_with_prev) { int err = 0; struct kbase_context *const kctx = heap->kctx; @@ -186,14 +184,17 @@ static int create_chunk(struct kbase_csf_tiler_heap *const heap, BASE_MEM_COHERENT_LOCAL; struct kbase_csf_tiler_heap_chunk *chunk = NULL; - flags |= base_mem_group_id_set(kctx->jit_group_id); + /* Calls to this function are inherently synchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; + + flags |= kbase_mem_group_id_set(kctx->jit_group_id); #if defined(CONFIG_MALI_DEBUG) || defined(CONFIG_MALI_VECTOR_DUMP) flags |= BASE_MEM_PROT_CPU_RD; #endif - lockdep_assert_held(&kctx->csf.tiler_heaps.lock); - chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); if (unlikely(!chunk)) { dev_err(kctx->kbdev->dev, @@ -203,8 +204,8 @@ static int create_chunk(struct kbase_csf_tiler_heap *const heap, /* Allocate GPU memory for the new chunk. 
*/ INIT_LIST_HEAD(&chunk->link); - chunk->region = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, - &flags, &chunk->gpu_va); + chunk->region = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, + &chunk->gpu_va, mmu_sync_info); if (unlikely(!chunk->region)) { dev_err(kctx->kbdev->dev, @@ -251,8 +252,6 @@ static void delete_chunk(struct kbase_csf_tiler_heap *const heap, { struct kbase_context *const kctx = heap->kctx; - lockdep_assert_held(&kctx->csf.tiler_heaps.lock); - kbase_gpu_vm_lock(kctx); chunk->region->flags &= ~KBASE_REG_NO_USER_FREE; kbase_mem_free_region(kctx, chunk->region); @@ -273,9 +272,6 @@ static void delete_chunk(struct kbase_csf_tiler_heap *const heap, static void delete_all_chunks(struct kbase_csf_tiler_heap *heap) { struct list_head *entry = NULL, *tmp = NULL; - struct kbase_context *const kctx = heap->kctx; - - lockdep_assert_held(&kctx->csf.tiler_heaps.lock); list_for_each_safe(entry, tmp, &heap->chunks_list) { struct kbase_csf_tiler_heap_chunk *chunk = list_entry( @@ -429,6 +425,9 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, "Creating a tiler heap with %u chunks (limit: %u) of size %u\n", initial_chunks, max_chunks, chunk_size); + if (!kbase_mem_allow_alloc(kctx)) + return -EINVAL; + if (chunk_size == 0) return -EINVAL; @@ -459,11 +458,9 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, heap->gpu_va = kbase_csf_heap_context_allocator_alloc(ctx_alloc); - mutex_lock(&kctx->csf.tiler_heaps.lock); - if (unlikely(!heap->gpu_va)) { - dev_err(kctx->kbdev->dev, - "Failed to allocate a tiler heap context\n"); + dev_dbg(kctx->kbdev->dev, + "Failed to allocate a tiler heap context"); err = -ENOMEM; } else { err = create_initial_chunks(heap, initial_chunks); @@ -480,13 +477,14 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, list_first_entry(&heap->chunks_list, struct kbase_csf_tiler_heap_chunk, link); + *heap_gpu_va = heap->gpu_va; + *first_chunk_va = first_chunk->gpu_va; + + mutex_lock(&kctx->csf.tiler_heaps.lock); kctx->csf.tiler_heaps.nr_of_heaps++; heap->heap_id = kctx->csf.tiler_heaps.nr_of_heaps; list_add(&heap->link, &kctx->csf.tiler_heaps.list); - *heap_gpu_va = heap->gpu_va; - *first_chunk_va = first_chunk->gpu_va; - KBASE_TLSTREAM_AUX_TILER_HEAP_STATS( kctx->kbdev, kctx->id, heap->heap_id, PFN_UP(heap->chunk_size * heap->max_chunks), @@ -496,10 +494,9 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, dev_dbg(kctx->kbdev->dev, "Created tiler heap 0x%llX\n", heap->gpu_va); + mutex_unlock(&kctx->csf.tiler_heaps.lock); } - mutex_unlock(&kctx->csf.tiler_heaps.lock); - return err; } diff --git a/mali_kbase/csf/mali_kbase_csf_timeout.c b/mali_kbase/csf/mali_kbase_csf_timeout.c index 4d93fe5..f52cbab 100644 --- a/mali_kbase/csf/mali_kbase_csf_timeout.c +++ b/mali_kbase/csf/mali_kbase_csf_timeout.c @@ -100,7 +100,7 @@ static ssize_t progress_timeout_store(struct device * const dev, if (!err) { kbase_csf_scheduler_pm_active(kbdev); - err = kbase_pm_wait_for_desired_state(kbdev); + err = kbase_csf_scheduler_wait_mcu_active(kbdev); if (!err) err = kbase_csf_firmware_set_timeout(kbdev, timeout); diff --git a/mali_kbase/csf/mali_kbase_csf_tl_reader.c b/mali_kbase/csf/mali_kbase_csf_tl_reader.c index 1824c2d..563faec 100644 --- a/mali_kbase/csf/mali_kbase_csf_tl_reader.c +++ b/mali_kbase/csf/mali_kbase_csf_tl_reader.c @@ -171,13 +171,12 @@ static int kbase_ts_converter_init( * * Return: The CPU timestamp. 
*/ -static void kbase_ts_converter_convert( - const struct kbase_ts_converter *self, - u64 *gpu_ts) +void kbase_ts_converter_convert(const struct kbase_ts_converter *self, + u64 *gpu_ts) { u64 old_gpu_ts = *gpu_ts; - *gpu_ts = div64_u64(old_gpu_ts * self->multiplier, - self->divisor) + self->offset; + *gpu_ts = div64_u64(old_gpu_ts * self->multiplier, self->divisor) + + self->offset; } /** @@ -256,6 +255,7 @@ static void tl_reader_reset(struct kbase_csf_tl_reader *self) self->tl_header.btc = 0; } + int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self) { int ret = 0; @@ -280,6 +280,7 @@ int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self) return -EBUSY; } + /* Copying the whole buffer in a single shot. We assume * that the buffer will not contain partially written messages. */ @@ -330,9 +331,8 @@ int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self) { struct kbase_csffw_tl_message *msg = (struct kbase_csffw_tl_message *) csffw_data_it; - kbase_ts_converter_convert( - &self->ts_converter, - &msg->timestamp); + kbase_ts_converter_convert(&self->ts_converter, + &msg->timestamp); } /* Copy the message out to the tl_stream. */ diff --git a/mali_kbase/csf/mali_kbase_csf_tl_reader.h b/mali_kbase/csf/mali_kbase_csf_tl_reader.h index 1b0fcd7..891a8f3 100644 --- a/mali_kbase/csf/mali_kbase_csf_tl_reader.h +++ b/mali_kbase/csf/mali_kbase_csf_tl_reader.h @@ -43,9 +43,9 @@ struct kbase_device; * struct kbase_ts_converter - * System timestamp to CPU timestamp converter state. * - * @multiplier: Numerator of the converter's fraction. - * @divisor: Denominator of the converter's fraction. - * @offset: Converter's offset term. + * @multiplier: Numerator of the converter's fraction. + * @divisor: Denominator of the converter's fraction. + * @offset: Converter's offset term. 
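The conversion above is a straight linear mapping, cpu_ts = gpu_ts * multiplier / divisor + offset, carried out in 64-bit integer arithmetic. A standalone sketch with made-up constants (the real multiplier, divisor and offset come from the converter's init, not shown here); as in the driver, the product is assumed not to overflow 64 bits:

#include <stdint.h>
#include <stdio.h>

static uint64_t example_gpu_to_cpu_ts(uint64_t gpu_ts, uint64_t multiplier,
                                       uint64_t divisor, uint64_t offset)
{
        return (gpu_ts * multiplier) / divisor + offset;
}

int main(void)
{
        /* e.g. a 26 MHz GPU timer converted to nanoseconds: scale by 1000/26 */
        uint64_t cpu_ts = example_gpu_to_cpu_ts(2600, 1000, 26, 10);

        printf("%llu\n", (unsigned long long)cpu_ts); /* 100010 ns */
        return 0;
}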
* * According to Generic timer spec, system timer: * - Increments at a fixed frequency diff --git a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_jm.h b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_jm.h index f419f70..6ba98b7 100644 --- a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_jm.h +++ b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_jm.h @@ -56,6 +56,14 @@ int dummy_array[] = { */ /* info_val==exit code; gpu_addr==chain gpuaddr */ KBASE_KTRACE_CODE_MAKE_CODE(JM_JOB_DONE), + /* gpu_addr==JS_HEAD read + * info_val==event code + */ + KBASE_KTRACE_CODE_MAKE_CODE(JM_RETURN_ATOM_TO_JS), + /* gpu_addr==JS_HEAD read + * info_val==event code + */ + KBASE_KTRACE_CODE_MAKE_CODE(JM_MARK_FOR_RETURN_TO_JS), /* gpu_addr==JS_HEAD_NEXT written, info_val==lower 32 bits of * affinity */ @@ -120,6 +128,13 @@ int dummy_array[] = { KBASE_KTRACE_CODE_MAKE_CODE(JS_ADD_JOB), /* gpu_addr==last value written/would be written to JS_HEAD */ KBASE_KTRACE_CODE_MAKE_CODE(JS_REMOVE_JOB), + /* gpu_addr==value to write into JS_HEAD + * info_val==priority of atom as a KBASE_JS_ATOM_SCHED_PRIO_<...> value + * (0 highest) + */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_PULL_JOB), + /* gpu_addr==value that would be written to JS_HEAD if run again */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_UNPULL_JOB), KBASE_KTRACE_CODE_MAKE_CODE(JS_TRY_SCHEDULE_HEAD_CTX), /* gpu_addr==value to write into JS_HEAD */ KBASE_KTRACE_CODE_MAKE_CODE(JS_JOB_DONE_TRY_RUN_NEXT_JOB), @@ -146,6 +161,25 @@ int dummy_array[] = { KBASE_KTRACE_CODE_MAKE_CODE(JS_CTX_ATTR_NOW_OFF_CTX), /* info_val == the ctx attribute now off runpool */ KBASE_KTRACE_CODE_MAKE_CODE(JS_CTX_ATTR_NOW_OFF_RUNPOOL), + /* gpu_addr==value to write into JS_HEAD */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_RETURN_WORKER), + /* gpu_addr==value to write into JS_HEAD */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_RETURN_WORKER_END), + /* info_val==priority level blocked (0 highest) */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_SLOT_PRIO_BLOCKED), + /* info_val==priority level unblocked (0 highest) + * note that the priority level may still be blocked on higher levels + */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_SLOT_PRIO_UNBLOCKED), + /* gpu_addr==value to write into JS_HEAD + * info_val==priority level unblocked - priorities at this and higher + * are unblocked (0 highest) + */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_SLOT_PRIO_AND_HIGHER_UNBLOCKED), + /* gpu_addr==value to write into JS_HEAD + * info_val==priority level blocked (0 highest) + */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_SLOT_PRIO_IS_BLOCKED), /* * Scheduler Policy events */ diff --git a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_defs_jm.h b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_defs_jm.h index c01f930..efa8ab0 100644 --- a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_defs_jm.h +++ b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_defs_jm.h @@ -45,9 +45,12 @@ * * ftrace backend now outputs kctx field (as %d_%u format). 
* + * 2.2: + * Add tracing codes for pulling, unpulling, and returns atoms to JS for + * diagnosing soft-stop path and preemption problems */ #define KBASE_KTRACE_VERSION_MAJOR 2 -#define KBASE_KTRACE_VERSION_MINOR 1 +#define KBASE_KTRACE_VERSION_MINOR 2 #endif /* KBASE_KTRACE_TARGET_RBUF */ /* diff --git a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.c b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.c index fed9c1f..05d1677 100644 --- a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.c +++ b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.c @@ -71,10 +71,11 @@ void kbasep_ktrace_backend_format_msg(struct kbase_ktrace_msg *trace_msg, } void kbasep_ktrace_add_jm(struct kbase_device *kbdev, - enum kbase_ktrace_code code, struct kbase_context *kctx, - struct kbase_jd_atom *katom, u64 gpu_addr, - kbase_ktrace_flag_t flags, int refcount, int jobslot, - u64 info_val) + enum kbase_ktrace_code code, + struct kbase_context *kctx, + const struct kbase_jd_atom *katom, u64 gpu_addr, + kbase_ktrace_flag_t flags, int refcount, int jobslot, + u64 info_val) { unsigned long irqflags; struct kbase_ktrace_msg *trace_msg; diff --git a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.h b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.h index 8b09d05..ffae8d4 100644 --- a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.h +++ b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.h @@ -41,10 +41,11 @@ * PRIVATE: do not use directly. Use KBASE_KTRACE_ADD_JM() instead. */ void kbasep_ktrace_add_jm(struct kbase_device *kbdev, - enum kbase_ktrace_code code, struct kbase_context *kctx, - struct kbase_jd_atom *katom, u64 gpu_addr, - kbase_ktrace_flag_t flags, int refcount, int jobslot, - u64 info_val); + enum kbase_ktrace_code code, + struct kbase_context *kctx, + const struct kbase_jd_atom *katom, u64 gpu_addr, + kbase_ktrace_flag_t flags, int refcount, int jobslot, + u64 info_val); #define KBASE_KTRACE_RBUF_ADD_JM(kbdev, code, kctx, katom, gpu_addr, flags, \ refcount, jobslot, info_val) \ diff --git a/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_jm.h b/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_jm.h index 2e88e69..8fa4e2a 100644 --- a/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_jm.h +++ b/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_jm.h @@ -50,6 +50,8 @@ DECLARE_EVENT_CLASS(mali_jm_slot_template, DEFINE_EVENT(mali_jm_slot_template, mali_##name, \ TP_PROTO(struct kbase_context *kctx, int jobslot, u64 info_val), \ TP_ARGS(kctx, jobslot, info_val)) +DEFINE_MALI_JM_SLOT_EVENT(JM_RETURN_ATOM_TO_JS); +DEFINE_MALI_JM_SLOT_EVENT(JM_MARK_FOR_RETURN_TO_JS); DEFINE_MALI_JM_SLOT_EVENT(JM_SUBMIT); DEFINE_MALI_JM_SLOT_EVENT(JM_JOB_DONE); DEFINE_MALI_JM_SLOT_EVENT(JM_UPDATE_HEAD); @@ -68,6 +70,7 @@ DEFINE_MALI_JM_SLOT_EVENT(JS_CORE_REF_REGISTER_ON_RECHECK_FAILED); DEFINE_MALI_JM_SLOT_EVENT(JS_AFFINITY_SUBMIT_TO_BLOCKED); DEFINE_MALI_JM_SLOT_EVENT(JS_AFFINITY_CURRENT); DEFINE_MALI_JM_SLOT_EVENT(JD_DONE_TRY_RUN_NEXT_JOB); +DEFINE_MALI_JM_SLOT_EVENT(JS_PULL_JOB); DEFINE_MALI_JM_SLOT_EVENT(JS_CORE_REF_REQUEST_CORES_FAILED); DEFINE_MALI_JM_SLOT_EVENT(JS_CORE_REF_REGISTER_INUSE_FAILED); DEFINE_MALI_JM_SLOT_EVENT(JS_CORE_REF_REQUEST_ON_RECHECK_FAILED); @@ -76,6 +79,10 @@ DEFINE_MALI_JM_SLOT_EVENT(JS_JOB_DONE_TRY_RUN_NEXT_JOB); DEFINE_MALI_JM_SLOT_EVENT(JS_JOB_DONE_RETRY_NEEDED); DEFINE_MALI_JM_SLOT_EVENT(JS_POLICY_DEQUEUE_JOB); DEFINE_MALI_JM_SLOT_EVENT(JS_POLICY_DEQUEUE_JOB_IRQ); +DEFINE_MALI_JM_SLOT_EVENT(JS_SLOT_PRIO_BLOCKED); 
+DEFINE_MALI_JM_SLOT_EVENT(JS_SLOT_PRIO_UNBLOCKED); +DEFINE_MALI_JM_SLOT_EVENT(JS_SLOT_PRIO_AND_HIGHER_UNBLOCKED); +DEFINE_MALI_JM_SLOT_EVENT(JS_SLOT_PRIO_IS_BLOCKED); #undef DEFINE_MALI_JM_SLOT_EVENT DECLARE_EVENT_CLASS(mali_jm_refcount_template, @@ -152,10 +159,13 @@ DEFINE_MALI_JM_ADD_EVENT(JM_ZAP_SCHEDULED); DEFINE_MALI_JM_ADD_EVENT(JM_ZAP_DONE); DEFINE_MALI_JM_ADD_EVENT(JM_SUBMIT_AFTER_RESET); DEFINE_MALI_JM_ADD_EVENT(JM_JOB_COMPLETE); +DEFINE_MALI_JM_ADD_EVENT(JS_UNPULL_JOB); DEFINE_MALI_JM_ADD_EVENT(JS_CTX_ATTR_NOW_ON_RUNPOOL); DEFINE_MALI_JM_ADD_EVENT(JS_CTX_ATTR_NOW_OFF_RUNPOOL); DEFINE_MALI_JM_ADD_EVENT(JS_CTX_ATTR_NOW_ON_CTX); DEFINE_MALI_JM_ADD_EVENT(JS_CTX_ATTR_NOW_OFF_CTX); +DEFINE_MALI_JM_ADD_EVENT(JS_RETURN_WORKER); +DEFINE_MALI_JM_ADD_EVENT(JS_RETURN_WORKER_END); DEFINE_MALI_JM_ADD_EVENT(JS_POLICY_TIMER_END); DEFINE_MALI_JM_ADD_EVENT(JS_POLICY_TIMER_START); DEFINE_MALI_JM_ADD_EVENT(JS_POLICY_ENQUEUE_JOB); diff --git a/mali_kbase/debug/mali_kbase_debug_ktrace_codes.h b/mali_kbase/debug/mali_kbase_debug_ktrace_codes.h index 3309834..1c6b4cd 100644 --- a/mali_kbase/debug/mali_kbase_debug_ktrace_codes.h +++ b/mali_kbase/debug/mali_kbase_debug_ktrace_codes.h @@ -138,6 +138,10 @@ int dummy_array[] = { /* info_val == policy number */ KBASE_KTRACE_CODE_MAKE_CODE(PM_CURRENT_POLICY_TERM), + KBASE_KTRACE_CODE_MAKE_CODE(PM_POWEROFF_WAIT_WQ), + KBASE_KTRACE_CODE_MAKE_CODE(PM_RUNTIME_SUSPEND_CALLBACK), + KBASE_KTRACE_CODE_MAKE_CODE(PM_RUNTIME_RESUME_CALLBACK), + /* * Context Scheduler events */ diff --git a/mali_kbase/debug/mali_kbase_debug_linux_ktrace.h b/mali_kbase/debug/mali_kbase_debug_linux_ktrace.h index b56dec4..5fac763 100644 --- a/mali_kbase/debug/mali_kbase_debug_linux_ktrace.h +++ b/mali_kbase/debug/mali_kbase_debug_linux_ktrace.h @@ -95,6 +95,9 @@ DEFINE_MALI_ADD_EVENT(PM_CA_SET_POLICY); DEFINE_MALI_ADD_EVENT(PM_CONTEXT_ACTIVE); DEFINE_MALI_ADD_EVENT(PM_CONTEXT_IDLE); DEFINE_MALI_ADD_EVENT(PM_WAKE_WAITERS); +DEFINE_MALI_ADD_EVENT(PM_POWEROFF_WAIT_WQ); +DEFINE_MALI_ADD_EVENT(PM_RUNTIME_SUSPEND_CALLBACK); +DEFINE_MALI_ADD_EVENT(PM_RUNTIME_RESUME_CALLBACK); DEFINE_MALI_ADD_EVENT(SCHED_RETAIN_CTX_NOLOCK); DEFINE_MALI_ADD_EVENT(SCHED_RELEASE_CTX); #ifdef CONFIG_MALI_ARBITER_SUPPORT diff --git a/mali_kbase/device/backend/mali_kbase_device_csf.c b/mali_kbase/device/backend/mali_kbase_device_csf.c index 0c5052b..7b37a96 100644 --- a/mali_kbase/device/backend/mali_kbase_device_csf.c +++ b/mali_kbase/device/backend/mali_kbase_device_csf.c @@ -37,6 +37,7 @@ #include <backend/gpu/mali_kbase_clk_rate_trace_mgr.h> #include <csf/mali_kbase_csf_csg_debugfs.h> #include <mali_kbase_hwcnt_virtualizer.h> +#include <mali_kbase_kinstr_prfcnt.h> #include <mali_kbase_vinstr.h> /** @@ -51,6 +52,7 @@ static void kbase_device_firmware_hwcnt_term(struct kbase_device *kbdev) { if (kbdev->csf.firmware_inited) { + kbase_kinstr_prfcnt_term(kbdev->kinstr_prfcnt_ctx); kbase_vinstr_term(kbdev->vinstr_ctx); kbase_hwcnt_virtualizer_term(kbdev->hwcnt_gpu_virt); kbase_hwcnt_backend_csf_metadata_term(&kbdev->hwcnt_gpu_iface); @@ -266,6 +268,8 @@ static const struct kbase_device_init dev_init[] = { "Timeline stream initialization failed" }, { kbase_clk_rate_trace_manager_init, kbase_clk_rate_trace_manager_term, "Clock rate trace manager initialization failed" }, + { kbase_lowest_gpu_freq_init, NULL, + "Lowest freq initialization failed" }, { kbase_device_hwcnt_backend_csf_if_init, kbase_device_hwcnt_backend_csf_if_term, "GPU hwcnt backend CSF interface creation failed" }, @@ -390,8 +394,19 @@ static int 
kbase_device_hwcnt_csf_deferred_init(struct kbase_device *kbdev) goto vinstr_fail; } + ret = kbase_kinstr_prfcnt_init(kbdev->hwcnt_gpu_virt, + &kbdev->kinstr_prfcnt_ctx); + if (ret) { + dev_err(kbdev->dev, + "Performance counter instrumentation initialization failed"); + goto kinstr_prfcnt_fail; + } + return ret; +kinstr_prfcnt_fail: + kbase_vinstr_term(kbdev->vinstr_ctx); + vinstr_fail: kbase_hwcnt_virtualizer_term(kbdev->hwcnt_gpu_virt); @@ -418,8 +433,6 @@ static int kbase_csf_firmware_deferred_init(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->fw_load_lock); - kbase_pm_context_active(kbdev); - err = kbase_csf_firmware_init(kbdev); if (!err) { unsigned long flags; @@ -432,8 +445,6 @@ static int kbase_csf_firmware_deferred_init(struct kbase_device *kbdev) dev_err(kbdev->dev, "Firmware initialization failed"); } - kbase_pm_context_idle(kbdev); - return err; } @@ -444,6 +455,8 @@ int kbase_device_firmware_init_once(struct kbase_device *kbdev) mutex_lock(&kbdev->fw_load_lock); if (!kbdev->csf.firmware_inited) { + kbase_pm_context_active(kbdev); + ret = kbase_csf_firmware_deferred_init(kbdev); if (ret) goto out; @@ -455,9 +468,10 @@ int kbase_device_firmware_init_once(struct kbase_device *kbdev) } kbase_csf_debugfs_init(kbdev); +out: + kbase_pm_context_idle(kbdev); } -out: mutex_unlock(&kbdev->fw_load_lock); return ret; diff --git a/mali_kbase/device/backend/mali_kbase_device_hw_csf.c b/mali_kbase/device/backend/mali_kbase_device_hw_csf.c index 8427edb..ae6dc1b 100644 --- a/mali_kbase/device/backend/mali_kbase_device_hw_csf.c +++ b/mali_kbase/device/backend/mali_kbase_device_hw_csf.c @@ -80,6 +80,7 @@ static void kbase_gpu_fault_interrupt(struct kbase_device *kbdev) } } else kbase_report_gpu_fault(kbdev, status, as_nr, as_valid); + } void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) @@ -124,6 +125,9 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) if (kbase_prepare_to_reset_gpu( kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); + + /* Defer the clearing to the GPU reset sequence */ + val &= ~GPU_PROTECTED_FAULT; } if (val & RESET_COMPLETED) @@ -132,6 +136,20 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) KBASE_KTRACE_ADD(kbdev, CORE_GPU_IRQ_CLEAR, NULL, val); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_CLEAR), val); +#ifdef KBASE_PM_RUNTIME + if (val & DOORBELL_MIRROR) { + unsigned long flags; + + dev_dbg(kbdev->dev, "Doorbell mirror interrupt received"); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(!kbase_csf_scheduler_get_nr_active_csgs(kbdev)); + kbase_pm_disable_db_mirror_interrupt(kbdev); + kbdev->pm.backend.exit_gpu_sleep_mode = true; + kbase_csf_scheduler_invoke_tick(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } +#endif + /* kbase_pm_check_transitions (called by kbase_pm_power_changed) must * be called after the IRQ has been cleared. 
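The firmware-init hunks above move the PM context active/idle pair out of kbase_csf_firmware_deferred_init() and into kbase_device_firmware_init_once(), so a single reference now spans the firmware load, the deferred hwcnt setup and every error path. Roughly the following shape, with hypothetical helpers rather than the driver's exact code:

static int example_firmware_init_once(struct example_dev *dev)
{
        int ret = 0;

        mutex_lock(&dev->fw_load_lock);
        if (!dev->firmware_inited) {
                /* Keep the GPU powered for everything done under the lock. */
                example_pm_context_active(dev);

                ret = example_firmware_load(dev);
                if (ret)
                        goto out;

                ret = example_hwcnt_deferred_init(dev);
                if (ret)
                        goto out;

                dev->firmware_inited = true;
out:
                /* Balanced on success and on every error path. */
                example_pm_context_idle(dev);
        }
        mutex_unlock(&dev->fw_load_lock);

        return ret;
}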
This is because it might * trigger further power transitions and we don't want to miss the @@ -160,3 +178,60 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) KBASE_KTRACE_ADD(kbdev, CORE_GPU_IRQ_DONE, NULL, val); } + +#if !IS_ENABLED(CONFIG_MALI_NO_MALI) +static bool kbase_is_register_accessible(u32 offset) +{ +#ifdef CONFIG_MALI_DEBUG + if (((offset >= MCU_SUBSYSTEM_BASE) && (offset < IPA_CONTROL_BASE)) || + ((offset >= GPU_CONTROL_MCU_BASE) && (offset < USER_BASE))) { + WARN(1, "Invalid register offset 0x%x", offset); + return false; + } +#endif + + return true; +} + +void kbase_reg_write(struct kbase_device *kbdev, u32 offset, u32 value) +{ + KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); + KBASE_DEBUG_ASSERT(kbdev->dev != NULL); + + if (!kbase_is_register_accessible(offset)) + return; + + writel(value, kbdev->reg + offset); + +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(kbdev->io_history.enabled)) + kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, + value, 1); +#endif /* CONFIG_DEBUG_FS */ + dev_dbg(kbdev->dev, "w: reg %08x val %08x", offset, value); +} +KBASE_EXPORT_TEST_API(kbase_reg_write); + +u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset) +{ + u32 val; + + KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); + KBASE_DEBUG_ASSERT(kbdev->dev != NULL); + + if (!kbase_is_register_accessible(offset)) + return 0; + + val = readl(kbdev->reg + offset); + +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(kbdev->io_history.enabled)) + kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, + val, 0); +#endif /* CONFIG_DEBUG_FS */ + dev_dbg(kbdev->dev, "r: reg %08x val %08x", offset, val); + + return val; +} +KBASE_EXPORT_TEST_API(kbase_reg_read); +#endif /* !IS_ENABLED(CONFIG_MALI_NO_MALI) */ diff --git a/mali_kbase/device/backend/mali_kbase_device_hw_jm.c b/mali_kbase/device/backend/mali_kbase_device_hw_jm.c index c4e6eb8..e8f8953 100644 --- a/mali_kbase/device/backend/mali_kbase_device_hw_jm.c +++ b/mali_kbase/device/backend/mali_kbase_device_hw_jm.c @@ -51,6 +51,7 @@ static void kbase_report_gpu_fault(struct kbase_device *kbdev, int multiple) address); if (multiple) dev_warn(kbdev->dev, "There were multiple GPU faults - some have not been reported\n"); + } void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) @@ -96,3 +97,41 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) KBASE_KTRACE_ADD(kbdev, CORE_GPU_IRQ_DONE, NULL, val); } + +#if !IS_ENABLED(CONFIG_MALI_NO_MALI) +void kbase_reg_write(struct kbase_device *kbdev, u32 offset, u32 value) +{ + KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); + KBASE_DEBUG_ASSERT(kbdev->dev != NULL); + + writel(value, kbdev->reg + offset); + +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(kbdev->io_history.enabled)) + kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, + value, 1); +#endif /* CONFIG_DEBUG_FS */ + dev_dbg(kbdev->dev, "w: reg %08x val %08x", offset, value); +} +KBASE_EXPORT_TEST_API(kbase_reg_write); + +u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset) +{ + u32 val; + + KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); + KBASE_DEBUG_ASSERT(kbdev->dev != NULL); + + val = readl(kbdev->reg + offset); + +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(kbdev->io_history.enabled)) + kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, + val, 0); +#endif /* CONFIG_DEBUG_FS */ + dev_dbg(kbdev->dev, "r: reg %08x val %08x", offset, val); + + return val; +} +KBASE_EXPORT_TEST_API(kbase_reg_read); +#endif /* !IS_ENABLED(CONFIG_MALI_NO_MALI) */ diff 
--git a/mali_kbase/device/backend/mali_kbase_device_jm.c b/mali_kbase/device/backend/mali_kbase_device_jm.c index 6a6ab60..7288e8e 100644 --- a/mali_kbase/device/backend/mali_kbase_device_jm.c +++ b/mali_kbase/device/backend/mali_kbase_device_jm.c @@ -185,6 +185,8 @@ static const struct kbase_device_init dev_init[] = { "Timeline stream initialization failed" }, { kbase_clk_rate_trace_manager_init, kbase_clk_rate_trace_manager_term, "Clock rate trace manager initialization failed" }, + { kbase_lowest_gpu_freq_init, NULL, + "Lowest freq initialization failed" }, { kbase_instr_backend_init, kbase_instr_backend_term, "Instrumentation backend initialization failed" }, { kbase_device_hwcnt_backend_jm_init, @@ -197,6 +199,8 @@ static const struct kbase_device_init dev_init[] = { "GPU hwcnt virtualizer initialization failed" }, { kbase_device_vinstr_init, kbase_device_vinstr_term, "Virtual instrumentation initialization failed" }, + { kbase_device_kinstr_prfcnt_init, kbase_device_kinstr_prfcnt_term, + "Performance counter instrumentation initialization failed" }, { kbase_backend_late_init, kbase_backend_late_term, "Late backend initialization failed" }, #ifdef MALI_KBASE_BUILD diff --git a/mali_kbase/device/mali_kbase_device.c b/mali_kbase/device/mali_kbase_device.c index 0f992c3..518aaf9 100644 --- a/mali_kbase/device/mali_kbase_device.c +++ b/mali_kbase/device/mali_kbase_device.c @@ -40,6 +40,7 @@ #include <linux/priority_control_manager.h> #include <tl/mali_kbase_timeline.h> +#include "mali_kbase_kinstr_prfcnt.h" #include "mali_kbase_vinstr.h" #include "mali_kbase_hwcnt_context.h" #include "mali_kbase_hwcnt_virtualizer.h" @@ -49,6 +50,7 @@ #include "backend/gpu/mali_kbase_pm_internal.h" #include "backend/gpu/mali_kbase_irq_internal.h" #include "mali_kbase_regs_history_debugfs.h" +#include "mali_kbase_pbha.h" #ifdef CONFIG_MALI_ARBITER_SUPPORT #include "arbiter/mali_kbase_arbiter_pm.h" @@ -273,6 +275,14 @@ int kbase_device_misc_init(struct kbase_device * const kbdev) if (err) goto dma_set_mask_failed; + /* There is no limit for Mali, so set to max. We only do this if dma_parms + * is already allocated by the platform. 
+ */ + if (kbdev->dev->dma_parms) + err = dma_set_max_seg_size(kbdev->dev, UINT_MAX); + if (err) + goto dma_set_mask_failed; + kbdev->nr_hw_address_spaces = kbdev->gpu_props.num_address_spaces; err = kbase_device_all_as_init(kbdev); @@ -282,6 +292,9 @@ int kbase_device_misc_init(struct kbase_device * const kbdev) err = kbase_ktrace_init(kbdev); if (err) goto term_as; + err = kbase_pbha_read_dtb(kbdev); + if (err) + goto term_ktrace; init_waitqueue_head(&kbdev->cache_clean_wait); @@ -309,6 +322,8 @@ int kbase_device_misc_init(struct kbase_device * const kbdev) } return 0; +term_ktrace: + kbase_ktrace_term(kbdev); term_as: kbase_device_all_as_term(kbdev); dma_set_mask_failed: @@ -395,6 +410,17 @@ void kbase_device_vinstr_term(struct kbase_device *kbdev) kbase_vinstr_term(kbdev->vinstr_ctx); } +int kbase_device_kinstr_prfcnt_init(struct kbase_device *kbdev) +{ + return kbase_kinstr_prfcnt_init(kbdev->hwcnt_gpu_virt, + &kbdev->kinstr_prfcnt_ctx); +} + +void kbase_device_kinstr_prfcnt_term(struct kbase_device *kbdev) +{ + kbase_kinstr_prfcnt_term(kbdev->kinstr_prfcnt_ctx); +} + int kbase_device_io_history_init(struct kbase_device *kbdev) { return kbase_io_history_init(&kbdev->io_history, @@ -461,6 +487,11 @@ int kbase_device_early_init(struct kbase_device *kbdev) if (err) goto fail_runtime_pm; + /* This spinlock is initialized before doing the first access to GPU + * registers and installing interrupt handlers. + */ + spin_lock_init(&kbdev->hwaccess_lock); + /* Ensure we can access the GPU registers */ kbase_pm_register_access_enable(kbdev); @@ -470,10 +501,6 @@ int kbase_device_early_init(struct kbase_device *kbdev) /* We're done accessing the GPU registers for now. */ kbase_pm_register_access_disable(kbdev); - /* This spinlock has to be initialized before installing interrupt - * handlers that require to hold it to process interrupts. 
- */ - spin_lock_init(&kbdev->hwaccess_lock); #ifdef CONFIG_MALI_ARBITER_SUPPORT if (kbdev->arb.arb_if) err = kbase_arbiter_pm_install_interrupts(kbdev); diff --git a/mali_kbase/device/mali_kbase_device_hw.c b/mali_kbase/device/mali_kbase_device_hw.c index e80559a..4c98ae1 100644 --- a/mali_kbase/device/mali_kbase_device_hw.c +++ b/mali_kbase/device/mali_kbase_device_hw.c @@ -28,44 +28,6 @@ #include <mmu/mali_kbase_mmu.h> #if !IS_ENABLED(CONFIG_MALI_NO_MALI) -void kbase_reg_write(struct kbase_device *kbdev, u32 offset, u32 value) -{ - KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); - KBASE_DEBUG_ASSERT(kbdev->dev != NULL); - - writel(value, kbdev->reg + offset); - -#if IS_ENABLED(CONFIG_DEBUG_FS) - if (unlikely(kbdev->io_history.enabled)) - kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, - value, 1); -#endif /* CONFIG_DEBUG_FS */ - dev_dbg(kbdev->dev, "w: reg %08x val %08x", offset, value); -} - -KBASE_EXPORT_TEST_API(kbase_reg_write); - -u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset) -{ - u32 val; - - KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); - KBASE_DEBUG_ASSERT(kbdev->dev != NULL); - - val = readl(kbdev->reg + offset); - -#if IS_ENABLED(CONFIG_DEBUG_FS) - if (unlikely(kbdev->io_history.enabled)) - kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, - val, 0); -#endif /* CONFIG_DEBUG_FS */ - dev_dbg(kbdev->dev, "r: reg %08x val %08x", offset, val); - - return val; -} - -KBASE_EXPORT_TEST_API(kbase_reg_read); - bool kbase_is_gpu_removed(struct kbase_device *kbdev) { u32 val; @@ -99,7 +61,7 @@ void kbase_gpu_start_cache_clean_nolock(struct kbase_device *kbdev) KBASE_KTRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, 0); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), - GPU_COMMAND_CLEAN_INV_CACHES); + GPU_COMMAND_CACHE_CLN_INV_L2); kbdev->cache_clean_in_progress = true; } @@ -134,7 +96,7 @@ void kbase_clean_caches_done(struct kbase_device *kbdev) KBASE_KTRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, 0); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), - GPU_COMMAND_CLEAN_INV_CACHES); + GPU_COMMAND_CACHE_CLN_INV_L2); } else { /* Disable interrupt */ irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); diff --git a/mali_kbase/device/mali_kbase_device_internal.h b/mali_kbase/device/mali_kbase_device_internal.h index d422407..d4f6875 100644 --- a/mali_kbase/device/mali_kbase_device_internal.h +++ b/mali_kbase/device/mali_kbase_device_internal.h @@ -39,6 +39,9 @@ struct kbase_device_init { int kbase_device_vinstr_init(struct kbase_device *kbdev); void kbase_device_vinstr_term(struct kbase_device *kbdev); +int kbase_device_kinstr_prfcnt_init(struct kbase_device *kbdev); +void kbase_device_kinstr_prfcnt_term(struct kbase_device *kbdev); + int kbase_device_timeline_init(struct kbase_device *kbdev); void kbase_device_timeline_term(struct kbase_device *kbdev); diff --git a/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c b/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c index f9d4c14..7499729 100644 --- a/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c +++ b/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c @@ -42,15 +42,19 @@ const char *kbase_gpu_exception_name(u32 const exception_code) case CS_FATAL_EXCEPTION_TYPE_CS_ENDPOINT_FAULT: e = "FATAL_CS_ENDPOINT_FAULT"; break; - case CS_FATAL_EXCEPTION_TYPE_CS_BUS_FAULT: - e = "FATAL_CS_BUS_FAULT"; - break; case CS_FATAL_EXCEPTION_TYPE_CS_INVALID_INSTRUCTION: e = "FATAL_CS_INVALID_INSTRUCTION"; break; case CS_FATAL_EXCEPTION_TYPE_CS_CALL_STACK_OVERFLOW: e = 
"FATAL_CS_CALL_STACK_OVERFLOW"; break; + /* + * CS_FAULT_EXCEPTION_TYPE_CS_BUS_FAULT and CS_FATAL_EXCEPTION_TYPE_CS_BUS_FAULT share the same error code + * Type of CS_BUS_FAULT will be differentiated by CSF exception handler + */ + case CS_FAULT_EXCEPTION_TYPE_CS_BUS_FAULT: + e = "CS_BUS_FAULT"; + break; /* Shader exceptions */ case CS_FAULT_EXCEPTION_TYPE_INSTR_INVALID_PC: e = "INSTR_INVALID_PC"; @@ -61,6 +65,10 @@ const char *kbase_gpu_exception_name(u32 const exception_code) case CS_FAULT_EXCEPTION_TYPE_INSTR_BARRIER_FAULT: e = "INSTR_BARRIER_FAULT"; break; + /* Iterator exceptions */ + case CS_FAULT_EXCEPTION_TYPE_KABOOM: + e = "KABOOM"; + break; /* Misc exceptions */ case CS_FAULT_EXCEPTION_TYPE_DATA_INVALID_FAULT: e = "DATA_INVALID_FAULT"; diff --git a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.c b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.c index 4737b0e..e240117 100644 --- a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.c +++ b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.c @@ -44,8 +44,9 @@ static inline u32 kbase_ipa_read_hwcnt( u32 offset) { u8 *p = (u8 *)model_data->dump_buf.dump_buf; + u64 val = *(u64 *)&p[offset]; - return *(u32 *)&p[offset]; + return (val > U32_MAX) ? U32_MAX : (u32)val; } static inline s64 kbase_ipa_add_saturate(s64 a, s64 b) diff --git a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.h b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.h index 3486a9b..faf08ef 100644 --- a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.h +++ b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.h @@ -30,7 +30,7 @@ #define KBASE_IPA_MAX_GROUP_DEF_NUM 16 /* Number of bytes per hardware counter in a vinstr_buffer. */ -#define KBASE_IPA_NR_BYTES_PER_CNT 4 +#define KBASE_IPA_NR_BYTES_PER_CNT (sizeof(u64)) /* Number of hardware counters per block in a vinstr_buffer. 
*/ #define KBASE_IPA_NR_CNT_PER_BLOCK 64 diff --git a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_csf.c b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_csf.c index 1852c3c..a47699c 100644 --- a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_csf.c +++ b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_csf.c @@ -25,14 +25,18 @@ /* MEMSYS counter block offsets */ #define L2_RD_MSG_IN (16) #define L2_WR_MSG_IN (18) +#define L2_RD_MSG_OUT (22) #define L2_READ_LOOKUP (26) #define L2_EXT_WRITE_NOSNP_FULL (43) /* SC counter block offsets */ +#define FRAG_STARVING (8) +#define FRAG_PARTIAL_QUADS_RAST (10) #define FRAG_QUADS_EZS_UPDATE (13) #define FULL_QUAD_WARPS (21) #define EXEC_INSTR_FMA (27) #define EXEC_INSTR_CVT (28) +#define EXEC_INSTR_MSG (30) #define TEX_FILT_NUM_OPS (39) #define LS_MEM_READ_SHORT (45) #define LS_MEM_WRITE_SHORT (47) @@ -44,6 +48,8 @@ #define VFETCH_POS_READ_WAIT (29) #define VFETCH_VERTEX_WAIT (30) #define IDVS_VAR_SHAD_STALL (38) +#define ITER_STALL (40) +#define PMGR_PTR_RD_STALL (48) #define COUNTER_DEF(cnt_name, coeff, cnt_idx, block_type) \ { \ @@ -80,6 +86,33 @@ static const struct kbase_ipa_counter ipa_top_level_cntrs_def_todx[] = { TILER_COUNTER_DEF("vfetch_pos_read_wait", -119118, VFETCH_POS_READ_WAIT), }; +static const struct kbase_ipa_counter ipa_top_level_cntrs_def_tgrx[] = { + MEMSYS_COUNTER_DEF("l2_rd_msg_in", 295631, L2_RD_MSG_IN), + MEMSYS_COUNTER_DEF("l2_ext_write_nosnp_ull", 325168, L2_EXT_WRITE_NOSNP_FULL), + + TILER_COUNTER_DEF("prefetch_stall", 145435, PREFETCH_STALL), + TILER_COUNTER_DEF("idvs_var_shad_stall", -171917, IDVS_VAR_SHAD_STALL), + TILER_COUNTER_DEF("idvs_pos_shad_stall", 109980, IDVS_POS_SHAD_STALL), + TILER_COUNTER_DEF("vfetch_pos_read_wait", -119118, VFETCH_POS_READ_WAIT), +}; + +static const struct kbase_ipa_counter ipa_top_level_cntrs_def_tvax[] = { + MEMSYS_COUNTER_DEF("l2_rd_msg_out", 491414, L2_RD_MSG_OUT), + MEMSYS_COUNTER_DEF("l2_wr_msg_in", 408645, L2_WR_MSG_IN), + + TILER_COUNTER_DEF("iter_stall", 893324, ITER_STALL), + TILER_COUNTER_DEF("pmgr_ptr_rd_stall", -975117, PMGR_PTR_RD_STALL), + TILER_COUNTER_DEF("idvs_pos_shad_stall", 22555, IDVS_POS_SHAD_STALL), +}; + +static const struct kbase_ipa_counter ipa_top_level_cntrs_def_ttux[] = { + MEMSYS_COUNTER_DEF("l2_rd_msg_in", 800836, L2_RD_MSG_IN), + MEMSYS_COUNTER_DEF("l2_wr_msg_in", 415579, L2_WR_MSG_IN), + MEMSYS_COUNTER_DEF("l2_read_lookup", -198124, L2_READ_LOOKUP), + + TILER_COUNTER_DEF("idvs_pos_shad_stall", 117358, IDVS_POS_SHAD_STALL), + TILER_COUNTER_DEF("vfetch_vertex_wait", -391964, VFETCH_VERTEX_WAIT), +}; /* These tables provide a description of each performance counter * used by the shader cores counter model for energy estimation. 
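These counter tables pair each hardware counter with a scaled coefficient (negative values presumably capturing activity that correlates inversely with power). As a rough illustration of the general shape of such a counter model - an assumption for illustration only, since the driver's arithmetic also applies voltage and frequency scaling not shown here - the dynamic term is a weighted sum of counter activity:

#include <stddef.h>
#include <stdint.h>

struct example_counter {
        const char *name;
        int64_t coeff;   /* scaled coefficient from the table */
        uint32_t delta;  /* counter increase over the sample period */
};

static int64_t example_dynamic_sum(const struct example_counter *c, size_t n)
{
        int64_t total = 0;
        size_t i;

        /* Weighted sum only; a real model still scales by voltage/frequency. */
        for (i = 0; i < n; i++)
                total += c[i].coeff * (int64_t)c[i].delta;

        return total;
}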
@@ -93,6 +126,32 @@ static const struct kbase_ipa_counter ipa_shader_core_cntrs_def_todx[] = { SC_COUNTER_DEF("vary_slot_16", 181069, VARY_SLOT_16), }; +static const struct kbase_ipa_counter ipa_shader_core_cntrs_def_tgrx[] = { + SC_COUNTER_DEF("exec_instr_fma", 505449, EXEC_INSTR_FMA), + SC_COUNTER_DEF("tex_filt_num_operations", 574869, TEX_FILT_NUM_OPS), + SC_COUNTER_DEF("ls_mem_read_short", 60917, LS_MEM_READ_SHORT), + SC_COUNTER_DEF("frag_quads_ezs_update", 694555, FRAG_QUADS_EZS_UPDATE), + SC_COUNTER_DEF("ls_mem_write_short", 698290, LS_MEM_WRITE_SHORT), + SC_COUNTER_DEF("vary_slot_16", 181069, VARY_SLOT_16), +}; + +static const struct kbase_ipa_counter ipa_shader_core_cntrs_def_tvax[] = { + SC_COUNTER_DEF("tex_filt_num_operations", 142536, TEX_FILT_NUM_OPS), + SC_COUNTER_DEF("exec_instr_fma", 243497, EXEC_INSTR_FMA), + SC_COUNTER_DEF("exec_instr_msg", 1344410, EXEC_INSTR_MSG), + SC_COUNTER_DEF("vary_slot_16", -119612, VARY_SLOT_16), + SC_COUNTER_DEF("frag_partial_quads_rast", 676201, FRAG_PARTIAL_QUADS_RAST), + SC_COUNTER_DEF("frag_starving", 62421, FRAG_STARVING), +}; + +static const struct kbase_ipa_counter ipa_shader_core_cntrs_def_ttux[] = { + SC_COUNTER_DEF("exec_instr_fma", 457012, EXEC_INSTR_FMA), + SC_COUNTER_DEF("tex_filt_num_operations", 441911, TEX_FILT_NUM_OPS), + SC_COUNTER_DEF("ls_mem_read_short", 322525, LS_MEM_READ_SHORT), + SC_COUNTER_DEF("full_quad_warps", 844124, FULL_QUAD_WARPS), + SC_COUNTER_DEF("exec_instr_cvt", 226411, EXEC_INSTR_CVT), + SC_COUNTER_DEF("frag_quads_ezs_update",372032, FRAG_QUADS_EZS_UPDATE), +}; #define IPA_POWER_MODEL_OPS(gpu, init_token) \ const struct kbase_ipa_model_ops kbase_ ## gpu ## _ipa_model_ops = { \ @@ -128,13 +187,21 @@ static const struct kbase_ipa_counter ipa_shader_core_cntrs_def_todx[] = { /* Reference voltage value is 750 mV. 
*/ STANDARD_POWER_MODEL(todx, 750); +STANDARD_POWER_MODEL(tgrx, 750); +STANDARD_POWER_MODEL(tvax, 750); +STANDARD_POWER_MODEL(ttux, 750); /* Assuming LODX is an alias of TODX for IPA */ ALIAS_POWER_MODEL(lodx, todx); +/* Assuming LTUX is an alias of TTUX for IPA */ +ALIAS_POWER_MODEL(ltux, ttux); + static const struct kbase_ipa_model_ops *ipa_counter_model_ops[] = { &kbase_todx_ipa_model_ops, &kbase_lodx_ipa_model_ops, + &kbase_tgrx_ipa_model_ops, &kbase_tvax_ipa_model_ops, + &kbase_ttux_ipa_model_ops, &kbase_ltux_ipa_model_ops }; const struct kbase_ipa_model_ops *kbase_ipa_counter_model_ops_find( @@ -165,6 +232,14 @@ const char *kbase_ipa_counter_model_name_from_id(u32 gpu_id) return "mali-todx-power-model"; case GPU_ID2_PRODUCT_LODX: return "mali-lodx-power-model"; + case GPU_ID2_PRODUCT_TGRX: + return "mali-tgrx-power-model"; + case GPU_ID2_PRODUCT_TVAX: + return "mali-tvax-power-model"; + case GPU_ID2_PRODUCT_TTUX: + return "mali-ttux-power-model"; + case GPU_ID2_PRODUCT_LTUX: + return "mali-ltux-power-model"; default: return NULL; } diff --git a/mali_kbase/ipa/mali_kbase_ipa_debugfs.c b/mali_kbase/ipa/mali_kbase_ipa_debugfs.c index 5976389..14df542 100644 --- a/mali_kbase/ipa/mali_kbase_ipa_debugfs.c +++ b/mali_kbase/ipa/mali_kbase_ipa_debugfs.c @@ -247,7 +247,7 @@ static void kbase_ipa_model_debugfs_init(struct kbase_ipa_model *model) dir = debugfs_create_dir(model->ops->name, model->kbdev->mali_debugfs_directory); - if (!dir) { + if (IS_ERR_OR_NULL(dir)) { dev_err(model->kbdev->dev, "Couldn't create mali debugfs %s directory", model->ops->name); diff --git a/mali_kbase/jm/mali_kbase_jm_defs.h b/mali_kbase/jm/mali_kbase_jm_defs.h index c490f1c..cb1c276 100644 --- a/mali_kbase/jm/mali_kbase_jm_defs.h +++ b/mali_kbase/jm/mali_kbase_jm_defs.h @@ -87,8 +87,6 @@ #define KBASE_KATOM_FLAG_FAIL_BLOCKER (1<<8) /* Atom is currently in the list of atoms blocked on cross-slot dependencies */ #define KBASE_KATOM_FLAG_JSCTX_IN_X_DEP_LIST (1<<9) -/* Atom is currently holding a context reference */ -#define KBASE_KATOM_FLAG_HOLDING_CTX_REF (1<<10) /* Atom requires GPU to be in protected mode */ #define KBASE_KATOM_FLAG_PROTECTED (1<<11) /* Atom has been stored in runnable_tree */ @@ -176,7 +174,7 @@ struct kbase_jd_atom_dependency { static inline const struct kbase_jd_atom * kbase_jd_katom_dep_atom(const struct kbase_jd_atom_dependency *dep) { - LOCAL_ASSERT(dep != NULL); + KBASE_DEBUG_ASSERT(dep != NULL); return (const struct kbase_jd_atom *)(dep->atom); } @@ -191,7 +189,7 @@ kbase_jd_katom_dep_atom(const struct kbase_jd_atom_dependency *dep) static inline u8 kbase_jd_katom_dep_type( const struct kbase_jd_atom_dependency *dep) { - LOCAL_ASSERT(dep != NULL); + KBASE_DEBUG_ASSERT(dep != NULL); return dep->dep_type; } @@ -209,7 +207,7 @@ static inline void kbase_jd_katom_dep_set( { struct kbase_jd_atom_dependency *dep; - LOCAL_ASSERT(const_dep != NULL); + KBASE_DEBUG_ASSERT(const_dep != NULL); dep = (struct kbase_jd_atom_dependency *)const_dep; @@ -227,7 +225,7 @@ static inline void kbase_jd_katom_dep_clear( { struct kbase_jd_atom_dependency *dep; - LOCAL_ASSERT(const_dep != NULL); + KBASE_DEBUG_ASSERT(const_dep != NULL); dep = (struct kbase_jd_atom_dependency *)const_dep; @@ -653,6 +651,48 @@ static inline bool kbase_jd_katom_is_protected( return (bool)(katom->atom_flags & KBASE_KATOM_FLAG_PROTECTED); } +/** + * kbase_atom_is_younger - query if one atom is younger by age than another + * @katom_a the first atom + * @katom_a the second atom + * + * Return: true if the first atom is strictly younger 
than the second, false + * otherwise. + */ +static inline bool kbase_jd_atom_is_younger(const struct kbase_jd_atom *katom_a, + const struct kbase_jd_atom *katom_b) +{ + return ((s32)(katom_a->age - katom_b->age) < 0); +} + +/** + * kbase_jd_atom_is_earlier + * @katom_a: the first atom + * @katom_b: the second atom + * + * Return: true if the first atom has been submitted earlier than the + * second atom. It is used to understand if an atom that is ready has been + * submitted earlier than the currently running atom, so that the currently + * running atom should be preempted to allow the ready atom to run. + */ +static inline bool kbase_jd_atom_is_earlier(const struct kbase_jd_atom *katom_a, + const struct kbase_jd_atom *katom_b) +{ + /* No seq_nr set? */ + if (!katom_a->seq_nr || !katom_b->seq_nr) + return false; + + /* Efficiently handle the unlikely case of wrapping. + * The following code assumes that the delta between the sequence number + * of the two atoms is less than INT64_MAX. + * In the extremely unlikely case where the delta is higher, the comparison + * defaults for no preemption. + * The code also assumes that the conversion from unsigned to signed types + * works because the signed integers are 2's complement. + */ + return (s64)(katom_a->seq_nr - katom_b->seq_nr) < 0; +} + /* * Theory of operations: * diff --git a/mali_kbase/jm/mali_kbase_jm_js.h b/mali_kbase/jm/mali_kbase_jm_js.h index 5e0c4bc..5a972a5 100644 --- a/mali_kbase/jm/mali_kbase_jm_js.h +++ b/mali_kbase/jm/mali_kbase_jm_js.h @@ -108,6 +108,52 @@ int kbasep_js_kctx_init(struct kbase_context *const kctx); */ void kbasep_js_kctx_term(struct kbase_context *kctx); +/* kbase_jsctx_slot_prio_blocked_set - Set a context as being blocked for a job + * slot at and below a given priority level + * @kctx: The kbase_context + * @js: The job slot + * @sched_prio: The priority levels that the context is blocked at for @js (all + * priority levels at this level and below will be blocked) + * + * To preserve ordering and dependencies of atoms on soft-stopping (both within + * an between priority levels), a context must be marked as blocked for that + * atom's job slot, for all priority levels at or below the atom's priority. + * + * This must only be called due to an atom that was pulled from the context, + * otherwise there will be no way of unblocking the context when the atom is + * completed/unpulled. + * + * Atoms of higher priority might still be able to be pulled from the context + * on @js. This helps with starting a high priority atom as soon as possible. + */ +static inline void kbase_jsctx_slot_prio_blocked_set(struct kbase_context *kctx, + int js, int sched_prio) +{ + struct kbase_jsctx_slot_tracking *slot_tracking = + &kctx->slot_tracking[js]; + + lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + WARN(!slot_tracking->atoms_pulled_pri[sched_prio], + "When marking slot %d as blocked for priority %d on a kctx, no atoms were pulled - the slot cannot become unblocked", + js, sched_prio); + + slot_tracking->blocked |= ((kbase_js_prio_bitmap_t)1) << sched_prio; + KBASE_KTRACE_ADD_JM_SLOT_INFO(kctx->kbdev, JS_SLOT_PRIO_BLOCKED, kctx, + NULL, 0, js, (unsigned int)sched_prio); +} + +/* kbase_jsctx_atoms_pulled - Return number of atoms pulled on a context + * @kctx: The kbase_context + * + * Having atoms pulled indicates the context is not idle. 
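kbase_jd_atom_is_younger() and kbase_jd_atom_is_earlier() above use the same trick: subtract the unsigned counters and test the sign of the result as a signed value, which keeps working across a counter wrap as long as the two values are within half the range of each other (and, as the in-code comment notes, relies on two's-complement conversion). A standalone demonstration:

#include <stdint.h>
#include <stdio.h>

static int example_is_earlier(uint64_t seq_a, uint64_t seq_b)
{
        /* Negative difference (read as signed) means seq_a was submitted earlier. */
        return (int64_t)(seq_a - seq_b) < 0;
}

int main(void)
{
        /* seq_a assigned just before the counter wrapped, seq_b just after. */
        uint64_t seq_a = UINT64_MAX - 1, seq_b = 2;

        printf("%d\n", example_is_earlier(seq_a, seq_b)); /* prints 1 */
        return 0;
}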
+ * + * Return: the number of atoms pulled on @kctx + */ +static inline int kbase_jsctx_atoms_pulled(struct kbase_context *kctx) +{ + return atomic_read(&kctx->atoms_pulled_all_slots); +} + /** * kbasep_js_add_job - Add a job chain to the Job Scheduler, * and take necessary actions to @@ -947,7 +993,38 @@ static inline base_jd_prio kbasep_js_sched_prio_to_atom_prio(int sched_prio) * * Return: The same or lower priority than requested. */ - base_jd_prio kbase_js_priority_check(struct kbase_device *kbdev, base_jd_prio priority); +/** + * kbase_js_atom_runs_before - determine if atoms for the same slot have an + * ordering relation + * @kbdev: kbase device + * @katom_a: the first atom + * @katom_b: the second atom. + * @order_flags: combination of KBASE_ATOM_ORDERING_FLAG_<...> for the ordering + * relation + * + * This is for making consistent decisions about the ordering of atoms when we + * need to do pre-emption on a slot, which includes stopping existing atoms + * when a new atom is ready to run, and also which other atoms to remove from + * the slot when the atom in JSn_HEAD is being pre-empted. + * + * This only handles @katom_a and @katom_b being for the same job slot, as + * pre-emption only operates within a slot. + * + * Note: there is currently no use-case for this as a sorting comparison + * functions, hence only a boolean returned instead of int -1, 0, +1 return. If + * required in future, a modification to do so would be better than calling + * twice with katom_a and katom_b swapped. + * + * Return: + * true if @katom_a should run before @katom_b, false otherwise. + * A false return value does not distinguish between "no ordering relation" and + * "@katom_a should run after @katom_b". + */ +bool kbase_js_atom_runs_before(struct kbase_device *kbdev, + const struct kbase_jd_atom *katom_a, + const struct kbase_jd_atom *katom_b, + const kbase_atom_ordering_flag_t order_flags); + #endif /* _KBASE_JM_JS_H_ */ diff --git a/mali_kbase/jm/mali_kbase_js_defs.h b/mali_kbase/jm/mali_kbase_js_defs.h index 75152fb..a1d40ba 100644 --- a/mali_kbase/jm/mali_kbase_js_defs.h +++ b/mali_kbase/jm/mali_kbase_js_defs.h @@ -187,6 +187,33 @@ enum { */ #define KBASE_JS_ATOM_SCHED_PRIO_DEFAULT KBASE_JS_ATOM_SCHED_PRIO_MED +/* Atom priority bitmaps, where bit 0 is the highest priority, and higher bits + * indicate successively lower KBASE_JS_ATOM_SCHED_PRIO_<...> levels. + * + * Must be strictly larger than the number of bits to represent a bitmap of + * priorities, so that we can do calculations such as: + * (1 << KBASE_JS_ATOM_SCHED_PRIO_COUNT) - 1 + * ...without causing undefined behavior due to a shift beyond the width of the + * type + * + * If KBASE_JS_ATOM_SCHED_PRIO_COUNT starts requiring 32 bits, then it's worth + * moving to DECLARE_BITMAP() + */ +typedef u8 kbase_js_prio_bitmap_t; + +/* Ordering modification for kbase_js_atom_runs_before() */ +typedef u32 kbase_atom_ordering_flag_t; + +/* Atoms of the same context and priority should have their ordering decided by + * their seq_nr instead of their age. + * + * seq_nr is used as a more slowly changing variant of age - it increases once + * per group of related atoms, as determined by user-space. Hence, it can be + * used to limit re-ordering decisions (such as pre-emption) to only re-order + * between such groups, rather than re-order within those groups of atoms. 
+ */ +#define KBASE_ATOM_ORDERING_FLAG_SEQNR (((kbase_atom_ordering_flag_t)1) << 0) + /** * struct kbasep_js_device_data - KBase Device Data Job Scheduler sub-structure * @runpool_irq: Sub-structure to collect together Job Scheduling data used in @@ -393,4 +420,23 @@ struct kbasep_js_atom_retained_state { */ #define KBASEP_JS_TICK_RESOLUTION_US 1 +/** + * struct kbase_jsctx_slot_tracking - Job Scheduling tracking of a context's + * use of a job slot + * @blocked: bitmap of priorities that this slot is blocked at + * @atoms_pulled: counts of atoms that have been pulled from this slot, + * across all priority levels + * @atoms_pulled_pri: counts of atoms that have been pulled from this slot, per + * priority level + * + * Controls how a slot from the &struct kbase_context's jsctx_queue is managed, + * for example to ensure correct ordering of atoms when atoms of different + * priorities are unpulled. + */ +struct kbase_jsctx_slot_tracking { + kbase_js_prio_bitmap_t blocked; + atomic_t atoms_pulled; + int atoms_pulled_pri[KBASE_JS_ATOM_SCHED_PRIO_COUNT]; +}; + #endif /* _KBASE_JS_DEFS_H_ */ diff --git a/mali_kbase/mali_base_hwconfig_features.h b/mali_kbase/mali_base_hwconfig_features.h index 93cd05f..2e81cb1 100644 --- a/mali_kbase/mali_base_hwconfig_features.h +++ b/mali_kbase/mali_base_hwconfig_features.h @@ -28,26 +28,7 @@ #define _BASE_HWCONFIG_FEATURES_H_ enum base_hw_feature { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_TEST4_DATUM_MODE, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_WARPING, BASE_HW_FEATURE_FLUSH_REDUCTION, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, @@ -55,6 +36,7 @@ enum base_hw_feature { BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_ASN_HASH, + BASE_HW_FEATURE_GPU_SLEEP, BASE_HW_FEATURE_END }; @@ -63,240 +45,69 @@ static const enum base_hw_feature base_hw_features_generic[] = { }; static const enum base_hw_feature base_hw_features_tMIx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tHEx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - 
BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tSIx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tDVx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tNOx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tGOx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - 
BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tTRx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tNAx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tBEx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, @@ -304,27 +115,8 @@ static const enum base_hw_feature base_hw_features_tBEx[] = { }; static const enum base_hw_feature 
base_hw_features_tBAx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, @@ -332,27 +124,8 @@ static const enum base_hw_feature base_hw_features_tBAx[] = { }; static const enum base_hw_feature base_hw_features_tDUx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, @@ -360,85 +133,37 @@ static const enum base_hw_feature base_hw_features_tDUx[] = { }; static const enum base_hw_feature base_hw_features_tODx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tGRx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, 
BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tVAx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END }; +static const enum base_hw_feature base_hw_features_tTUx[] = { + BASE_HW_FEATURE_FLUSH_REDUCTION, + BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, + BASE_HW_FEATURE_L2_CONFIG, + BASE_HW_FEATURE_CLEAN_ONLY_SAFE, + BASE_HW_FEATURE_ASN_HASH, + BASE_HW_FEATURE_END +}; + #endif /* _BASE_HWCONFIG_FEATURES_H_ */ diff --git a/mali_kbase/mali_base_hwconfig_issues.h b/mali_kbase/mali_base_hwconfig_issues.h index beda1e4..d188120 100644 --- a/mali_kbase/mali_base_hwconfig_issues.h +++ b/mali_kbase/mali_base_hwconfig_issues.h @@ -59,6 +59,7 @@ enum base_hw_issue { BASE_HW_ISSUE_TTRX_3464, BASE_HW_ISSUE_TTRX_3485, BASE_HW_ISSUE_GPU2019_3212, + BASE_HW_ISSUE_TURSEHW_1997, BASE_HW_ISSUE_END }; @@ -637,5 +638,21 @@ static const enum base_hw_issue base_hw_issues_model_tVAx[] = { BASE_HW_ISSUE_END }; +static const enum base_hw_issue base_hw_issues_model_tTUx[] = { + BASE_HW_ISSUE_5736, + BASE_HW_ISSUE_9435, + BASE_HW_ISSUE_TSIX_2033, + BASE_HW_ISSUE_TTRX_1337, + BASE_HW_ISSUE_END +}; + +static const enum base_hw_issue base_hw_issues_tTUx_r0p0[] = { + BASE_HW_ISSUE_9435, + BASE_HW_ISSUE_TSIX_2033, + BASE_HW_ISSUE_TTRX_1337, + BASE_HW_ISSUE_TURSEHW_1997, + BASE_HW_ISSUE_END +}; + #endif /* _BASE_HWCONFIG_ISSUES_H_ */ diff --git a/mali_kbase/mali_kbase.h b/mali_kbase/mali_kbase.h index b4e50ae..6bcb754 100644 --- a/mali_kbase/mali_kbase.h +++ b/mali_kbase/mali_kbase.h @@ -491,6 +491,46 @@ void kbase_pm_metrics_start(struct kbase_device *kbdev); */ void kbase_pm_metrics_stop(struct kbase_device *kbdev); +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) +/** + * kbase_pm_handle_runtime_suspend - Handle the runtime suspend of GPU + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function is called from the runtime suspend callback function for + * saving the HW state and powering down GPU, if GPU was in sleep state mode. + * It does the following steps + * - Powers up the L2 cache and re-activates the MCU. + * - Suspend the CSGs + * - Halts the MCU + * - Powers down the L2 cache. + * - Invokes the power_off callback to power down the GPU. + * + * Return: 0 if the GPU was already powered down or no error was encountered + * in the power down, otherwise an error code. 
+ */ +int kbase_pm_handle_runtime_suspend(struct kbase_device *kbdev); + +/** + * kbase_pm_force_mcu_wakeup_after_sleep - Force the wake up of MCU from sleep + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function forces the wake up of MCU from sleep state and wait for + * MCU to become active. + * It usually gets called from the runtime suspend callback function. + * It also gets called from the GPU reset handler or at the time of system + * suspend or when User tries to terminate/suspend the on-slot group. + * + * Note: @gpu_wakeup_override flag that forces the reactivation of MCU is + * set by this function and it is the caller's responsibility to + * clear the flag. + * + * Return: 0 if the wake up was successful. + */ +int kbase_pm_force_mcu_wakeup_after_sleep(struct kbase_device *kbdev); +#endif + #if !MALI_USE_CSF /** * Return the atom's ID, as was originally supplied by userspace in @@ -498,7 +538,8 @@ void kbase_pm_metrics_stop(struct kbase_device *kbdev); * @kctx: KBase context pointer * @katom: Atome for which to return ID */ -static inline int kbase_jd_atom_id(struct kbase_context *kctx, struct kbase_jd_atom *katom) +static inline int kbase_jd_atom_id(struct kbase_context *kctx, + const struct kbase_jd_atom *katom) { int result; diff --git a/mali_kbase/mali_kbase_as_fault_debugfs.c b/mali_kbase/mali_kbase_as_fault_debugfs.c index 027eb8c..deb412c 100644 --- a/mali_kbase/mali_kbase_as_fault_debugfs.c +++ b/mali_kbase/mali_kbase_as_fault_debugfs.c @@ -93,7 +93,10 @@ void kbase_as_fault_debugfs_init(struct kbase_device *kbdev) debugfs_directory = debugfs_create_dir("address_spaces", kbdev->mali_debugfs_directory); - if (debugfs_directory) { + if (IS_ERR_OR_NULL(debugfs_directory)) { + dev_warn(kbdev->dev, + "unable to create address_spaces debugfs directory"); + } else { for (i = 0; i < kbdev->nr_hw_address_spaces; i++) { snprintf(as_name, ARRAY_SIZE(as_name), "as%u", i); debugfs_create_file(as_name, S_IRUGO, @@ -101,9 +104,6 @@ void kbase_as_fault_debugfs_init(struct kbase_device *kbdev) (void *)(uintptr_t)i, &as_fault_fops); } - } else { - dev_warn(kbdev->dev, - "unable to create address_spaces debugfs directory"); } #endif /* CONFIG_MALI_DEBUG */ diff --git a/mali_kbase/mali_kbase_config.h b/mali_kbase/mali_kbase_config.h index e7eb334..8b7ee13 100644 --- a/mali_kbase/mali_kbase_config.h +++ b/mali_kbase/mali_kbase_config.h @@ -170,6 +170,12 @@ struct kbase_pm_callback_conf { * the clocks to the GPU, or to completely power down the GPU. * The platform specific private pointer kbase_device::platform_context can be accessed and modified in here. It is the * platform \em callbacks responsibility to initialize and terminate this pointer if used (see @ref kbase_platform_funcs_conf). + * + * If runtime PM is enabled and @power_runtime_gpu_idle_callback is used + * then this callback should power off the GPU (or switch off the clocks + * to GPU) immediately. If @power_runtime_gpu_idle_callback is not used, + * then this callback can set the autosuspend timeout (if desired) and + * let the GPU be powered down later. */ void (*power_off_callback)(struct kbase_device *kbdev); @@ -289,6 +295,49 @@ struct kbase_pm_callback_conf { * be raised. On error, return the corresponding OS error code. */ int (*soft_reset_callback)(struct kbase_device *kbdev); + + /* + * Optional callback invoked after GPU becomes idle, not supported on + * JM GPUs. + * + * This callback will be invoked by the Kbase when GPU becomes idle. 
+ * For JM GPUs or when runtime PM is disabled, Kbase will not invoke + * this callback and @power_off_callback will be invoked directly. + * + * This callback is supposed to decrement the runtime PM core reference + * count to zero and trigger the auto-suspend timer, which implies that + * @power_off_callback shouldn't initiate the runtime suspend. + * + * GPU registers still remain accessible until @power_off_callback gets + * invoked later on the expiry of auto-suspend timer. + * + * Note: The Linux kernel must have CONFIG_PM_RUNTIME enabled to use + * this feature. + */ + void (*power_runtime_gpu_idle_callback)(struct kbase_device *kbdev); + + /* + * Optional callback invoked to change the runtime PM core state to + * active. + * + * This callback will be invoked by Kbase when GPU needs to be + * reactivated, but only if @power_runtime_gpu_idle_callback was invoked + * previously. So both @power_runtime_gpu_idle_callback and this + * callback needs to be implemented at the same time. + * + * Kbase will invoke @power_on_callback first before invoking this + * callback if the GPU was powered down previously, otherwise directly. + * + * This callback is supposed to increment the runtime PM core reference + * count to 1, which implies that @power_on_callback shouldn't initiate + * the runtime resume. The runtime resume may not happen synchronously + * to avoid a potential deadlock due to the runtime suspend happening + * simultaneously from some other thread. + * + * Note: The Linux kernel must have CONFIG_PM_RUNTIME enabled to use + * this feature. + */ + void (*power_runtime_gpu_active_callback)(struct kbase_device *kbdev); }; /* struct kbase_gpu_clk_notifier_data - Data for clock rate change notifier. diff --git a/mali_kbase/mali_kbase_config_defaults.h b/mali_kbase/mali_kbase_config_defaults.h index 63c36e2..8d64184 100644 --- a/mali_kbase/mali_kbase_config_defaults.h +++ b/mali_kbase/mali_kbase_config_defaults.h @@ -177,6 +177,19 @@ enum { */ #define DEFAULT_RESET_TIMEOUT_MS (3000) /* 3s */ +/* Waiting timeout for status change acknowledgment, in clock cycles + * Based on 3000ms timeout at nominal 100MHz, as is required for Android - based + * on scaling from a 50MHz GPU system. + */ +#define DEFAULT_REF_TIMEOUT_FREQ_KHZ (100000) +#define CSF_FIRMWARE_TIMEOUT_CYCLES (300000000) + +/* A default timeout to be used when an invalid timeout selector is + * used to retrieve the timeout, on JM GPUs. CSF GPUs use the Firmware + * timeout as the default. + */ +#define JM_DEFAULT_TIMEOUT_CYCLES (150000000) + /** * Default timeslice that a context is scheduled in for, in nanoseconds. 
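Editor's note: the two optional callbacks documented in the hunk above move runtime-PM usage counting out of the power_on/power_off pair. A minimal sketch of how a platform integration might pair them, assuming the standard Linux runtime-PM helpers and an autosuspend delay configured elsewhere; the example_* names are hypothetical and not part of this patch:

        #include <linux/pm_runtime.h>
        /* struct kbase_device and struct kbase_pm_callback_conf come from
         * mali_kbase_defs.h / mali_kbase_config.h in this driver.
         */

        static void example_runtime_gpu_idle(struct kbase_device *kbdev)
        {
                /* Drop the usage count taken by the active callback and arm
                 * the autosuspend timer; per the kernel-doc above, the GPU is
                 * actually powered off later, when the timer expires.
                 */
                pm_runtime_mark_last_busy(kbdev->dev);
                pm_runtime_put_autosuspend(kbdev->dev);
        }

        static void example_runtime_gpu_active(struct kbase_device *kbdev)
        {
                /* Asynchronous get: the resume runs from the PM workqueue, so
                 * this cannot deadlock against a runtime suspend in flight.
                 */
                pm_runtime_get(kbdev->dev);
        }

        static struct kbase_pm_callback_conf example_pm_callbacks = {
                .power_runtime_gpu_idle_callback   = example_runtime_gpu_idle,
                .power_runtime_gpu_active_callback = example_runtime_gpu_active,
                /* power_on/power_off and the other callbacks are omitted. */
        };

As the kernel-doc states, the idle and active callbacks must be implemented together or not at all.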
* diff --git a/mali_kbase/mali_kbase_core_linux.c b/mali_kbase/mali_kbase_core_linux.c index e7fc41e..2472c7c 100644 --- a/mali_kbase/mali_kbase_core_linux.c +++ b/mali_kbase/mali_kbase_core_linux.c @@ -53,6 +53,7 @@ #include "mali_kbase_hwcnt_context.h" #include "mali_kbase_hwcnt_virtualizer.h" #include "mali_kbase_hwcnt_legacy.h" +#include "mali_kbase_kinstr_prfcnt.h" #include "mali_kbase_vinstr.h" #if MALI_USE_CSF #include "csf/mali_kbase_csf_firmware.h" @@ -71,6 +72,9 @@ #endif #include "backend/gpu/mali_kbase_pm_internal.h" #include "mali_kbase_dvfs_debugfs.h" +#if IS_ENABLED(CONFIG_DEBUG_FS) +#include "mali_kbase_pbha_debugfs.h" +#endif #include <linux/module.h> #include <linux/init.h> @@ -403,6 +407,22 @@ static int kbase_api_handshake_dummy(struct kbase_file *kfile, return -EPERM; } +static int kbase_api_kinstr_prfcnt_enum_info( + struct kbase_file *kfile, + struct kbase_ioctl_kinstr_prfcnt_enum_info *prfcnt_enum_info) +{ + return kbase_kinstr_prfcnt_enum_info(kfile->kbdev->kinstr_prfcnt_ctx, + prfcnt_enum_info); +} + +static int kbase_api_kinstr_prfcnt_setup( + struct kbase_file *kfile, + union kbase_ioctl_kinstr_prfcnt_setup *prfcnt_setup) +{ + return kbase_kinstr_prfcnt_setup(kfile->kbdev->kinstr_prfcnt_ctx, + prfcnt_setup); +} + static struct kbase_device *to_kbase_device(struct device *dev) { return dev_get_drvdata(dev); @@ -808,16 +828,13 @@ static int kbase_api_mem_alloc(struct kbase_context *kctx, u64 flags = alloc->in.flags; u64 gpu_va; - rcu_read_lock(); - /* Don't allow memory allocation until user space has set up the - * tracking page (which sets kctx->process_mm). Also catches when we've - * forked. + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. */ - if (rcu_dereference(kctx->process_mm) != current->mm) { - rcu_read_unlock(); + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + + if (!kbase_mem_allow_alloc(kctx)) return -EINVAL; - } - rcu_read_unlock(); if (flags & BASEP_MEM_FLAGS_KERNEL_ONLY) return -ENOMEM; @@ -849,7 +866,8 @@ static int kbase_api_mem_alloc(struct kbase_context *kctx, #endif reg = kbase_mem_alloc(kctx, alloc->in.va_pages, alloc->in.commit_pages, - alloc->in.extension, &flags, &gpu_va); + alloc->in.extension, &flags, &gpu_va, + mmu_sync_info); if (!reg) return -ENOMEM; @@ -1643,6 +1661,20 @@ static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct kbase_ioctl_set_flags, kfile); break; + + case KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO: + KBASE_HANDLE_IOCTL_INOUT( + KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO, + kbase_api_kinstr_prfcnt_enum_info, + struct kbase_ioctl_kinstr_prfcnt_enum_info, kfile); + break; + + case KBASE_IOCTL_KINSTR_PRFCNT_SETUP: + KBASE_HANDLE_IOCTL_INOUT(KBASE_IOCTL_KINSTR_PRFCNT_SETUP, + kbase_api_kinstr_prfcnt_setup, + union kbase_ioctl_kinstr_prfcnt_setup, + kfile); + break; } kctx = kbase_file_get_kctx_if_setup_complete(kfile); @@ -3097,6 +3129,10 @@ static ssize_t kbase_show_gpuinfo(struct device *dev, .name = "Mali-G510" }, { .id = GPU_ID2_PRODUCT_TVAX >> GPU_ID_VERSION_PRODUCT_ID_SHIFT, .name = "Mali-G310" }, + { .id = GPU_ID2_PRODUCT_TTUX >> GPU_ID_VERSION_PRODUCT_ID_SHIFT, + .name = "Mali-TTUX" }, + { .id = GPU_ID2_PRODUCT_LTUX >> GPU_ID_VERSION_PRODUCT_ID_SHIFT, + .name = "Mali-LTUX" }, }; const char *product_name = "(Unknown Mali GPU)"; struct kbase_device *kbdev; @@ -4574,25 +4610,31 @@ MAKE_QUIRK_ACCESSORS(tiler); MAKE_QUIRK_ACCESSORS(mmu); MAKE_QUIRK_ACCESSORS(gpu); -static ssize_t kbase_device_debugfs_reset_write(struct file *file, 
- const char __user *ubuf, size_t count, loff_t *ppos) +/** + * kbase_device_debugfs_reset_write() - Reset the GPU + * + * @data: Pointer to the Kbase device. + * @wait_for_reset: Value written to the file. + * + * This function will perform the GPU reset, and if the value written to + * the file is 1 it will also wait for the reset to complete. + * + * Return: 0 in case of no error otherwise a negative value. + */ +static int kbase_device_debugfs_reset_write(void *data, u64 wait_for_reset) { - struct kbase_device *kbdev = file->private_data; - CSTD_UNUSED(ubuf); - CSTD_UNUSED(count); - CSTD_UNUSED(ppos); + struct kbase_device *kbdev = data; trigger_reset(kbdev); - return count; + if (wait_for_reset == 1) + return kbase_reset_gpu_wait(kbdev); + + return 0; } -static const struct file_operations fops_trigger_reset = { - .owner = THIS_MODULE, - .open = simple_open, - .write = kbase_device_debugfs_reset_write, - .llseek = default_llseek, -}; +DEFINE_SIMPLE_ATTRIBUTE(fops_trigger_reset, + NULL, &kbase_device_debugfs_reset_write, "%llu\n"); /** * debugfs_protected_debug_mode_read - "protected_debug_mode" debugfs read @@ -4692,7 +4734,7 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) kbdev->mali_debugfs_directory = debugfs_create_dir(kbdev->devname, NULL); - if (!kbdev->mali_debugfs_directory) { + if (IS_ERR_OR_NULL(kbdev->mali_debugfs_directory)) { dev_err(kbdev->dev, "Couldn't create mali debugfs directory: %s\n", kbdev->devname); @@ -4702,7 +4744,7 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) kbdev->debugfs_ctx_directory = debugfs_create_dir("ctx", kbdev->mali_debugfs_directory); - if (!kbdev->debugfs_ctx_directory) { + if (IS_ERR_OR_NULL(kbdev->debugfs_ctx_directory)) { dev_err(kbdev->dev, "Couldn't create mali debugfs ctx directory\n"); err = -ENOMEM; goto out; @@ -4710,7 +4752,7 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) kbdev->debugfs_instr_directory = debugfs_create_dir("instrumentation", kbdev->mali_debugfs_directory); - if (!kbdev->debugfs_instr_directory) { + if (IS_ERR_OR_NULL(kbdev->debugfs_instr_directory)) { dev_err(kbdev->dev, "Couldn't create mali debugfs instrumentation directory\n"); err = -ENOMEM; goto out; @@ -4718,7 +4760,7 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) debugfs_ctx_defaults_directory = debugfs_create_dir("defaults", kbdev->debugfs_ctx_directory); - if (!debugfs_ctx_defaults_directory) { + if (IS_ERR_OR_NULL(debugfs_ctx_defaults_directory)) { dev_err(kbdev->dev, "Couldn't create mali debugfs ctx defaults directory\n"); err = -ENOMEM; goto out; @@ -4735,6 +4777,8 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) #ifdef CONFIG_MALI_PRFCNT_SET_SELECT_VIA_DEBUG_FS kbase_instr_backend_debugfs_init(kbdev); #endif + kbase_pbha_debugfs_init(kbdev); + /* fops_* variables created by invocations of macro * MAKE_QUIRK_ACCESSORS() above. 
*/ @@ -5293,11 +5337,19 @@ static int kbase_device_resume(struct device *dev) static int kbase_device_runtime_suspend(struct device *dev) { struct kbase_device *kbdev = to_kbase_device(dev); + int ret = 0; if (!kbdev) return -ENODEV; dev_dbg(dev, "Callback %s\n", __func__); + KBASE_KTRACE_ADD(kbdev, PM_RUNTIME_SUSPEND_CALLBACK, NULL, 0); + +#if MALI_USE_CSF + ret = kbase_pm_handle_runtime_suspend(kbdev); + if (ret) + return ret; +#endif #ifdef CONFIG_MALI_MIDGARD_DVFS kbase_pm_metrics_stop(kbdev); @@ -5312,7 +5364,7 @@ static int kbase_device_runtime_suspend(struct device *dev) kbdev->pm.backend.callback_power_runtime_off(kbdev); dev_dbg(dev, "runtime suspend\n"); } - return 0; + return ret; } #endif /* KBASE_PM_RUNTIME */ @@ -5336,6 +5388,7 @@ static int kbase_device_runtime_resume(struct device *dev) return -ENODEV; dev_dbg(dev, "Callback %s\n", __func__); + KBASE_KTRACE_ADD(kbdev, PM_RUNTIME_RESUME_CALLBACK, NULL, 0); if (kbdev->pm.backend.callback_power_runtime_on) { ret = kbdev->pm.backend.callback_power_runtime_on(kbdev); dev_dbg(dev, "runtime resume\n"); diff --git a/mali_kbase/mali_kbase_defs.h b/mali_kbase/mali_kbase_defs.h index 146695c..5b1fdd3 100644 --- a/mali_kbase/mali_kbase_defs.h +++ b/mali_kbase/mali_kbase_defs.h @@ -71,10 +71,6 @@ #include <linux/regulator/consumer.h> #include <linux/memory_group_manager.h> -#if defined(CONFIG_PM_RUNTIME) || defined(CONFIG_PM) -#define KBASE_PM_RUNTIME 1 -#endif - #include "debug/mali_kbase_debug_ktrace_defs.h" /** Number of milliseconds before we time out on a GPU soft/hard reset */ @@ -111,12 +107,12 @@ /** * Maximum size in bytes of a MMU lock region, as a logarithm */ -#define KBASE_LOCK_REGION_MAX_SIZE_LOG2 (64) +#define KBASE_LOCK_REGION_MAX_SIZE_LOG2 (48) /* 256 TB */ /** * Minimum size in bytes of a MMU lock region, as a logarithm */ -#define KBASE_LOCK_REGION_MIN_SIZE_LOG2 (15) +#define KBASE_LOCK_REGION_MIN_SIZE_LOG2 (15) /* 32 kB */ /** * Maximum number of GPU memory region zones @@ -269,6 +265,21 @@ struct kbase_mmu_table { struct kbase_context *kctx; }; +/** + * struct kbase_reg_zone - Information about GPU memory region zones + * @base_pfn: Page Frame Number in GPU virtual address space for the start of + * the Zone + * @va_size_pages: Size of the Zone in pages + * + * Track information about a zone KBASE_REG_ZONE() and related macros. + * In future, this could also store the &rb_root that are currently in + * &kbase_context and &kbase_csf_device. + */ +struct kbase_reg_zone { + u64 base_pfn; + u64 va_size_pages; +}; + #if MALI_USE_CSF #include "csf/mali_kbase_csf_defs.h" #else @@ -363,6 +374,12 @@ struct kbase_clk_rate_trace_manager { * that some code paths keep shaders/the tiler powered whilst this is 0. * Use kbase_pm_is_active() instead to check for such cases. * @suspending: Flag indicating suspending/suspended + * @runtime_active: Flag to track if the GPU is in runtime suspended or active + * state. This ensures that runtime_put and runtime_get + * functions are called in pairs. For example if runtime_get + * has already been called from the power_on callback, then + * the call to it from runtime_gpu_active callback can be + * skipped. * @gpu_lost: Flag indicating gpu lost * This structure contains data for the power management framework. There * is one instance of this structure per device in the system. 
@@ -388,6 +405,9 @@ struct kbase_pm_device_data { struct mutex lock; int active_count; bool suspending; +#if MALI_USE_CSF + bool runtime_active; +#endif #ifdef CONFIG_MALI_ARBITER_SUPPORT atomic_t gpu_lost; #endif /* CONFIG_MALI_ARBITER_SUPPORT */ @@ -529,8 +549,11 @@ struct kbase_devfreq_opp { * @entry_set_ate: program the pte to be a valid address translation entry to * encode the physical address of the actual page being mapped. * @entry_set_pte: program the pte to be a valid entry to encode the physical - * address of the next lower level page table. + * address of the next lower level page table and also update + * the number of valid entries. * @entry_invalidate: clear out or invalidate the pte. + * @get_num_valid_entries: returns the number of valid entries for a specific pgd. + * @set_num_valid_entries: sets the number of valid entries for a specific pgd * @flags: bitmask of MMU mode flags. Refer to KBASE_MMU_MODE_ constants. */ struct kbase_mmu_mode { @@ -545,8 +568,11 @@ struct kbase_mmu_mode { int (*pte_is_valid)(u64 pte, int level); void (*entry_set_ate)(u64 *entry, struct tagged_addr phy, unsigned long flags, int level); - void (*entry_set_pte)(u64 *entry, phys_addr_t phy); + void (*entry_set_pte)(u64 *pgd, u64 vpfn, phys_addr_t phy); void (*entry_invalidate)(u64 *entry); + unsigned int (*get_num_valid_entries)(u64 *pgd); + void (*set_num_valid_entries)(u64 *pgd, + unsigned int num_of_valid_entries); unsigned long flags; }; @@ -722,6 +748,7 @@ struct kbase_process { * kbase_hwcnt_context_enable() with @hwcnt_gpu_ctx. * @hwcnt_gpu_virt: Virtualizer for GPU hardware counters. * @vinstr_ctx: vinstr context created per device. + * @kinstr_prfcnt_ctx: kinstr_prfcnt context created per device. * @timeline_flags: Bitmask defining which sets of timeline tracepoints * are enabled. If zero, there is no timeline client and * therefore timeline is disabled. @@ -738,6 +765,8 @@ struct kbase_process { * @reset_timeout_ms: Number of milliseconds to wait for the soft stop to * complete for the GPU jobs before proceeding with the * GPU reset. + * @lowest_gpu_freq_khz: Lowest frequency in KHz that the GPU can run at. Used + * to calculate suitable timeouts for wait operations. * @cache_clean_in_progress: Set when a cache clean has been started, and * cleared when it has finished. This prevents multiple * cache cleans being done simultaneously. @@ -752,8 +781,6 @@ struct kbase_process { * including any contexts that might be created for * hardware counters. * @kctx_list_lock: Lock protecting concurrent accesses to @kctx_list. - * @group_max_uid_in_devices: Max value of any queue group UID in any kernel - * context in the kbase device. * @devfreq_profile: Describes devfreq profile for the Mali GPU device, passed * to devfreq_add_device() to add devfreq feature to Mali * GPU device. @@ -891,6 +918,10 @@ struct kbase_process { * @l2_hash_override: Used to set L2 cache hash via device tree blob * @l2_hash_values_override: true if @l2_hash_values is valid. * @l2_hash_values: Used to set L2 asn_hash via device tree blob + * @sysc_alloc: Array containing values to be programmed into + * SYSC_ALLOC[0..7] GPU registers on L2 cache + * power down. These come from either DTB or + * via DebugFS (if it is available in kernel). * @process_root: rb_tree root node for maintaining a rb_tree of * kbase_process based on key tgid(thread group ID). 
* @dma_buf_root: rb_tree root node for maintaining a rb_tree of @@ -993,6 +1024,7 @@ struct kbase_device { struct kbase_hwcnt_context *hwcnt_gpu_ctx; struct kbase_hwcnt_virtualizer *hwcnt_gpu_virt; struct kbase_vinstr_context *vinstr_ctx; + struct kbase_kinstr_prfcnt_context *kinstr_prfcnt_ctx; atomic_t timeline_flags; struct kbase_timeline *timeline; @@ -1002,6 +1034,8 @@ struct kbase_device { #endif u32 reset_timeout_ms; + u64 lowest_gpu_freq_khz; + bool cache_clean_in_progress; bool cache_clean_queued; wait_queue_head_t cache_clean_wait; @@ -1010,7 +1044,6 @@ struct kbase_device { struct list_head kctx_list; struct mutex kctx_list_lock; - atomic_t group_max_uid_in_devices; #ifdef CONFIG_MALI_DEVFREQ struct devfreq_dev_profile devfreq_profile; @@ -1129,6 +1162,8 @@ struct kbase_device { bool l2_hash_values_override; u32 l2_hash_values[ASN_HASH_COUNT]; + u32 sysc_alloc[SYSC_ALLOC_COUNT]; + struct mutex fw_load_lock; #if MALI_USE_CSF /* CSF object for the GPU device. */ @@ -1396,21 +1431,6 @@ struct kbase_sub_alloc { }; /** - * struct kbase_reg_zone - Information about GPU memory region zones - * @base_pfn: Page Frame Number in GPU virtual address space for the start of - * the Zone - * @va_size_pages: Size of the Zone in pages - * - * Track information about a zone KBASE_REG_ZONE() and related macros. - * In future, this could also store the &rb_root that are currently in - * &kbase_context - */ -struct kbase_reg_zone { - u64 base_pfn; - u64 va_size_pages; -}; - -/** * struct kbase_context - Kernel base context * * @filp: Pointer to the struct file corresponding to device file @@ -1561,17 +1581,10 @@ struct kbase_reg_zone { * of RB-tree holding currently runnable atoms on the job slot * and the head item of the linked list of atoms blocked on * cross-slot dependencies. - * @atoms_pulled: Total number of atoms currently pulled from the context. - * @atoms_pulled_slot: Per slot count of the number of atoms currently pulled - * from the context. - * @atoms_pulled_slot_pri: Per slot & priority count of the number of atoms currently - * pulled from the context. hwaccess_lock shall be held when - * accessing it. - * @blocked_js: Indicates if the context is blocked from submitting atoms - * on a slot at a given priority. This is set to true, when - * the atom corresponding to context is soft/hard stopped or - * removed from the HEAD_NEXT register in response to - * soft/hard stop. + * @slot_tracking: Tracking and control of this context's use of all job + * slots + * @atoms_pulled_all_slots: Total number of atoms currently pulled from the + * context, across all slots. * @slots_pullable: Bitmask of slots, indicating the slots for which the * context has pullable atoms in the runnable tree. * @work: Work structure used for deferred ASID assignment. 
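Editor's note: the new @lowest_gpu_freq_khz field documented above works together with the cycle-based timeout macros added to mali_kbase_config_defaults.h earlier in this patch and with the kbase_get_timeout_ms() interface declared further down. A sketch of the scaling arithmetic only, under the assumption that DEFAULT_REF_TIMEOUT_FREQ_KHZ from this patch is in scope; the helper name is made up and this is not the driver's literal implementation:

        #include <linux/math64.h>

        /* A frequency in kHz is exactly the number of GPU cycles per
         * millisecond, so dividing a cycle budget by the lowest possible
         * frequency yields a timeout that can only err on the long side.
         */
        static unsigned int example_timeout_cycles_to_ms(u64 timeout_cycles,
                                                         u32 lowest_freq_khz)
        {
                u32 freq_khz = lowest_freq_khz;

                if (!freq_khz)
                        freq_khz = DEFAULT_REF_TIMEOUT_FREQ_KHZ; /* 100 MHz fallback */

                return (unsigned int)div_u64(timeout_cycles, freq_khz);
        }

With this arithmetic, CSF_FIRMWARE_TIMEOUT_CYCLES (300000000) at the 100000 kHz reference frequency gives back the 3000 ms Android requirement quoted in mali_kbase_config_defaults.h.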
@@ -1717,17 +1730,14 @@ struct kbase_context { struct kbase_jd_context jctx; struct jsctx_queue jsctx_queue [KBASE_JS_ATOM_SCHED_PRIO_COUNT][BASE_JM_MAX_NR_SLOTS]; + struct kbase_jsctx_slot_tracking slot_tracking[BASE_JM_MAX_NR_SLOTS]; + atomic_t atoms_pulled_all_slots; struct list_head completed_jobs; atomic_t work_count; struct timer_list soft_job_timeout; - atomic_t atoms_pulled; - atomic_t atoms_pulled_slot[BASE_JM_MAX_NR_SLOTS]; - int atoms_pulled_slot_pri[BASE_JM_MAX_NR_SLOTS][ - KBASE_JS_ATOM_SCHED_PRIO_COUNT]; int priority; - bool blocked_js[BASE_JM_MAX_NR_SLOTS][KBASE_JS_ATOM_SCHED_PRIO_COUNT]; s16 atoms_count[KBASE_JS_ATOM_SCHED_PRIO_COUNT]; u32 slots_pullable; u32 age_count; @@ -1888,6 +1898,13 @@ enum kbase_share_attr_bits { }; /** + * enum kbase_timeout_selector - The choice of which timeout to get scaled + * using current GPU frequency. + * @CSF_FIRMWARE_TIMEOUT: Response timeout from CSF firmware. + */ +enum kbase_timeout_selector { CSF_FIRMWARE_TIMEOUT }; + +/** * kbase_device_is_cpu_coherent - Returns if the device is CPU coherent. * @kbdev: kbase device * diff --git a/mali_kbase/mali_kbase_dma_fence.c b/mali_kbase/mali_kbase_dma_fence.c index 69ff8cc..bf2d9cc 100644 --- a/mali_kbase/mali_kbase_dma_fence.c +++ b/mali_kbase/mali_kbase_dma_fence.c @@ -249,8 +249,10 @@ kbase_dma_fence_add_reservation_callback(struct kbase_jd_atom *katom, #if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) err = reservation_object_get_fences_rcu( -#else +#elif (KERNEL_VERSION(5, 14, 0) > LINUX_VERSION_CODE) err = dma_resv_get_fences_rcu( +#else + err = dma_resv_get_fences( #endif resv, &excl_fence, diff --git a/mali_kbase/mali_kbase_dummy_job_wa.c b/mali_kbase/mali_kbase_dummy_job_wa.c index 1e91ba0..bdc5d6d 100644 --- a/mali_kbase/mali_kbase_dummy_job_wa.c +++ b/mali_kbase/mali_kbase_dummy_job_wa.c @@ -281,6 +281,11 @@ int kbase_dummy_job_wa_load(struct kbase_device *kbdev) int err; struct kbase_context *kctx; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + lockdep_assert_held(&kbdev->fw_load_lock); if (!wa_blob_load_needed(kbdev)) @@ -375,8 +380,8 @@ int kbase_dummy_job_wa_load(struct kbase_device *kbdev) nr_pages = PFN_UP(blob->size); flags = blob->map_flags | BASE_MEM_FLAG_MAP_FIXED; - va_region = kbase_mem_alloc(kctx, nr_pages, nr_pages, - 0, &flags, &gpu_va); + va_region = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, + &gpu_va, mmu_sync_info); if (!va_region) { dev_err(kbdev->dev, "Failed to allocate for blob\n"); diff --git a/mali_kbase/mali_kbase_gpuprops.c b/mali_kbase/mali_kbase_gpuprops.c index e4d52c9..967c08e 100644 --- a/mali_kbase/mali_kbase_gpuprops.c +++ b/mali_kbase/mali_kbase_gpuprops.c @@ -661,6 +661,19 @@ int kbase_gpuprops_update_l2_features(struct kbase_device *kbdev) dev_info(kbdev->dev, "Reflected L2_CONFIG is 0x%08x\n", regdump.l2_config); + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_ASN_HASH)) { + int idx; + const bool asn_he = regdump.l2_config & + L2_CONFIG_ASN_HASH_ENABLE_MASK; + if (!asn_he && kbdev->l2_hash_values_override) + dev_err(kbdev->dev, + "Failed to use requested ASN_HASH, fallback to default"); + for (idx = 0; idx < ASN_HASH_COUNT; idx++) + dev_info(kbdev->dev, + "%s ASN_HASH[%d] is [0x%08x]\n", + asn_he ? 
"Overridden" : "Default", idx, + regdump.l2_asn_hash[idx]); + } /* Update gpuprops with reflected L2_FEATURES */ gpu_props->raw_props.l2_features = regdump.l2_features; diff --git a/mali_kbase/mali_kbase_gpuprops_types.h b/mali_kbase/mali_kbase_gpuprops_types.h index 02705a0..67a4d7d 100644 --- a/mali_kbase/mali_kbase_gpuprops_types.h +++ b/mali_kbase/mali_kbase_gpuprops_types.h @@ -35,6 +35,7 @@ struct kbase_gpuprops_regdump { u32 gpu_id; u32 l2_features; u32 l2_config; + u32 l2_asn_hash[ASN_HASH_COUNT]; u32 core_features; u32 tiler_features; u32 mem_features; diff --git a/mali_kbase/mali_kbase_hw.c b/mali_kbase/mali_kbase_hw.c index 7ad583c..183fd18 100644 --- a/mali_kbase/mali_kbase_hw.c +++ b/mali_kbase/mali_kbase_hw.c @@ -81,6 +81,10 @@ void kbase_hw_set_features_mask(struct kbase_device *kbdev) case GPU_ID2_PRODUCT_TVAX: features = base_hw_features_tVAx; break; + case GPU_ID2_PRODUCT_TTUX: + case GPU_ID2_PRODUCT_LTUX: + features = base_hw_features_tTUx; + break; default: features = base_hw_features_generic; break; @@ -225,6 +229,15 @@ static const enum base_hw_issue *kbase_hw_get_issues_for_new_id( { GPU_ID2_PRODUCT_TVAX, { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tVAx_r0p0 }, { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TTUX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tTUx_r0p0 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_LTUX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tTUx_r0p0 }, + { U32_MAX, NULL } } }, + }; u32 gpu_id = kbdev->gpu_props.props.raw_props.gpu_id; @@ -380,6 +393,11 @@ int kbase_hw_set_issues_mask(struct kbase_device *kbdev) case GPU_ID2_PRODUCT_TVAX: issues = base_hw_issues_model_tVAx; break; + case GPU_ID2_PRODUCT_TTUX: + case GPU_ID2_PRODUCT_LTUX: + issues = base_hw_issues_model_tTUx; + break; + default: dev_err(kbdev->dev, "Unknown GPU ID %x", gpu_id); diff --git a/mali_kbase/mali_kbase_hwaccess_time.h b/mali_kbase/mali_kbase_hwaccess_time.h index 8a4ece4..27e2cb7 100644 --- a/mali_kbase/mali_kbase_hwaccess_time.h +++ b/mali_kbase/mali_kbase_hwaccess_time.h @@ -48,3 +48,25 @@ void kbase_backend_get_gpu_time_norequest(struct kbase_device *kbdev, struct timespec64 *ts); #endif /* _KBASE_BACKEND_TIME_H_ */ + +/** + * kbase_get_timeout_ms - Choose a timeout value to get a timeout scaled + * GPU frequency, using a choice from + * kbase_timeout_selector. + * + * @kbdev: KBase device pointer. + * @selector: Value from kbase_scaled_timeout_selector enum. + * + * Return: Timeout in milliseconds, as an unsigned integer. + */ +unsigned int kbase_get_timeout_ms(struct kbase_device *kbdev, + enum kbase_timeout_selector selector); + +/** + * kbase_backend_get_cycle_cnt - Reads the GPU cycle counter + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * + * Return: Snapshot of the GPU cycle count register. + */ +u64 kbase_backend_get_cycle_cnt(struct kbase_device *kbdev); diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf.c b/mali_kbase/mali_kbase_hwcnt_backend_csf.c index 58b5e72..7ba1671 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_csf.c +++ b/mali_kbase/mali_kbase_hwcnt_backend_csf.c @@ -157,19 +157,20 @@ struct kbase_hwcnt_backend_csf_info { * @shader_cnt: Shader Core block count. * @block_cnt: Total block count (sum of all other block counts). * @shader_avail_mask: Bitmap of all shader cores in the system. - * @offset_enable_mask: Offset of enable mask in the block. + * @enable_mask_offset: Offset in array elements of enable mask in each block + * starting from the beginning of block. 
* @headers_per_block: Header size per block. * @counters_per_block: Counters size per block. * @values_per_block: Total size per block. */ struct kbase_hwcnt_csf_physical_layout { - size_t fe_cnt; - size_t tiler_cnt; - size_t mmu_l2_cnt; - size_t shader_cnt; - size_t block_cnt; + u8 fe_cnt; + u8 tiler_cnt; + u8 mmu_l2_cnt; + u8 shader_cnt; + u8 block_cnt; u64 shader_avail_mask; - size_t offset_enable_mask; + size_t enable_mask_offset; size_t headers_per_block; size_t counters_per_block; size_t values_per_block; @@ -184,11 +185,13 @@ struct kbase_hwcnt_csf_physical_layout { * to accumulate up to. * @enable_state_waitq: Wait queue object used to notify the enable * changing flag is done. - * @to_user_buf: HWC sample buffer for client user. + * @to_user_buf: HWC sample buffer for client user, size + * metadata.dump_buf_bytes. * @accum_buf: HWC sample buffer used as an internal - * accumulator. + * accumulator, size metadata.dump_buf_bytes. * @old_sample_buf: HWC sample buffer to save the previous values - * for delta calculation. + * for delta calculation, size + * prfcnt_info.dump_bytes. * @ring_buf: Opaque pointer for ring buffer object. * @ring_buf_cpu_base: CPU base address of the allocated ring buffer. * @clk_enable_map: The enable map specifying enabled clock domains. @@ -213,8 +216,8 @@ struct kbase_hwcnt_backend_csf { enum kbase_hwcnt_backend_csf_enable_state enable_state; u32 insert_index_to_accumulate; wait_queue_head_t enable_state_waitq; - u32 *to_user_buf; - u32 *accum_buf; + u64 *to_user_buf; + u64 *accum_buf; u32 *old_sample_buf; struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf; void *ring_buf_cpu_base; @@ -333,34 +336,40 @@ static void kbasep_hwcnt_backend_csf_init_layout( const struct kbase_hwcnt_backend_csf_if_prfcnt_info *prfcnt_info, struct kbase_hwcnt_csf_physical_layout *phys_layout) { + u8 shader_core_cnt; + size_t values_per_block; + WARN_ON(!prfcnt_info); WARN_ON(!phys_layout); - phys_layout->fe_cnt = 1; - phys_layout->tiler_cnt = 1; - phys_layout->mmu_l2_cnt = prfcnt_info->l2_count; - phys_layout->shader_cnt = fls64(prfcnt_info->core_mask); - phys_layout->block_cnt = phys_layout->fe_cnt + phys_layout->tiler_cnt + - phys_layout->mmu_l2_cnt + - phys_layout->shader_cnt; - - phys_layout->shader_avail_mask = prfcnt_info->core_mask; - - phys_layout->headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; - phys_layout->values_per_block = - prfcnt_info->prfcnt_block_size / KBASE_HWCNT_VALUE_BYTES; - phys_layout->counters_per_block = - phys_layout->values_per_block - phys_layout->headers_per_block; - phys_layout->offset_enable_mask = KBASE_HWCNT_V5_PRFCNT_EN_HEADER; + shader_core_cnt = fls64(prfcnt_info->core_mask); + values_per_block = + prfcnt_info->prfcnt_block_size / KBASE_HWCNT_VALUE_HW_BYTES; + + *phys_layout = (struct kbase_hwcnt_csf_physical_layout){ + .fe_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT, + .tiler_cnt = KBASE_HWCNT_V5_TILER_BLOCK_COUNT, + .mmu_l2_cnt = prfcnt_info->l2_count, + .shader_cnt = shader_core_cnt, + .block_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT + + KBASE_HWCNT_V5_TILER_BLOCK_COUNT + + prfcnt_info->l2_count + shader_core_cnt, + .shader_avail_mask = prfcnt_info->core_mask, + .headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK, + .values_per_block = values_per_block, + .counters_per_block = + values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK, + .enable_mask_offset = KBASE_HWCNT_V5_PRFCNT_EN_HEADER, + }; } static void kbasep_hwcnt_backend_csf_reset_internal_buffers( struct kbase_hwcnt_backend_csf *backend_csf) { - memset(backend_csf->to_user_buf, 0, - 
backend_csf->info->prfcnt_info.dump_bytes); - memset(backend_csf->accum_buf, 0, - backend_csf->info->prfcnt_info.dump_bytes); + size_t user_buf_bytes = backend_csf->info->metadata->dump_buf_bytes; + + memset(backend_csf->to_user_buf, 0, user_buf_bytes); + memset(backend_csf->accum_buf, 0, user_buf_bytes); memset(backend_csf->old_sample_buf, 0, backend_csf->info->prfcnt_info.dump_bytes); } @@ -376,7 +385,7 @@ static void kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header( for (block_idx = 0; block_idx < phys_layout->block_cnt; block_idx++) { block_buf = sample + block_idx * phys_layout->values_per_block; - block_buf[phys_layout->offset_enable_mask] = 0; + block_buf[phys_layout->enable_mask_offset] = 0; } } @@ -400,33 +409,35 @@ static void kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header( static void kbasep_hwcnt_backend_csf_update_user_sample( struct kbase_hwcnt_backend_csf *backend_csf) { + size_t user_buf_bytes = backend_csf->info->metadata->dump_buf_bytes; + /* Copy the data into the sample and wait for the user to get it. */ memcpy(backend_csf->to_user_buf, backend_csf->accum_buf, - backend_csf->info->prfcnt_info.dump_bytes); + user_buf_bytes); /* After copied data into user sample, clear the accumulator values to * prepare for the next accumulator, such as the next request or * threshold. */ - memset(backend_csf->accum_buf, 0, - backend_csf->info->prfcnt_info.dump_bytes); + memset(backend_csf->accum_buf, 0, user_buf_bytes); } static void kbasep_hwcnt_backend_csf_accumulate_sample( const struct kbase_hwcnt_csf_physical_layout *phys_layout, - size_t dump_bytes, u32 *accum_buf, const u32 *old_sample_buf, + size_t dump_bytes, u64 *accum_buf, const u32 *old_sample_buf, const u32 *new_sample_buf, bool clearing_samples) { - size_t block_idx, ctr_idx; + size_t block_idx; const u32 *old_block = old_sample_buf; const u32 *new_block = new_sample_buf; - u32 *acc_block = accum_buf; + u64 *acc_block = accum_buf; + const size_t values_per_block = phys_layout->values_per_block; for (block_idx = 0; block_idx < phys_layout->block_cnt; block_idx++) { const u32 old_enable_mask = - old_block[phys_layout->offset_enable_mask]; + old_block[phys_layout->enable_mask_offset]; const u32 new_enable_mask = - new_block[phys_layout->offset_enable_mask]; + new_block[phys_layout->enable_mask_offset]; if (new_enable_mask == 0) { /* Hardware block was unavailable or we didn't turn on @@ -436,11 +447,14 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( /* Hardware block was available and it had some counters * enabled. We need to update the accumulation buffer. */ + size_t ctr_idx; /* Unconditionally copy the headers. 
*/ - memcpy(acc_block, new_block, - phys_layout->headers_per_block * - KBASE_HWCNT_VALUE_BYTES); + for (ctr_idx = 0; + ctr_idx < phys_layout->headers_per_block; + ctr_idx++) { + acc_block[ctr_idx] = new_block[ctr_idx]; + } /* Accumulate counter samples * @@ -470,8 +484,7 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( for (ctr_idx = phys_layout ->headers_per_block; - ctr_idx < - phys_layout->values_per_block; + ctr_idx < values_per_block; ctr_idx++) { acc_block[ctr_idx] += new_block[ctr_idx]; @@ -484,8 +497,7 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( for (ctr_idx = phys_layout ->headers_per_block; - ctr_idx < - phys_layout->values_per_block; + ctr_idx < values_per_block; ctr_idx++) { acc_block[ctr_idx] += new_block[ctr_idx] - @@ -494,23 +506,23 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( } } else { for (ctr_idx = phys_layout->headers_per_block; - ctr_idx < phys_layout->values_per_block; - ctr_idx++) { + ctr_idx < values_per_block; ctr_idx++) { acc_block[ctr_idx] += new_block[ctr_idx]; } } } - old_block += phys_layout->values_per_block; - new_block += phys_layout->values_per_block; - acc_block += phys_layout->values_per_block; + old_block += values_per_block; + new_block += values_per_block; + acc_block += values_per_block; } WARN_ON(old_block != - old_sample_buf + dump_bytes / KBASE_HWCNT_VALUE_BYTES); + old_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); WARN_ON(new_block != - new_sample_buf + dump_bytes / KBASE_HWCNT_VALUE_BYTES); - WARN_ON(acc_block != accum_buf + dump_bytes / KBASE_HWCNT_VALUE_BYTES); + new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); + WARN_ON(acc_block != + accum_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); (void)dump_bytes; } @@ -1218,7 +1230,7 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, &backend_csf->phys_layout); backend_csf->accum_buf = - kzalloc(csf_info->prfcnt_info.dump_bytes, GFP_KERNEL); + kzalloc(csf_info->metadata->dump_buf_bytes, GFP_KERNEL); if (!backend_csf->accum_buf) goto err_alloc_acc_buf; @@ -1228,7 +1240,7 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, goto err_alloc_pre_sample_buf; backend_csf->to_user_buf = - kzalloc(csf_info->prfcnt_info.dump_bytes, GFP_KERNEL); + kzalloc(csf_info->metadata->dump_buf_bytes, GFP_KERNEL); if (!backend_csf->to_user_buf) goto err_alloc_user_sample_buf; @@ -1237,6 +1249,7 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, &backend_csf->ring_buf_cpu_base, &backend_csf->ring_buf); if (errcode) goto err_ring_buf_alloc; + errcode = -ENOMEM; /* Zero all performance enable header to prepare for first enable. */ kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header(backend_csf); @@ -1787,17 +1800,17 @@ int kbase_hwcnt_backend_csf_metadata_init( gpu_info.clk_cnt = csf_info->prfcnt_info.clk_cnt; gpu_info.prfcnt_values_per_block = csf_info->prfcnt_info.prfcnt_block_size / - KBASE_HWCNT_VALUE_BYTES; + KBASE_HWCNT_VALUE_HW_BYTES; errcode = kbase_hwcnt_csf_metadata_create( &gpu_info, csf_info->counter_set, &csf_info->metadata); if (errcode) return errcode; /* - * Dump abstraction size should be exactly the same size and layout as - * the physical dump size, for backwards compatibility. + * Dump abstraction size should be exactly twice the size and layout as + * the physical dump size since 64-bit per value used in metadata. 
*/ - WARN_ON(csf_info->prfcnt_info.dump_bytes != + WARN_ON(csf_info->prfcnt_info.dump_bytes * 2 != csf_info->metadata->dump_buf_bytes); return 0; diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c b/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c index 78a8dc0..124224d 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c +++ b/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c @@ -223,7 +223,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( u32 prfcnt_hw_size = 0; u32 prfcnt_fw_size = 0; u32 prfcnt_block_size = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK * - KBASE_HWCNT_VALUE_BYTES; + KBASE_HWCNT_VALUE_HW_BYTES; WARN_ON(!ctx); WARN_ON(!prfcnt_info); @@ -235,6 +235,16 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( prfcnt_fw_size = (prfcnt_size >> 16) << 8; fw_ctx->buf_bytes = prfcnt_hw_size + prfcnt_fw_size; + /* Read the block size if the GPU has the register PRFCNT_FEATURES + * which was introduced in architecture version 11.x.7. + */ + if ((kbdev->gpu_props.props.raw_props.gpu_id & GPU_ID2_PRODUCT_MODEL) >= + GPU_ID2_PRODUCT_TTUX) { + prfcnt_block_size = + PRFCNT_FEATURES_COUNTER_BLOCK_SIZE_GET(kbase_reg_read( + kbdev, GPU_CONTROL_REG(PRFCNT_FEATURES))) + << 8; + } prfcnt_info->dump_bytes = fw_ctx->buf_bytes; prfcnt_info->prfcnt_block_size = prfcnt_block_size; @@ -246,7 +256,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( prfcnt_info->clearing_samples = true; /* Block size must be multiple of counter size. */ - WARN_ON((prfcnt_info->prfcnt_block_size % KBASE_HWCNT_VALUE_BYTES) != + WARN_ON((prfcnt_info->prfcnt_block_size % KBASE_HWCNT_VALUE_HW_BYTES) != 0); /* Total size must be multiple of block size. */ WARN_ON((prfcnt_info->dump_bytes % prfcnt_info->prfcnt_block_size) != @@ -274,6 +284,11 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc( struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + WARN_ON(!ctx); WARN_ON(!cpu_dump_base); WARN_ON(!out_ring_buf); @@ -322,7 +337,8 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc( /* Update MMU table */ ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, gpu_va_base >> PAGE_SHIFT, phys, num_pages, - flags, MCU_AS_NR, KBASE_MEM_GROUP_CSF_FW); + flags, MCU_AS_NR, KBASE_MEM_GROUP_CSF_FW, + mmu_sync_info); if (ret) goto mmu_insert_failed; diff --git a/mali_kbase/mali_kbase_hwcnt_backend_jm.c b/mali_kbase/mali_kbase_hwcnt_backend_jm.c index 64001b1..56bb1b6 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_jm.c +++ b/mali_kbase/mali_kbase_hwcnt_backend_jm.c @@ -35,17 +35,47 @@ /** * struct kbase_hwcnt_backend_jm_info - Information used to create an instance * of a JM hardware counter backend. - * @kbdev: KBase device. - * @counter_set: The performance counter set to use. - * @metadata: Hardware counter metadata. - * @dump_bytes: Bytes of GPU memory required to perform a - * hardware counter dump. + * @kbdev: KBase device. + * @counter_set: The performance counter set to use. + * @metadata: Hardware counter metadata. + * @dump_bytes: Bytes of GPU memory required to perform a + * hardware counter dump. + * @hwcnt_gpu_info: Hardware counter block information. 
*/ struct kbase_hwcnt_backend_jm_info { struct kbase_device *kbdev; enum kbase_hwcnt_set counter_set; const struct kbase_hwcnt_metadata *metadata; size_t dump_bytes; + struct kbase_hwcnt_gpu_info hwcnt_gpu_info; +}; + +/** + * struct kbase_hwcnt_jm_physical_layout - HWC sample memory physical layout + * information. + * @fe_cnt: Front end block count. + * @tiler_cnt: Tiler block count. + * @mmu_l2_cnt: Memory system(MMU and L2 cache) block count. + * @shader_cnt: Shader Core block count. + * @block_cnt: Total block count (sum of all other block counts). + * @shader_avail_mask: Bitmap of all shader cores in the system. + * @enable_mask_offset: Offset in array elements of enable mask in each block + * starting from the beginning of block. + * @headers_per_block: Header size per block. + * @counters_per_block: Counters size per block. + * @values_per_block: Total size per block. + */ +struct kbase_hwcnt_jm_physical_layout { + u8 fe_cnt; + u8 tiler_cnt; + u8 mmu_l2_cnt; + u8 shader_cnt; + u8 block_cnt; + u64 shader_avail_mask; + size_t enable_mask_offset; + size_t headers_per_block; + size_t counters_per_block; + size_t values_per_block; }; /** @@ -56,11 +86,13 @@ struct kbase_hwcnt_backend_jm_info { * @gpu_dump_va: GPU hardware counter dump buffer virtual address. * @cpu_dump_va: CPU mapping of gpu_dump_va. * @vmap: Dump buffer vmap. + * @to_user_buf: HWC sample buffer for client user, size + * metadata.dump_buf_bytes. * @enabled: True if dumping has been enabled, else false. * @pm_core_mask: PM state sync-ed shaders core mask for the enabled * dumping. - * @curr_config: Current allocated hardware resources to correctly map the src - * raw dump buffer to the dst dump buffer. + * @curr_config: Current allocated hardware resources to correctly map the + * source raw dump buffer to the destination dump buffer. * @clk_enable_map: The enable map specifying enabled clock domains. * @cycle_count_elapsed: * Cycle count elapsed for a given sample period. @@ -71,6 +103,7 @@ struct kbase_hwcnt_backend_jm_info { * sample period. * @rate_listener: Clock rate listener callback state. * @ccswe_shader_cores: Shader cores cycle count software estimator. + * @phys_layout: Physical memory layout information of HWC sample buffer. 
*/ struct kbase_hwcnt_backend_jm { const struct kbase_hwcnt_backend_jm_info *info; @@ -78,6 +111,7 @@ struct kbase_hwcnt_backend_jm { u64 gpu_dump_va; void *cpu_dump_va; struct kbase_vmap_struct *vmap; + u64 *to_user_buf; bool enabled; u64 pm_core_mask; struct kbase_hwcnt_curr_config curr_config; @@ -86,6 +120,7 @@ struct kbase_hwcnt_backend_jm { u64 prev_cycle_count[BASE_MAX_NR_CLOCKS_REGULATORS]; struct kbase_clk_rate_listener rate_listener; struct kbase_ccswe ccswe_shader_cores; + struct kbase_hwcnt_jm_physical_layout phys_layout; }; /** @@ -127,6 +162,63 @@ kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev, return 0; } +static void kbasep_hwcnt_backend_jm_init_layout( + const struct kbase_hwcnt_gpu_info *gpu_info, + struct kbase_hwcnt_jm_physical_layout *phys_layout) +{ + u8 shader_core_cnt; + + WARN_ON(!gpu_info); + WARN_ON(!phys_layout); + + shader_core_cnt = fls64(gpu_info->core_mask); + + *phys_layout = (struct kbase_hwcnt_jm_physical_layout){ + .fe_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT, + .tiler_cnt = KBASE_HWCNT_V5_TILER_BLOCK_COUNT, + .mmu_l2_cnt = gpu_info->l2_count, + .shader_cnt = shader_core_cnt, + .block_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT + + KBASE_HWCNT_V5_TILER_BLOCK_COUNT + + gpu_info->l2_count + shader_core_cnt, + .shader_avail_mask = gpu_info->core_mask, + .headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK, + .values_per_block = gpu_info->prfcnt_values_per_block, + .counters_per_block = gpu_info->prfcnt_values_per_block - + KBASE_HWCNT_V5_HEADERS_PER_BLOCK, + .enable_mask_offset = KBASE_HWCNT_V5_PRFCNT_EN_HEADER, + }; +} + +static void kbasep_hwcnt_backend_jm_dump_sample( + const struct kbase_hwcnt_backend_jm *const backend_jm) +{ + size_t block_idx; + const u32 *new_sample_buf = backend_jm->cpu_dump_va; + const u32 *new_block = new_sample_buf; + u64 *dst_buf = backend_jm->to_user_buf; + u64 *dst_block = dst_buf; + const size_t values_per_block = + backend_jm->phys_layout.values_per_block; + const size_t dump_bytes = backend_jm->info->dump_bytes; + + for (block_idx = 0; block_idx < backend_jm->phys_layout.block_cnt; + block_idx++) { + size_t ctr_idx; + + for (ctr_idx = 0; ctr_idx < values_per_block; ctr_idx++) + dst_block[ctr_idx] = new_block[ctr_idx]; + + new_block += values_per_block; + dst_block += values_per_block; + } + + WARN_ON(new_block != + new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); + WARN_ON(dst_block != + dst_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); +} + /** * kbasep_hwcnt_backend_jm_on_freq_change() - On freq change callback * @@ -487,6 +579,9 @@ static int kbasep_hwcnt_backend_jm_dump_get( kbase_sync_mem_regions( backend_jm->kctx, backend_jm->vmap, KBASE_SYNC_TO_CPU); + /* Dump sample to the internal 64-bit user buffer. */ + kbasep_hwcnt_backend_jm_dump_sample(backend_jm); + kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) { if (!kbase_hwcnt_clk_enable_map_enabled( dst_enable_map->clk_enable_map, clk)) @@ -496,7 +591,7 @@ static int kbasep_hwcnt_backend_jm_dump_get( dst->clk_cnt_buf[clk] = backend_jm->cycle_count_elapsed[clk]; } - return kbase_hwcnt_jm_dump_get(dst, backend_jm->cpu_dump_va, + return kbase_hwcnt_jm_dump_get(dst, backend_jm->to_user_buf, dst_enable_map, backend_jm->pm_core_mask, &backend_jm->curr_config, accumulate); } @@ -519,6 +614,11 @@ static int kbasep_hwcnt_backend_jm_dump_alloc( u64 flags; u64 nr_pages; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + WARN_ON(!info); WARN_ON(!kctx); WARN_ON(!gpu_dump_va); @@ -531,7 +631,8 @@ static int kbasep_hwcnt_backend_jm_dump_alloc( nr_pages = PFN_UP(info->dump_bytes); - reg = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, gpu_dump_va); + reg = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, gpu_dump_va, + mmu_sync_info); if (!reg) return -ENOMEM; @@ -580,6 +681,8 @@ static void kbasep_hwcnt_backend_jm_destroy( kbase_destroy_context(kctx); } + kfree(backend->to_user_buf); + kfree(backend); } @@ -608,6 +711,8 @@ static int kbasep_hwcnt_backend_jm_create( goto alloc_error; backend->info = info; + kbasep_hwcnt_backend_jm_init_layout(&info->hwcnt_gpu_info, + &backend->phys_layout); backend->kctx = kbase_create_context(kbdev, true, BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED, 0, NULL); @@ -623,7 +728,12 @@ static int kbasep_hwcnt_backend_jm_create( backend->cpu_dump_va = kbase_phy_alloc_mapping_get(backend->kctx, backend->gpu_dump_va, &backend->vmap); - if (!backend->cpu_dump_va) + if (!backend->cpu_dump_va || !backend->vmap) + goto alloc_error; + + backend->to_user_buf = + kzalloc(info->metadata->dump_buf_bytes, GFP_KERNEL); + if (!backend->to_user_buf) goto alloc_error; kbase_ccswe_init(&backend->ccswe_shader_cores); @@ -710,19 +820,14 @@ static int kbasep_hwcnt_backend_jm_info_create( const struct kbase_hwcnt_backend_jm_info **out_info) { int errcode = -ENOMEM; - struct kbase_hwcnt_gpu_info hwcnt_gpu_info; struct kbase_hwcnt_backend_jm_info *info = NULL; WARN_ON(!kbdev); WARN_ON(!out_info); - errcode = kbasep_hwcnt_backend_jm_gpu_info_init(kbdev, &hwcnt_gpu_info); - if (errcode) - return errcode; - info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) - goto error; + return errcode; info->kbdev = kbdev; @@ -735,7 +840,12 @@ static int kbasep_hwcnt_backend_jm_info_create( info->counter_set = KBASE_HWCNT_SET_PRIMARY; #endif - errcode = kbase_hwcnt_jm_metadata_create(&hwcnt_gpu_info, + errcode = kbasep_hwcnt_backend_jm_gpu_info_init(kbdev, + &info->hwcnt_gpu_info); + if (errcode) + goto error; + + errcode = kbase_hwcnt_jm_metadata_create(&info->hwcnt_gpu_info, info->counter_set, &info->metadata, &info->dump_bytes); diff --git a/mali_kbase/mali_kbase_hwcnt_gpu.c b/mali_kbase/mali_kbase_hwcnt_gpu.c index 2975269..97a7511 100644 --- a/mali_kbase/mali_kbase_hwcnt_gpu.c +++ b/mali_kbase/mali_kbase_hwcnt_gpu.c @@ -223,7 +223,7 @@ kbasep_hwcnt_backend_jm_dump_bytes(const struct kbase_hwcnt_gpu_info *gpu_info) WARN_ON(!gpu_info); return (2 + gpu_info->l2_count + fls64(gpu_info->core_mask)) * - gpu_info->prfcnt_values_per_block * KBASE_HWCNT_VALUE_BYTES; + gpu_info->prfcnt_values_per_block * KBASE_HWCNT_VALUE_HW_BYTES; } int kbase_hwcnt_jm_metadata_create( @@ -253,10 +253,11 @@ int kbase_hwcnt_jm_metadata_create( return errcode; /* - * Dump abstraction size should be exactly the same size and layout as - * the physical dump size, for backwards compatibility. + * The physical dump size should be half of dump abstraction size in + * metadata since physical HW uses 32-bit per value but metadata + * specifies 64-bit per value. 
*/ - WARN_ON(dump_bytes != metadata->dump_buf_bytes); + WARN_ON(dump_bytes * 2 != metadata->dump_buf_bytes); *out_metadata = metadata; *out_dump_bytes = dump_bytes; @@ -302,127 +303,6 @@ void kbase_hwcnt_csf_metadata_destroy( kbase_hwcnt_metadata_destroy(metadata); } -int kbase_hwcnt_gpu_metadata_create_truncate_64( - const struct kbase_hwcnt_metadata **dst_md, - const struct kbase_hwcnt_metadata *src_md) -{ - struct kbase_hwcnt_description desc; - struct kbase_hwcnt_group_description group; - struct kbase_hwcnt_block_description - blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT]; - size_t prfcnt_values_per_block; - size_t blk; - - if (!dst_md || !src_md || !src_md->grp_metadata || - !src_md->grp_metadata[0].blk_metadata) - return -EINVAL; - - /* Only support 1 group count and KBASE_HWCNT_V5_BLOCK_TYPE_COUNT block - * count in the metadata. - */ - if ((kbase_hwcnt_metadata_group_count(src_md) != 1) || - (kbase_hwcnt_metadata_block_count(src_md, 0) != - KBASE_HWCNT_V5_BLOCK_TYPE_COUNT)) - return -EINVAL; - - /* Get the values count in the first block. */ - prfcnt_values_per_block = - kbase_hwcnt_metadata_block_values_count(src_md, 0, 0); - - /* check all blocks should have same values count. */ - for (blk = 0; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) { - size_t val_cnt = - kbase_hwcnt_metadata_block_values_count(src_md, 0, blk); - if (val_cnt != prfcnt_values_per_block) - return -EINVAL; - } - - /* Only support 64 and 128 entries per block. */ - if ((prfcnt_values_per_block != 64) && (prfcnt_values_per_block != 128)) - return -EINVAL; - - if (prfcnt_values_per_block == 64) { - /* If the values per block is 64, no need to truncate. */ - *dst_md = NULL; - return 0; - } - - /* Truncate from 128 to 64 entries per block to keep API backward - * compatibility. - */ - prfcnt_values_per_block = 64; - - for (blk = 0; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) { - blks[blk].type = - kbase_hwcnt_metadata_block_type(src_md, 0, blk); - blks[blk].inst_cnt = kbase_hwcnt_metadata_block_instance_count( - src_md, 0, blk); - blks[blk].hdr_cnt = kbase_hwcnt_metadata_block_headers_count( - src_md, 0, blk); - blks[blk].ctr_cnt = prfcnt_values_per_block - blks[blk].hdr_cnt; - } - - group.type = kbase_hwcnt_metadata_group_type(src_md, 0); - group.blk_cnt = KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; - group.blks = blks; - - desc.grp_cnt = kbase_hwcnt_metadata_group_count(src_md); - desc.avail_mask = src_md->avail_mask; - desc.clk_cnt = src_md->clk_cnt; - desc.grps = &group; - - return kbase_hwcnt_metadata_create(&desc, dst_md); -} - -void kbase_hwcnt_dump_buffer_copy_strict_narrow( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map) -{ - const struct kbase_hwcnt_metadata *metadata; - size_t grp, blk, blk_inst; - size_t clk; - - if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || - WARN_ON(dst == src) || WARN_ON(dst->metadata == src->metadata) || - WARN_ON(dst->metadata->grp_cnt != src->metadata->grp_cnt) || - WARN_ON(src->metadata->grp_cnt != 1) || - WARN_ON(dst->metadata->grp_metadata[0].blk_cnt != - src->metadata->grp_metadata[0].blk_cnt) || - WARN_ON(dst->metadata->grp_metadata[0].blk_cnt != 4) || - WARN_ON(dst->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt > - src->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt)) - return; - - /* Don't use src metadata since src buffer is bigger than dst buffer. 
*/ - metadata = dst->metadata; - - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( - dst, grp, blk, blk_inst); - const u32 *src_blk = kbase_hwcnt_dump_buffer_block_instance( - src, grp, blk, blk_inst); - const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( - dst_enable_map, grp, blk, blk_inst); - size_t val_cnt = kbase_hwcnt_metadata_block_values_count( - metadata, grp, blk); - /* Align upwards to include padding bytes */ - val_cnt = KBASE_HWCNT_ALIGN_UPWARDS( - val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / - KBASE_HWCNT_VALUE_BYTES)); - - kbase_hwcnt_dump_buffer_block_copy_strict(dst_blk, src_blk, - blk_em, val_cnt); - } - - kbase_hwcnt_metadata_for_each_clock(metadata, clk) { - bool clk_enabled = kbase_hwcnt_clk_enable_map_enabled( - dst_enable_map->clk_enable_map, clk); - - dst->clk_cnt_buf[clk] = clk_enabled ? src->clk_cnt_buf[clk] : 0; - } -} - static bool is_block_type_shader( const u64 grp_type, const u64 blk_type, @@ -462,28 +342,26 @@ static bool is_block_type_l2_cache( return is_l2_cache; } -int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, +int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, const struct kbase_hwcnt_enable_map *dst_enable_map, u64 pm_core_mask, const struct kbase_hwcnt_curr_config *curr_config, bool accumulate) { const struct kbase_hwcnt_metadata *metadata; - const u32 *dump_src; - size_t src_offset, grp, blk, blk_inst; + size_t grp, blk, blk_inst; + const u64 *dump_src = src; + size_t src_offset = 0; u64 core_mask = pm_core_mask; /* Variables to deal with the current configuration */ int l2_count = 0; - bool hw_res_available = true; if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata)) return -EINVAL; metadata = dst->metadata; - dump_src = (const u32 *)src; - src_offset = 0; kbase_hwcnt_metadata_for_each_block( metadata, grp, blk, blk_inst) { @@ -501,6 +379,7 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, const bool is_l2_cache = is_block_type_l2_cache( kbase_hwcnt_metadata_group_type(metadata, grp), blk_type); + bool hw_res_available = true; /* * If l2 blocks is greater than the current allocated number of @@ -525,14 +404,13 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, } /* - * Early out if no values in the dest block are enabled or if - * the resource target of the block is not available in the HW. + * Skip block if no values in the destination block are enabled. 
*/ if (kbase_hwcnt_enable_map_block_enabled( dst_enable_map, grp, blk, blk_inst)) { - u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); - const u32 *src_blk = dump_src + src_offset; + const u64 *src_blk = dump_src + src_offset; if ((!is_shader_core || (core_mask & 1)) && hw_res_available) { if (accumulate) { @@ -560,21 +438,20 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, return 0; } -int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, +int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate) { const struct kbase_hwcnt_metadata *metadata; - const u32 *dump_src; - size_t src_offset, grp, blk, blk_inst; + const u64 *dump_src = src; + size_t src_offset = 0; + size_t grp, blk, blk_inst; if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata)) return -EINVAL; metadata = dst->metadata; - dump_src = (const u32 *)src; - src_offset = 0; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count( @@ -583,12 +460,14 @@ int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk); - /* Early out if no values in the dest block are enabled */ + /* + * Skip block if no values in the destination block are enabled. + */ if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) { - u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); - const u32 *src_blk = dump_src + src_offset; + const u64 *src_blk = dump_src + src_offset; if (accumulate) { kbase_hwcnt_dump_buffer_block_accumulate( @@ -606,48 +485,6 @@ int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, } /** - * kbasep_hwcnt_backend_gpu_block_map_to_physical() - Convert from a block - * enable map abstraction to - * a physical block enable - * map. - * @lo: Low 64 bits of block enable map abstraction. - * @hi: High 64 bits of block enable map abstraction. - * - * The abstraction uses 128 bits to enable 128 block values, whereas the - * physical uses just 32 bits, as bit n enables values [n*4, n*4+3]. - * Therefore, this conversion is lossy. - * - * Return: 32-bit physical block enable map. 
- */ -static inline u32 kbasep_hwcnt_backend_gpu_block_map_to_physical( - u64 lo, - u64 hi) -{ - u32 phys = 0; - u64 dwords[2] = {lo, hi}; - size_t dword_idx; - - for (dword_idx = 0; dword_idx < 2; dword_idx++) { - const u64 dword = dwords[dword_idx]; - u16 packed = 0; - - size_t hword_bit; - - for (hword_bit = 0; hword_bit < 16; hword_bit++) { - const size_t dword_bit = hword_bit * 4; - const u16 mask = - ((dword >> (dword_bit + 0)) & 0x1) | - ((dword >> (dword_bit + 1)) & 0x1) | - ((dword >> (dword_bit + 2)) & 0x1) | - ((dword >> (dword_bit + 3)) & 0x1); - packed |= (mask << hword_bit); - } - phys |= ((u32)packed) << (16 * dword_idx); - } - return phys; -} - -/** * kbasep_hwcnt_backend_gpu_block_map_from_physical() - Convert from a physical * block enable map to a * block enable map @@ -746,14 +583,13 @@ void kbase_hwcnt_gpu_enable_map_to_physical( } } - dst->fe_bm = - kbasep_hwcnt_backend_gpu_block_map_to_physical(fe_bm, 0); + dst->fe_bm = kbase_hwcnt_backend_gpu_block_map_to_physical(fe_bm, 0); dst->shader_bm = - kbasep_hwcnt_backend_gpu_block_map_to_physical(shader_bm, 0); + kbase_hwcnt_backend_gpu_block_map_to_physical(shader_bm, 0); dst->tiler_bm = - kbasep_hwcnt_backend_gpu_block_map_to_physical(tiler_bm, 0); + kbase_hwcnt_backend_gpu_block_map_to_physical(tiler_bm, 0); dst->mmu_l2_bm = - kbasep_hwcnt_backend_gpu_block_map_to_physical(mmu_l2_bm, 0); + kbase_hwcnt_backend_gpu_block_map_to_physical(mmu_l2_bm, 0); } void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, @@ -857,12 +693,12 @@ void kbase_hwcnt_gpu_patch_dump_headers( kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp); - u32 *buf_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *buf_blk = kbase_hwcnt_dump_buffer_block_instance( buf, grp, blk, blk_inst); const u64 *blk_map = kbase_hwcnt_enable_map_block_instance( enable_map, grp, blk, blk_inst); const u32 prfcnt_en = - kbasep_hwcnt_backend_gpu_block_map_to_physical( + kbase_hwcnt_backend_gpu_block_map_to_physical( blk_map[0], 0); if ((enum kbase_hwcnt_gpu_group_type)grp_type == diff --git a/mali_kbase/mali_kbase_hwcnt_gpu.h b/mali_kbase/mali_kbase_hwcnt_gpu.h index 50ae80d..648f85f 100644 --- a/mali_kbase/mali_kbase_hwcnt_gpu.h +++ b/mali_kbase/mali_kbase_hwcnt_gpu.h @@ -29,15 +29,25 @@ struct kbase_hwcnt_metadata; struct kbase_hwcnt_enable_map; struct kbase_hwcnt_dump_buffer; +/* Hardware counter version 5 definitions, V5 is the only supported version. */ #define KBASE_HWCNT_V5_BLOCK_TYPE_COUNT 4 #define KBASE_HWCNT_V5_HEADERS_PER_BLOCK 4 #define KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK 60 #define KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK \ (KBASE_HWCNT_V5_HEADERS_PER_BLOCK + \ KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK) -/** Index of the PRFCNT_EN header into a V5 counter block */ + +/* FrontEnd block count in V5 GPU hardware counter. */ +#define KBASE_HWCNT_V5_FE_BLOCK_COUNT 1 +/* Tiler block count in V5 GPU hardware counter. */ +#define KBASE_HWCNT_V5_TILER_BLOCK_COUNT 1 + +/* Index of the PRFCNT_EN header into a V5 counter block */ #define KBASE_HWCNT_V5_PRFCNT_EN_HEADER 2 +/* Number of bytes for each counter value in hardware. */ +#define KBASE_HWCNT_VALUE_HW_BYTES (sizeof(u32)) + /** * enum kbase_hwcnt_gpu_group_type - GPU hardware counter group types, used to * identify metadata groups. 
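For a sense of scale under these definitions: a default V5 block is 4 headers plus 60 counters, i.e. 64 values. With KBASE_HWCNT_VALUE_HW_BYTES equal to 4, one block in the raw hardware layout occupies 64 * 4 = 256 bytes, while the driver-internal dump buffer, which this patch widens to 64-bit values, needs 64 * 8 = 512 bytes for the same block; the narrowed 32-bit buffer introduced later in the patch is correspondingly half the size of the internal one.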
@@ -84,11 +94,13 @@ enum kbase_hwcnt_gpu_v5_block_type { * @KBASE_HWCNT_SET_PRIMARY: The Primary set of counters * @KBASE_HWCNT_SET_SECONDARY: The Secondary set of counters * @KBASE_HWCNT_SET_TERTIARY: The Tertiary set of counters + * @KBASE_HWCNT_SET_UNDEFINED: Undefined set of counters */ enum kbase_hwcnt_set { KBASE_HWCNT_SET_PRIMARY, KBASE_HWCNT_SET_SECONDARY, KBASE_HWCNT_SET_TERTIARY, + KBASE_HWCNT_SET_UNDEFINED = 255, }; /** @@ -225,61 +237,19 @@ void kbase_hwcnt_csf_metadata_destroy( const struct kbase_hwcnt_metadata *metadata); /** - * kbase_hwcnt_gpu_metadata_create_truncate_64() - Create HWC metadata with HWC - * block entries truncated - * to 64. - * - * @dst_md: Non-NULL pointer to where created metadata is stored on success. - * @src_md: Non-NULL pointer to the HWC metadata used as the source to create - * dst_md. - * - * If the total block entries in src_md is 64, metadata dst_md returns NULL - * since no need to truncate. - * if the total block entries in src_md is 128, then a new metadata with block - * entries truncated to 64 will be created for dst_md, which keeps the interface - * to user clients backward compatible. - * If the total block entries in src_md is other values, function returns error - * since it's not supported. - * - * Return: 0 on success, else error code. - */ -int kbase_hwcnt_gpu_metadata_create_truncate_64( - const struct kbase_hwcnt_metadata **dst_md, - const struct kbase_hwcnt_metadata *src_md); - -/** - * kbase_hwcnt_dump_buffer_copy_strict_narrow() - Copy all enabled values from - * src to dst. - * - * @dst: Non-NULL pointer to dst dump buffer. - * @src: Non-NULL pointer to src dump buffer. - * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. - * - * After the operation, all non-enabled values (including padding bytes) will be - * zero. - * - * The dst and src have different metadata, and the dst metadata is narrower - * than src metadata. - */ -void kbase_hwcnt_dump_buffer_copy_strict_narrow( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map); - -/** * kbase_hwcnt_jm_dump_get() - Copy or accumulate enabled counters from the raw * dump buffer in src into the dump buffer * abstraction in dst. - * @dst: Non-NULL pointer to dst dump buffer. - * @src: Non-NULL pointer to src raw dump buffer, of same length - * as returned in out_dump_bytes parameter of - * kbase_hwcnt_jm_metadata_create. + * @dst: Non-NULL pointer to destination dump buffer. + * @src: Non-NULL pointer to source raw dump buffer, of same length + * as dump_buf_bytes in the metadata of destination dump + * buffer. * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. * @pm_core_mask: PM state synchronized shaders core mask with the dump. * @curr_config: Current allocated hardware resources to correctly map the - * src raw dump buffer to the dst dump buffer. - * @accumulate: True if counters in src should be accumulated into dst, - * rather than copied. + * source raw dump buffer to the destination dump buffer. + * @accumulate: True if counters in source should be accumulated into + * destination, rather than copied. * * The dst and dst_enable_map MUST have been created from the same metadata as * returned from the call to kbase_hwcnt_jm_metadata_create as was used to get @@ -287,7 +257,7 @@ void kbase_hwcnt_dump_buffer_copy_strict_narrow( * * Return: 0 on success, else error code. 
*/ -int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, +int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, const struct kbase_hwcnt_enable_map *dst_enable_map, const u64 pm_core_mask, const struct kbase_hwcnt_curr_config *curr_config, @@ -297,13 +267,12 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, * kbase_hwcnt_csf_dump_get() - Copy or accumulate enabled counters from the raw * dump buffer in src into the dump buffer * abstraction in dst. - * @dst: Non-NULL pointer to dst dump buffer. - * @src: Non-NULL pointer to src raw dump buffer, of same length - * as returned in out_dump_bytes parameter of - * kbase_hwcnt_csf_metadata_create. + * @dst: Non-NULL pointer to destination dump buffer. + * @src: Non-NULL pointer to source raw dump buffer, of same length + * as dump_buf_bytes in the metadata of dst dump buffer. * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. - * @accumulate: True if counters in src should be accumulated into dst, - * rather than copied. + * @accumulate: True if counters in src should be accumulated into + * destination, rather than copied. * * The dst and dst_enable_map MUST have been created from the same metadata as * returned from the call to kbase_hwcnt_csf_metadata_create as was used to get @@ -311,15 +280,54 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, * * Return: 0 on success, else error code. */ -int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, +int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate); /** + * kbase_hwcnt_backend_gpu_block_map_to_physical() - Convert from a block + * enable map abstraction to + * a physical block enable + * map. + * @lo: Low 64 bits of block enable map abstraction. + * @hi: High 64 bits of block enable map abstraction. + * + * The abstraction uses 128 bits to enable 128 block values, whereas the + * physical uses just 32 bits, as bit n enables values [n*4, n*4+3]. + * Therefore, this conversion is lossy. + * + * Return: 32-bit physical block enable map. + */ +static inline u32 kbase_hwcnt_backend_gpu_block_map_to_physical(u64 lo, u64 hi) +{ + u32 phys = 0; + u64 dwords[2] = { lo, hi }; + size_t dword_idx; + + for (dword_idx = 0; dword_idx < 2; dword_idx++) { + const u64 dword = dwords[dword_idx]; + u16 packed = 0; + + size_t hword_bit; + + for (hword_bit = 0; hword_bit < 16; hword_bit++) { + const size_t dword_bit = hword_bit * 4; + const u16 mask = ((dword >> (dword_bit + 0)) & 0x1) | + ((dword >> (dword_bit + 1)) & 0x1) | + ((dword >> (dword_bit + 2)) & 0x1) | + ((dword >> (dword_bit + 3)) & 0x1); + packed |= (mask << hword_bit); + } + phys |= ((u32)packed) << (16 * dword_idx); + } + return phys; +} + +/** * kbase_hwcnt_gpu_enable_map_to_physical() - Convert an enable map abstraction * into a physical enable map. - * @dst: Non-NULL pointer to dst physical enable map. - * @src: Non-NULL pointer to src enable map abstraction. + * @dst: Non-NULL pointer to destination physical enable map. + * @src: Non-NULL pointer to source enable map abstraction. * * The src must have been created from a metadata returned from a call to * kbase_hwcnt_jm_metadata_create or kbase_hwcnt_csf_metadata_create. @@ -336,8 +344,8 @@ void kbase_hwcnt_gpu_enable_map_to_physical( * kbase_hwcnt_gpu_set_to_physical() - Map counter set selection to physical * SET_SELECT value. 
* - * @dst: Non-NULL pointer to dst physical SET_SELECT value. - * @src: Non-NULL pointer to src counter set selection. + * @dst: Non-NULL pointer to destination physical SET_SELECT value. + * @src: Non-NULL pointer to source counter set selection. */ void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, enum kbase_hwcnt_set src); @@ -345,8 +353,8 @@ void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, /** * kbase_hwcnt_gpu_enable_map_from_physical() - Convert a physical enable map to * an enable map abstraction. - * @dst: Non-NULL pointer to dst enable map abstraction. - * @src: Non-NULL pointer to src physical enable map. + * @dst: Non-NULL pointer to destination enable map abstraction. + * @src: Non-NULL pointer to source physical enable map. * * The dst must have been created from a metadata returned from a call to * kbase_hwcnt_jm_metadata_create or kbase_hwcnt_csf_metadata_create. diff --git a/mali_kbase/mali_kbase_hwcnt_gpu_narrow.c b/mali_kbase/mali_kbase_hwcnt_gpu_narrow.c new file mode 100644 index 0000000..e2caa1c --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_gpu_narrow.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_hwcnt_gpu_narrow.h" + +#include <linux/bug.h> +#include <linux/err.h> +#include <linux/slab.h> + +int kbase_hwcnt_gpu_metadata_narrow_create( + const struct kbase_hwcnt_metadata_narrow **dst_md_narrow, + const struct kbase_hwcnt_metadata *src_md) +{ + struct kbase_hwcnt_description desc; + struct kbase_hwcnt_group_description group; + struct kbase_hwcnt_block_description + blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT]; + size_t prfcnt_values_per_block; + size_t blk; + int err; + struct kbase_hwcnt_metadata_narrow *metadata_narrow; + + if (!dst_md_narrow || !src_md || !src_md->grp_metadata || + !src_md->grp_metadata[0].blk_metadata) + return -EINVAL; + + /* Only support 1 group count and KBASE_HWCNT_V5_BLOCK_TYPE_COUNT block + * count in the metadata. + */ + if ((kbase_hwcnt_metadata_group_count(src_md) != 1) || + (kbase_hwcnt_metadata_block_count(src_md, 0) != + KBASE_HWCNT_V5_BLOCK_TYPE_COUNT)) + return -EINVAL; + + /* Get the values count in the first block. */ + prfcnt_values_per_block = + kbase_hwcnt_metadata_block_values_count(src_md, 0, 0); + + /* check all blocks should have same values count. */ + for (blk = 1; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) { + size_t val_cnt = + kbase_hwcnt_metadata_block_values_count(src_md, 0, blk); + if (val_cnt != prfcnt_values_per_block) + return -EINVAL; + } + + /* Only support 64 and 128 entries per block. 
*/ + if ((prfcnt_values_per_block != 64) && (prfcnt_values_per_block != 128)) + return -EINVAL; + + metadata_narrow = kmalloc(sizeof(*metadata_narrow), GFP_KERNEL); + if (!metadata_narrow) + return -ENOMEM; + + /* Narrow to 64 entries per block to keep API backward compatibility. */ + prfcnt_values_per_block = 64; + + for (blk = 0; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) { + size_t blk_hdr_cnt = kbase_hwcnt_metadata_block_headers_count( + src_md, 0, blk); + blks[blk] = (struct kbase_hwcnt_block_description){ + .type = kbase_hwcnt_metadata_block_type(src_md, 0, blk), + .inst_cnt = kbase_hwcnt_metadata_block_instance_count( + src_md, 0, blk), + .hdr_cnt = blk_hdr_cnt, + .ctr_cnt = prfcnt_values_per_block - blk_hdr_cnt, + }; + } + + group = (struct kbase_hwcnt_group_description){ + .type = kbase_hwcnt_metadata_group_type(src_md, 0), + .blk_cnt = KBASE_HWCNT_V5_BLOCK_TYPE_COUNT, + .blks = blks, + }; + + desc = (struct kbase_hwcnt_description){ + .grp_cnt = kbase_hwcnt_metadata_group_count(src_md), + .avail_mask = src_md->avail_mask, + .clk_cnt = src_md->clk_cnt, + .grps = &group, + }; + + err = kbase_hwcnt_metadata_create(&desc, &metadata_narrow->metadata); + if (!err) { + /* Narrow down the buffer size to half as the narrowed metadata + * only supports 32-bit but the created metadata uses 64-bit for + * block entry. + */ + metadata_narrow->dump_buf_bytes = + metadata_narrow->metadata->dump_buf_bytes >> 1; + *dst_md_narrow = metadata_narrow; + } else { + kfree(metadata_narrow); + } + + return err; +} + +void kbase_hwcnt_gpu_metadata_narrow_destroy( + const struct kbase_hwcnt_metadata_narrow *md_narrow) +{ + if (!md_narrow) + return; + + kbase_hwcnt_metadata_destroy(md_narrow->metadata); + kfree(md_narrow); +} + +int kbase_hwcnt_dump_buffer_narrow_alloc( + const struct kbase_hwcnt_metadata_narrow *md_narrow, + struct kbase_hwcnt_dump_buffer_narrow *dump_buf) +{ + size_t dump_buf_bytes; + size_t clk_cnt_buf_bytes; + u8 *buf; + + if (!md_narrow || !dump_buf) + return -EINVAL; + + dump_buf_bytes = md_narrow->dump_buf_bytes; + clk_cnt_buf_bytes = + sizeof(*dump_buf->clk_cnt_buf) * md_narrow->metadata->clk_cnt; + + /* Make a single allocation for both dump_buf and clk_cnt_buf. */ + buf = kmalloc(dump_buf_bytes + clk_cnt_buf_bytes, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + *dump_buf = (struct kbase_hwcnt_dump_buffer_narrow){ + .md_narrow = md_narrow, + .dump_buf = (u32 *)buf, + .clk_cnt_buf = (u64 *)(buf + dump_buf_bytes), + }; + + return 0; +} + +void kbase_hwcnt_dump_buffer_narrow_free( + struct kbase_hwcnt_dump_buffer_narrow *dump_buf_narrow) +{ + if (!dump_buf_narrow) + return; + + kfree(dump_buf_narrow->dump_buf); + *dump_buf_narrow = (struct kbase_hwcnt_dump_buffer_narrow){ 0 }; +} + +int kbase_hwcnt_dump_buffer_narrow_array_alloc( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t n, + struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs) +{ + struct kbase_hwcnt_dump_buffer_narrow *buffers; + size_t buf_idx; + unsigned int order; + unsigned long addr; + size_t dump_buf_bytes; + size_t clk_cnt_buf_bytes; + size_t total_dump_buf_size; + + if (!md_narrow || !dump_bufs) + return -EINVAL; + + dump_buf_bytes = md_narrow->dump_buf_bytes; + clk_cnt_buf_bytes = sizeof(*dump_bufs->bufs->clk_cnt_buf) * + md_narrow->metadata->clk_cnt; + + /* Allocate memory for the dump buffer struct array */ + buffers = kmalloc_array(n, sizeof(*buffers), GFP_KERNEL); + if (!buffers) + return -ENOMEM; + + /* Allocate pages for the actual dump buffers, as they tend to be fairly + * large. 
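To make the single-allocation layout used below concrete (illustrative numbers only): with n = 4 narrow buffers, dump_buf_bytes = 1024 and one clock domain (clk_cnt_buf_bytes = 8), the allocation requested is get_order(4 * 1032) bytes, i.e. order 1 (8 KiB) on a 4 KiB-page system. Buffer 2's dump_buf then starts at page_addr + 2 * 1024 = page_addr + 2048, and its clk_cnt_buf at page_addr + 4 * 1024 + 2 * 8 = page_addr + 4112, because all dump buffers are packed first and the per-buffer cycle-count arrays follow them.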
+ */ + order = get_order((dump_buf_bytes + clk_cnt_buf_bytes) * n); + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + + if (!addr) { + kfree(buffers); + return -ENOMEM; + } + + *dump_bufs = (struct kbase_hwcnt_dump_buffer_narrow_array){ + .page_addr = addr, + .page_order = order, + .buf_cnt = n, + .bufs = buffers, + }; + + total_dump_buf_size = dump_buf_bytes * n; + /* Set the buffer of each dump buf */ + for (buf_idx = 0; buf_idx < n; buf_idx++) { + const size_t dump_buf_offset = dump_buf_bytes * buf_idx; + const size_t clk_cnt_buf_offset = + total_dump_buf_size + (clk_cnt_buf_bytes * buf_idx); + + buffers[buf_idx] = (struct kbase_hwcnt_dump_buffer_narrow){ + .md_narrow = md_narrow, + .dump_buf = (u32 *)(addr + dump_buf_offset), + .clk_cnt_buf = (u64 *)(addr + clk_cnt_buf_offset), + }; + } + + return 0; +} + +void kbase_hwcnt_dump_buffer_narrow_array_free( + struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs) +{ + if (!dump_bufs) + return; + + kfree(dump_bufs->bufs); + free_pages(dump_bufs->page_addr, dump_bufs->page_order); + memset(dump_bufs, 0, sizeof(*dump_bufs)); +} + +void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, + const u64 *src_blk, + const u64 *blk_em, + size_t val_cnt) +{ + size_t val; + + for (val = 0; val < val_cnt; val++) { + bool val_enabled = + kbase_hwcnt_enable_map_block_value_enabled(blk_em, val); + u32 src_val = + (src_blk[val] > U32_MAX) ? U32_MAX : (u32)src_blk[val]; + + dst_blk[val] = val_enabled ? src_val : 0; + } +} + +void kbase_hwcnt_dump_buffer_copy_strict_narrow( + struct kbase_hwcnt_dump_buffer_narrow *dst_narrow, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) +{ + const struct kbase_hwcnt_metadata_narrow *metadata_narrow; + size_t grp; + size_t clk; + + if (WARN_ON(!dst_narrow) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || + WARN_ON(dst_narrow->md_narrow->metadata == src->metadata) || + WARN_ON(dst_narrow->md_narrow->metadata->grp_cnt != + src->metadata->grp_cnt) || + WARN_ON(src->metadata->grp_cnt != 1) || + WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_cnt != + src->metadata->grp_metadata[0].blk_cnt) || + WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_cnt != + KBASE_HWCNT_V5_BLOCK_TYPE_COUNT) || + WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0] + .blk_metadata[0] + .ctr_cnt > + src->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt)) + return; + + /* Don't use src metadata since src buffer is bigger than dst buffer. */ + metadata_narrow = dst_narrow->md_narrow; + + for (grp = 0; + grp < kbase_hwcnt_metadata_narrow_group_count(metadata_narrow); + grp++) { + size_t blk; + size_t blk_cnt = kbase_hwcnt_metadata_narrow_block_count( + metadata_narrow, grp); + + for (blk = 0; blk < blk_cnt; blk++) { + size_t blk_inst; + size_t blk_inst_cnt = + kbase_hwcnt_metadata_narrow_block_instance_count( + metadata_narrow, grp, blk); + + for (blk_inst = 0; blk_inst < blk_inst_cnt; + blk_inst++) { + /* The narrowed down buffer is only 32-bit. 
*/ + u32 *dst_blk = + kbase_hwcnt_dump_buffer_narrow_block_instance( + dst_narrow, grp, blk, blk_inst); + const u64 *src_blk = + kbase_hwcnt_dump_buffer_block_instance( + src, grp, blk, blk_inst); + const u64 *blk_em = + kbase_hwcnt_enable_map_block_instance( + dst_enable_map, grp, blk, + blk_inst); + size_t val_cnt = + kbase_hwcnt_metadata_narrow_block_values_count( + metadata_narrow, grp, blk); + /* Align upwards to include padding bytes */ + val_cnt = KBASE_HWCNT_ALIGN_UPWARDS( + val_cnt, + (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / + KBASE_HWCNT_VALUE_BYTES)); + + kbase_hwcnt_dump_buffer_block_copy_strict_narrow( + dst_blk, src_blk, blk_em, val_cnt); + } + } + } + + for (clk = 0; clk < metadata_narrow->metadata->clk_cnt; clk++) { + bool clk_enabled = kbase_hwcnt_clk_enable_map_enabled( + dst_enable_map->clk_enable_map, clk); + + dst_narrow->clk_cnt_buf[clk] = + clk_enabled ? src->clk_cnt_buf[clk] : 0; + } +} diff --git a/mali_kbase/mali_kbase_hwcnt_gpu_narrow.h b/mali_kbase/mali_kbase_hwcnt_gpu_narrow.h new file mode 100644 index 0000000..af6fa19 --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_gpu_narrow.h @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _KBASE_HWCNT_GPU_NARROW_H_ +#define _KBASE_HWCNT_GPU_NARROW_H_ + +#include "mali_kbase_hwcnt_types.h" +#include <linux/types.h> + +struct kbase_device; +struct kbase_hwcnt_metadata; +struct kbase_hwcnt_enable_map; +struct kbase_hwcnt_dump_buffer; + +/** + * struct kbase_hwcnt_metadata_narrow - Narrow metadata describing the physical + * layout of narrow dump buffers. + * For backward compatibility, the narrow + * metadata only supports 64 counters per + * block and 32-bit per block entry. + * @metadata: Non-NULL pointer to the metadata before narrow down to + * 32-bit per block entry, it has 64 counters per block and + * 64-bit per value. + * @dump_buf_bytes: The size in bytes after narrow 64-bit to 32-bit per block + * entry. + */ +struct kbase_hwcnt_metadata_narrow { + const struct kbase_hwcnt_metadata *metadata; + size_t dump_buf_bytes; +}; + +/** + * struct kbase_hwcnt_dump_buffer_narrow - Hardware counter narrow dump buffer. + * @md_narrow: Non-NULL pointer to narrow metadata used to identify, and to + * describe the layout of the narrow dump buffer. + * @dump_buf: Non-NULL pointer to an array of u32 values, the array size + * is md_narrow->dump_buf_bytes. + * @clk_cnt_buf: A pointer to an array of u64 values for cycle count elapsed + * for each clock domain. + */ +struct kbase_hwcnt_dump_buffer_narrow { + const struct kbase_hwcnt_metadata_narrow *md_narrow; + u32 *dump_buf; + u64 *clk_cnt_buf; +}; + +/** + * struct kbase_hwcnt_dump_buffer_narrow_array - Hardware counter narrow dump + * buffer array. 
+ * @page_addr: Address of first allocated page. A single allocation is used for + * all narrow dump buffers in the array. + * @page_order: The allocation order of the pages, the order is on a logarithmic + * scale. + * @buf_cnt: The number of allocated dump buffers. + * @bufs: Non-NULL pointer to the array of narrow dump buffer descriptors. + */ +struct kbase_hwcnt_dump_buffer_narrow_array { + unsigned long page_addr; + unsigned int page_order; + size_t buf_cnt; + struct kbase_hwcnt_dump_buffer_narrow *bufs; +}; + +/** + * kbase_hwcnt_metadata_narrow_group_count() - Get the number of groups from + * narrow metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * + * Return: Number of hardware counter groups described by narrow metadata. + */ +static inline size_t kbase_hwcnt_metadata_narrow_group_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow) +{ + return kbase_hwcnt_metadata_group_count(md_narrow->metadata); +} + +/** + * kbase_hwcnt_metadata_narrow_group_type() - Get the arbitrary type of a group + * from narrow metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * + * Return: Type of the group grp. + */ +static inline u64 kbase_hwcnt_metadata_narrow_group_type( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp) +{ + return kbase_hwcnt_metadata_group_type(md_narrow->metadata, grp); +} + +/** + * kbase_hwcnt_metadata_narrow_block_count() - Get the number of blocks in a + * group from narrow metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * + * Return: Number of blocks in group grp. + */ +static inline size_t kbase_hwcnt_metadata_narrow_block_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp) +{ + return kbase_hwcnt_metadata_block_count(md_narrow->metadata, grp); +} + +/** + * kbase_hwcnt_metadata_narrow_block_instance_count() - Get the number of + * instances of a block + * from narrow metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * @blk: Index of the block in the group. + * + * Return: Number of instances of block blk in group grp. + */ +static inline size_t kbase_hwcnt_metadata_narrow_block_instance_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, + size_t blk) +{ + return kbase_hwcnt_metadata_block_instance_count(md_narrow->metadata, + grp, blk); +} + +/** + * kbase_hwcnt_metadata_narrow_block_headers_count() - Get the number of counter + * headers from narrow + * metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * @blk: Index of the block in the group. + * + * Return: Number of counter headers in each instance of block blk in group grp. + */ +static inline size_t kbase_hwcnt_metadata_narrow_block_headers_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, + size_t blk) +{ + return kbase_hwcnt_metadata_block_headers_count(md_narrow->metadata, + grp, blk); +} + +/** + * kbase_hwcnt_metadata_narrow_block_counters_count() - Get the number of + * counters from narrow + * metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * @blk: Index of the block in the group. + * + * Return: Number of counters in each instance of block blk in group grp. 
+ */ +static inline size_t kbase_hwcnt_metadata_narrow_block_counters_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, + size_t blk) +{ + return kbase_hwcnt_metadata_block_counters_count(md_narrow->metadata, + grp, blk); +} + +/** + * kbase_hwcnt_metadata_narrow_block_values_count() - Get the number of values + * from narrow metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * @blk: Index of the block in the group. + * + * Return: Number of headers plus counters in each instance of block blk + * in group grp. + */ +static inline size_t kbase_hwcnt_metadata_narrow_block_values_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, + size_t blk) +{ + return kbase_hwcnt_metadata_narrow_block_counters_count(md_narrow, grp, + blk) + + kbase_hwcnt_metadata_narrow_block_headers_count(md_narrow, grp, + blk); +} + +/** + * kbase_hwcnt_dump_buffer_narrow_block_instance() - Get the pointer to a + * narrowed block instance's + * dump buffer. + * @buf: Non-NULL pointer to narrow dump buffer. + * @grp: Index of the group in the narrow metadata. + * @blk: Index of the block in the group. + * @blk_inst: Index of the block instance in the block. + * + * Return: u32* to the dump buffer for the block instance. + */ +static inline u32 *kbase_hwcnt_dump_buffer_narrow_block_instance( + const struct kbase_hwcnt_dump_buffer_narrow *buf, size_t grp, + size_t blk, size_t blk_inst) +{ + return buf->dump_buf + + buf->md_narrow->metadata->grp_metadata[grp].dump_buf_index + + buf->md_narrow->metadata->grp_metadata[grp] + .blk_metadata[blk] + .dump_buf_index + + (buf->md_narrow->metadata->grp_metadata[grp] + .blk_metadata[blk] + .dump_buf_stride * + blk_inst); +} + +/** + * kbase_hwcnt_gpu_metadata_narrow_create() - Create HWC metadata with HWC + * entries per block truncated to + * 64 entries and block entry size + * narrowed down to 32-bit. + * + * @dst_md_narrow: Non-NULL pointer to where created narrow metadata is stored + * on success. + * @src_md: Non-NULL pointer to the HWC metadata used as the source to + * create dst_md_narrow. + * + * For backward compatibility of the interface to user clients, a new metadata + * with entries per block truncated to 64 and block entry size narrowed down + * to 32-bit will be created for dst_md_narrow. + * The total entries per block in src_md must be 64 or 128, if it's other + * values, function returns error since it's not supported. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_gpu_metadata_narrow_create( + const struct kbase_hwcnt_metadata_narrow **dst_md_narrow, + const struct kbase_hwcnt_metadata *src_md); + +/** + * kbase_hwcnt_gpu_metadata_narrow_destroy() - Destroy a hardware counter narrow + * metadata object. + * @md_narrow: Pointer to hardware counter narrow metadata. + */ +void kbase_hwcnt_gpu_metadata_narrow_destroy( + const struct kbase_hwcnt_metadata_narrow *md_narrow); + +/** + * kbase_hwcnt_dump_buffer_narrow_alloc() - Allocate a narrow dump buffer. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @dump_buf: Non-NULL pointer to narrow dump buffer to be initialised. Will be + * initialised to undefined values, so must be used as a copy + * destination, or cleared before use. + * + * Return: 0 on success, else error code. 
+ */ +int kbase_hwcnt_dump_buffer_narrow_alloc( + const struct kbase_hwcnt_metadata_narrow *md_narrow, + struct kbase_hwcnt_dump_buffer_narrow *dump_buf); + +/** + * kbase_hwcnt_dump_buffer_narrow_free() - Free a narrow dump buffer. + * @dump_buf: Dump buffer to be freed. + * + * Can be safely called on an all-zeroed narrow dump buffer structure, or on an + * already freed narrow dump buffer. + */ +void kbase_hwcnt_dump_buffer_narrow_free( + struct kbase_hwcnt_dump_buffer_narrow *dump_buf); + +/** + * kbase_hwcnt_dump_buffer_narrow_array_alloc() - Allocate an array of narrow + * dump buffers. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @n: Number of narrow dump buffers to allocate + * @dump_bufs: Non-NULL pointer to a kbase_hwcnt_dump_buffer_narrow_array + * object to be initialised. + * + * A single zeroed contiguous page allocation will be used for all of the + * buffers inside the object, where: + * dump_bufs->bufs[n].dump_buf == page_addr + n * md_narrow.dump_buf_bytes + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_dump_buffer_narrow_array_alloc( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t n, + struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs); + +/** + * kbase_hwcnt_dump_buffer_narrow_array_free() - Free a narrow dump buffer + * array. + * @dump_bufs: Narrow Dump buffer array to be freed. + * + * Can be safely called on an all-zeroed narrow dump buffer array structure, or + * on an already freed narrow dump buffer array. + */ +void kbase_hwcnt_dump_buffer_narrow_array_free( + struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs); + +/** + * kbase_hwcnt_dump_buffer_block_copy_strict_narrow() - Copy all enabled block + * values from source to + * destination. + * @dst_blk: Non-NULL pointer to destination block obtained from a call to + * kbase_hwcnt_dump_buffer_narrow_block_instance. + * @src_blk: Non-NULL pointer to source block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @blk_em: Non-NULL pointer to the block bitfield(s) obtained from a call to + * kbase_hwcnt_enable_map_block_instance. + * @val_cnt: Number of values in the block. + * + * After the copy, any disabled values in destination will be zero, the enabled + * values in destination will be saturated at U32_MAX if the corresponding + * source value is bigger than U32_MAX, or copy the value from source if the + * corresponding source value is less than or equal to U32_MAX. + */ +void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, + const u64 *src_blk, + const u64 *blk_em, + size_t val_cnt); + +/** + * kbase_hwcnt_dump_buffer_copy_strict_narrow() - Copy all enabled values to a + * narrow dump buffer. + * @dst_narrow: Non-NULL pointer to destination dump buffer. + * @src: Non-NULL pointer to source dump buffer. + * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. + * + * After the operation, all non-enabled values (including padding bytes) will be + * zero. Slower than the non-strict variant. + * + * The enabled values in dst_narrow will be saturated at U32_MAX if the + * corresponding source value is bigger than U32_MAX, or copy the value from + * source if the corresponding source value is less than or equal to U32_MAX. 
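A minimal usage sketch of the narrow API added by this patch (error handling trimmed; "metadata", "dump_buf" and "enable_map" are assumed to be an existing 64-bit metadata/dump-buffer/enable-map triple, and the driver headers above are assumed to be included, as in the legacy client further down):

static int narrow_copy_example(const struct kbase_hwcnt_metadata *metadata,
			       const struct kbase_hwcnt_dump_buffer *dump_buf,
			       const struct kbase_hwcnt_enable_map *enable_map)
{
	const struct kbase_hwcnt_metadata_narrow *md_narrow;
	struct kbase_hwcnt_dump_buffer_narrow narrow_buf;
	int err;

	err = kbase_hwcnt_gpu_metadata_narrow_create(&md_narrow, metadata);
	if (err)
		return err;

	err = kbase_hwcnt_dump_buffer_narrow_alloc(md_narrow, &narrow_buf);
	if (err) {
		kbase_hwcnt_gpu_metadata_narrow_destroy(md_narrow);
		return err;
	}

	/* 64-bit values are clamped to U32_MAX on the way into the 32-bit
	 * user-visible layout; non-enabled values are zeroed.
	 */
	kbase_hwcnt_dump_buffer_copy_strict_narrow(&narrow_buf, dump_buf,
						   enable_map);

	/* ... hand narrow_buf.dump_buf (md_narrow->dump_buf_bytes bytes) to
	 * user space here ...
	 */

	kbase_hwcnt_dump_buffer_narrow_free(&narrow_buf);
	kbase_hwcnt_gpu_metadata_narrow_destroy(md_narrow);
	return 0;
}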
+ */ +void kbase_hwcnt_dump_buffer_copy_strict_narrow( + struct kbase_hwcnt_dump_buffer_narrow *dst_narrow, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); + +#endif /* _KBASE_HWCNT_GPU_NARROW_H_ */ diff --git a/mali_kbase/mali_kbase_hwcnt_legacy.c b/mali_kbase/mali_kbase_hwcnt_legacy.c index 0687253..5ca4c51 100644 --- a/mali_kbase/mali_kbase_hwcnt_legacy.c +++ b/mali_kbase/mali_kbase_hwcnt_legacy.c @@ -23,6 +23,7 @@ #include "mali_kbase_hwcnt_virtualizer.h" #include "mali_kbase_hwcnt_types.h" #include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_hwcnt_gpu_narrow.h" #include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> #include <linux/slab.h> @@ -32,14 +33,22 @@ * struct kbase_hwcnt_legacy_client - Legacy hardware counter client. * @user_dump_buf: Pointer to a non-NULL user buffer, where dumps are returned. * @enable_map: Counter enable map. - * @dump_buf: Dump buffer used to manipulate dumps before copied to user. + * @dump_buf: Dump buffer used to manipulate dumps from virtualizer. * @hvcli: Hardware counter virtualizer client. + * @dump_buf_user: Narrow dump buffer used to manipulate dumps before they are + * copied to user. + * @metadata_user: For compatibility with the user driver interface, this + * contains a narrowed version of the hardware counter metadata + * which is limited to 64 entries per block and 32-bit for each + * entry. */ struct kbase_hwcnt_legacy_client { void __user *user_dump_buf; struct kbase_hwcnt_enable_map enable_map; struct kbase_hwcnt_dump_buffer dump_buf; struct kbase_hwcnt_virtualizer_client *hvcli; + struct kbase_hwcnt_dump_buffer_narrow dump_buf_user; + const struct kbase_hwcnt_metadata_narrow *metadata_user; }; int kbase_hwcnt_legacy_client_create( @@ -61,6 +70,16 @@ int kbase_hwcnt_legacy_client_create( if (!hlcli) return -ENOMEM; + errcode = kbase_hwcnt_gpu_metadata_narrow_create(&hlcli->metadata_user, + metadata); + if (errcode) + goto error; + + errcode = kbase_hwcnt_dump_buffer_narrow_alloc(hlcli->metadata_user, + &hlcli->dump_buf_user); + if (errcode) + goto error; + hlcli->user_dump_buf = (void __user *)(uintptr_t)enable->dump_buffer; errcode = kbase_hwcnt_enable_map_alloc(metadata, &hlcli->enable_map); @@ -99,6 +118,8 @@ void kbase_hwcnt_legacy_client_destroy(struct kbase_hwcnt_legacy_client *hlcli) kbase_hwcnt_virtualizer_client_destroy(hlcli->hvcli); kbase_hwcnt_dump_buffer_free(&hlcli->dump_buf); kbase_hwcnt_enable_map_free(&hlcli->enable_map); + kbase_hwcnt_dump_buffer_narrow_free(&hlcli->dump_buf_user); + kbase_hwcnt_gpu_metadata_narrow_destroy(hlcli->metadata_user); kfree(hlcli); } @@ -123,13 +144,20 @@ int kbase_hwcnt_legacy_client_dump(struct kbase_hwcnt_legacy_client *hlcli) kbase_hwcnt_gpu_patch_dump_headers( &hlcli->dump_buf, &hlcli->enable_map); - /* Zero all non-enabled counters (current values are undefined) */ - kbase_hwcnt_dump_buffer_zero_non_enabled( - &hlcli->dump_buf, &hlcli->enable_map); + /* Copy the dump buffer to the userspace visible buffer. The strict + * variant will explicitly zero any non-enabled counters to ensure + * nothing except exactly what the user asked for is made visible. + * + * A narrow copy is required since virtualizer has a bigger buffer + * but user only needs part of it. 
+ */ + kbase_hwcnt_dump_buffer_copy_strict_narrow( + &hlcli->dump_buf_user, &hlcli->dump_buf, &hlcli->enable_map); /* Copy into the user's buffer */ - errcode = copy_to_user(hlcli->user_dump_buf, hlcli->dump_buf.dump_buf, - hlcli->dump_buf.metadata->dump_buf_bytes); + errcode = copy_to_user(hlcli->user_dump_buf, + hlcli->dump_buf_user.dump_buf, + hlcli->dump_buf_user.md_narrow->dump_buf_bytes); /* Non-zero errcode implies user buf was invalid or too small */ if (errcode) return -EFAULT; diff --git a/mali_kbase/mali_kbase_hwcnt_types.c b/mali_kbase/mali_kbase_hwcnt_types.c index 492f572..d925ed7 100644 --- a/mali_kbase/mali_kbase_hwcnt_types.c +++ b/mali_kbase/mali_kbase_hwcnt_types.c @@ -32,7 +32,7 @@ int kbase_hwcnt_metadata_create( struct kbase_hwcnt_group_metadata *grp_mds; size_t grp; size_t enable_map_count; /* Number of u64 bitfields (inc padding) */ - size_t dump_buf_count; /* Number of u32 values (inc padding) */ + size_t dump_buf_count; /* Number of u64 values (inc padding) */ size_t avail_mask_bits; /* Number of availability mask bits */ size_t size; @@ -220,7 +220,7 @@ int kbase_hwcnt_dump_buffer_alloc( return -ENOMEM; dump_buf->metadata = metadata; - dump_buf->dump_buf = (u32 *)buf; + dump_buf->dump_buf = (u64 *)buf; dump_buf->clk_cnt_buf = (u64 *)(buf + dump_buf_bytes); return 0; @@ -282,7 +282,7 @@ int kbase_hwcnt_dump_buffer_array_alloc( (dump_buf_bytes * n) + (clk_cnt_buf_bytes * buf_idx); buffers[buf_idx].metadata = metadata; - buffers[buf_idx].dump_buf = (u32 *)(addr + dump_buf_offset); + buffers[buf_idx].dump_buf = (u64 *)(addr + dump_buf_offset); buffers[buf_idx].clk_cnt_buf = (u64 *)(addr + clk_cnt_buf_offset); } @@ -316,7 +316,7 @@ void kbase_hwcnt_dump_buffer_zero( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk; + u64 *dst_blk; size_t val_cnt; if (!kbase_hwcnt_enable_map_block_enabled( @@ -362,7 +362,7 @@ void kbase_hwcnt_dump_buffer_zero_non_enabled( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( dst_enable_map, grp, blk, blk_inst); @@ -406,8 +406,8 @@ void kbase_hwcnt_dump_buffer_copy( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk; - const u32 *src_blk; + u64 *dst_blk; + const u64 *src_blk; size_t val_cnt; if (!kbase_hwcnt_enable_map_block_enabled( @@ -451,9 +451,9 @@ void kbase_hwcnt_dump_buffer_copy_strict( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); - const u32 *src_blk = kbase_hwcnt_dump_buffer_block_instance( + const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance( src, grp, blk, blk_inst); const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( dst_enable_map, grp, blk, blk_inst); @@ -497,8 +497,8 @@ void kbase_hwcnt_dump_buffer_accumulate( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk; - const u32 *src_blk; + u64 *dst_blk; + const u64 *src_blk; size_t hdr_cnt; size_t ctr_cnt; @@ -546,9 +546,9 @@ void kbase_hwcnt_dump_buffer_accumulate_strict( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - 
u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); - const u32 *src_blk = kbase_hwcnt_dump_buffer_block_instance( + const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance( src, grp, blk, blk_inst); const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( dst_enable_map, grp, blk, blk_inst); diff --git a/mali_kbase/mali_kbase_hwcnt_types.h b/mali_kbase/mali_kbase_hwcnt_types.h index 6b7985b..f04c0ec 100644 --- a/mali_kbase/mali_kbase_hwcnt_types.h +++ b/mali_kbase/mali_kbase_hwcnt_types.h @@ -61,7 +61,7 @@ * An array of u64 bitfields, where each bit either enables exactly one * block value, or is unused (padding). * Dump Buffer: - * An array of u32 values, where each u32 corresponds either to one block + * An array of u64 values, where each u64 corresponds either to one block * value, or is unused (padding). * Availability Mask: * A bitfield, where each bit corresponds to whether a block instance is @@ -81,6 +81,7 @@ #define _KBASE_HWCNT_TYPES_H_ #include <linux/bitops.h> +#include <linux/bug.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/types.h> @@ -91,8 +92,11 @@ /* Number of bits in each bitfield */ #define KBASE_HWCNT_BITFIELD_BITS (KBASE_HWCNT_BITFIELD_BYTES * BITS_PER_BYTE) -/* Number of bytes for each counter value */ -#define KBASE_HWCNT_VALUE_BYTES (sizeof(u32)) +/* Number of bytes for each counter value. + * Use 64-bit per counter in driver to avoid HW 32-bit register values + * overflow after a long time accumulation. + */ +#define KBASE_HWCNT_VALUE_BYTES (sizeof(u64)) /* Number of bits in an availability mask (i.e. max total number of block * instances supported in a Hardware Counter System) @@ -119,8 +123,8 @@ * contiguous, Hardware Counter Blocks. * @type: The arbitrary identifier used to identify the type of the block. * @inst_cnt: The number of Instances of the block. - * @hdr_cnt: The number of 32-bit Block Headers in the block. - * @ctr_cnt: The number of 32-bit Block Counters in the block. + * @hdr_cnt: The number of 64-bit Block Headers in the block. + * @ctr_cnt: The number of 64-bit Block Counters in the block. */ struct kbase_hwcnt_block_description { u64 type; @@ -165,17 +169,17 @@ struct kbase_hwcnt_description { * @type: The arbitrary identifier used to identify the type of the * block. * @inst_cnt: The number of Instances of the block. - * @hdr_cnt: The number of 32-bit Block Headers in the block. - * @ctr_cnt: The number of 32-bit Block Counters in the block. + * @hdr_cnt: The number of 64-bit Block Headers in the block. + * @ctr_cnt: The number of 64-bit Block Counters in the block. * @enable_map_index: Index in u64s into the parent's Enable Map where the * Enable Map bitfields of the Block Instances described by * this metadata start. * @enable_map_stride: Stride in u64s between the Enable Maps of each of the * Block Instances described by this metadata. - * @dump_buf_index: Index in u32s into the parent's Dump Buffer where the + * @dump_buf_index: Index in u64s into the parent's Dump Buffer where the * Dump Buffers of the Block Instances described by this * metadata start. - * @dump_buf_stride: Stride in u32s between the Dump Buffers of each of the + * @dump_buf_stride: Stride in u64s between the Dump Buffers of each of the * Block Instances described by this metadata. 
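The widening of KBASE_HWCNT_VALUE_BYTES above is easy to motivate with rough numbers: a u32 counter wraps at 4294967295, so a counter that increments roughly once per cycle on a GPU clocked around 1 GHz wraps after only about 4.3 seconds of accumulation, while the same counter held in a u64 would take centuries to wrap. The hardware still produces 32-bit values (KBASE_HWCNT_VALUE_HW_BYTES), but the in-driver representation no longer has to saturate.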
* @avail_mask_index: Index in bits into the parent's Availability Mask where * the Availability Masks of the Block Instances described @@ -208,7 +212,7 @@ struct kbase_hwcnt_block_metadata { * @enable_map_index: Index in u64s into the parent's Enable Map where the * Enable Maps of the blocks within the group described by * this metadata start. - * @dump_buf_index: Index in u32s into the parent's Dump Buffer where the + * @dump_buf_index: Index in u64s into the parent's Dump Buffer where the * Dump Buffers of the blocks within the group described by * metadata start. * @avail_mask_index: Index in bits into the parent's Availability Mask where @@ -225,7 +229,7 @@ struct kbase_hwcnt_group_metadata { }; /** - * struct kbase_hwcnt_metadata - Metadata describing the physical layout + * struct kbase_hwcnt_metadata - Metadata describing the memory layout * of Dump Buffers and Enable Maps within a * Hardware Counter System. * @grp_cnt: The number of Hardware Counter Groups. @@ -264,18 +268,17 @@ struct kbase_hwcnt_enable_map { }; /** - * struct kbase_hwcnt_dump_buffer - Hardware Counter Dump Buffer. Array of u32 - * values. + * struct kbase_hwcnt_dump_buffer - Hardware Counter Dump Buffer. * @metadata: Non-NULL pointer to metadata used to identify, and to describe * the layout of the Dump Buffer. - * @dump_buf: Non-NULL pointer of size metadata->dump_buf_bytes to an array - * of u32 values. + * @dump_buf: Non-NULL pointer to an array of u64 values, the array size is + * metadata->dump_buf_bytes. * @clk_cnt_buf: A pointer to an array of u64 values for cycle count elapsed * for each clock domain. */ struct kbase_hwcnt_dump_buffer { const struct kbase_hwcnt_metadata *metadata; - u32 *dump_buf; + u64 *dump_buf; u64 *clk_cnt_buf; }; @@ -283,7 +286,8 @@ struct kbase_hwcnt_dump_buffer { * struct kbase_hwcnt_dump_buffer_array - Hardware Counter Dump Buffer array. * @page_addr: Address of allocated pages. A single allocation is used for all * Dump Buffers in the array. - * @page_order: The allocation order of the pages. + * @page_order: The allocation order of the pages, the order is on a logarithmic + * scale. * @buf_cnt: The number of allocated Dump Buffers. * @bufs: Non-NULL pointer to the array of Dump Buffers. */ @@ -319,8 +323,14 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of hardware counter groups described by metadata. */ -#define kbase_hwcnt_metadata_group_count(metadata) \ - ((metadata)->grp_cnt) +static inline size_t +kbase_hwcnt_metadata_group_count(const struct kbase_hwcnt_metadata *metadata) +{ + if (WARN_ON(!metadata)) + return 0; + + return metadata->grp_cnt; +} /** * kbase_hwcnt_metadata_group_type() - Get the arbitrary type of a group. @@ -329,8 +339,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Type of the group grp. */ -#define kbase_hwcnt_metadata_group_type(metadata, grp) \ - ((metadata)->grp_metadata[(grp)].type) +static inline u64 +kbase_hwcnt_metadata_group_type(const struct kbase_hwcnt_metadata *metadata, + size_t grp) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt)) + return 0; + + return metadata->grp_metadata[grp].type; +} /** * kbase_hwcnt_metadata_block_count() - Get the number of blocks in a group. @@ -339,8 +356,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of blocks in group grp. 
*/ -#define kbase_hwcnt_metadata_block_count(metadata, grp) \ - ((metadata)->grp_metadata[(grp)].blk_cnt) +static inline size_t +kbase_hwcnt_metadata_block_count(const struct kbase_hwcnt_metadata *metadata, + size_t grp) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_cnt; +} /** * kbase_hwcnt_metadata_block_type() - Get the arbitrary type of a block. @@ -350,8 +374,16 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Type of the block blk in group grp. */ -#define kbase_hwcnt_metadata_block_type(metadata, grp, blk) \ - ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].type) +static inline u64 +kbase_hwcnt_metadata_block_type(const struct kbase_hwcnt_metadata *metadata, + size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_metadata[blk].type; +} /** * kbase_hwcnt_metadata_block_instance_count() - Get the number of instances of @@ -362,8 +394,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of instances of block blk in group grp. */ -#define kbase_hwcnt_metadata_block_instance_count(metadata, grp, blk) \ - ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].inst_cnt) +static inline size_t kbase_hwcnt_metadata_block_instance_count( + const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_metadata[blk].inst_cnt; +} /** * kbase_hwcnt_metadata_block_headers_count() - Get the number of counter @@ -374,8 +413,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of counter headers in each instance of block blk in group grp. */ -#define kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk) \ - ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].hdr_cnt) +static inline size_t kbase_hwcnt_metadata_block_headers_count( + const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_metadata[blk].hdr_cnt; +} /** * kbase_hwcnt_metadata_block_counters_count() - Get the number of counters. @@ -385,8 +431,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of counters in each instance of block blk in group grp. */ -#define kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk) \ - ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].ctr_cnt) +static inline size_t kbase_hwcnt_metadata_block_counters_count( + const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_metadata[blk].ctr_cnt; +} /** * kbase_hwcnt_metadata_block_enable_map_stride() - Get the enable map stride. @@ -396,8 +449,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: enable map stride in each instance of block blk in group grp. 
*/ -#define kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk) \ - ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].enable_map_stride) +static inline size_t kbase_hwcnt_metadata_block_enable_map_stride( + const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_metadata[blk].enable_map_stride; +} /** * kbase_hwcnt_metadata_block_values_count() - Get the number of values. @@ -408,9 +468,16 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * Return: Number of headers plus counters in each instance of block blk * in group grp. */ -#define kbase_hwcnt_metadata_block_values_count(metadata, grp, blk) \ - (kbase_hwcnt_metadata_block_counters_count((metadata), (grp), (blk)) \ - + kbase_hwcnt_metadata_block_headers_count((metadata), (grp), (blk))) +static inline size_t kbase_hwcnt_metadata_block_values_count( + const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk) + + kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk); +} /** * kbase_hwcnt_metadata_for_each_block() - Iterate over each block instance in @@ -496,19 +563,28 @@ void kbase_hwcnt_enable_map_free(struct kbase_hwcnt_enable_map *enable_map); /** * kbase_hwcnt_enable_map_block_instance() - Get the pointer to a block * instance's enable map. - * @map: Non-NULL pointer to (const) enable map. + * @map: Non-NULL pointer to enable map. * @grp: Index of the group in the metadata. * @blk: Index of the block in the group. * @blk_inst: Index of the block instance in the block. * - * Return: (const) u64* to the bitfield(s) used as the enable map for the + * Return: u64* to the bitfield(s) used as the enable map for the * block instance. */ -#define kbase_hwcnt_enable_map_block_instance(map, grp, blk, blk_inst) \ - ((map)->hwcnt_enable_map + \ - (map)->metadata->grp_metadata[(grp)].enable_map_index + \ - (map)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].enable_map_index + \ - (map)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].enable_map_stride * (blk_inst)) +static inline u64 * +kbase_hwcnt_enable_map_block_instance(const struct kbase_hwcnt_enable_map *map, + size_t grp, size_t blk, size_t blk_inst) +{ + return map->hwcnt_enable_map + + map->metadata->grp_metadata[grp].enable_map_index + + map->metadata->grp_metadata[grp] + .blk_metadata[blk] + .enable_map_index + + (map->metadata->grp_metadata[grp] + .blk_metadata[blk] + .enable_map_stride * + blk_inst); +} /** * kbase_hwcnt_bitfield_count() - Calculate the number of u64 bitfields required @@ -827,18 +903,24 @@ void kbase_hwcnt_dump_buffer_array_free( /** * kbase_hwcnt_dump_buffer_block_instance() - Get the pointer to a block * instance's dump buffer. - * @buf: Non-NULL pointer to (const) dump buffer. + * @buf: Non-NULL pointer to dump buffer. * @grp: Index of the group in the metadata. * @blk: Index of the block in the group. * @blk_inst: Index of the block instance in the block. * - * Return: (const) u32* to the dump buffer for the block instance. + * Return: u64* to the dump buffer for the block instance. 
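As a worked example of the index arithmetic in the accessor below (illustrative numbers): if the group's dump_buf_index is 0, a block's dump_buf_index within that group is 192 and its dump_buf_stride is 64, then instance 3 of that block begins 192 + 64 * 3 = 384 u64 values into dump_buf.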
*/ -#define kbase_hwcnt_dump_buffer_block_instance(buf, grp, blk, blk_inst) \ - ((buf)->dump_buf + \ - (buf)->metadata->grp_metadata[(grp)].dump_buf_index + \ - (buf)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].dump_buf_index + \ - (buf)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].dump_buf_stride * (blk_inst)) +static inline u64 *kbase_hwcnt_dump_buffer_block_instance( + const struct kbase_hwcnt_dump_buffer *buf, size_t grp, size_t blk, + size_t blk_inst) +{ + return buf->dump_buf + buf->metadata->grp_metadata[grp].dump_buf_index + + buf->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_index + + (buf->metadata->grp_metadata[grp] + .blk_metadata[blk] + .dump_buf_stride * + blk_inst); +} /** * kbase_hwcnt_dump_buffer_zero() - Zero all enabled values in dst. @@ -859,9 +941,8 @@ void kbase_hwcnt_dump_buffer_zero( * kbase_hwcnt_dump_buffer_block_instance. * @val_cnt: Number of values in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_zero( - u32 *dst_blk, - size_t val_cnt) +static inline void kbase_hwcnt_dump_buffer_block_zero(u64 *dst_blk, + size_t val_cnt) { memset(dst_blk, 0, (val_cnt * KBASE_HWCNT_VALUE_BYTES)); } @@ -904,10 +985,9 @@ void kbase_hwcnt_dump_buffer_zero_non_enabled( * kbase_hwcnt_enable_map_block_instance. * @val_cnt: Number of values in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_zero_non_enabled( - u32 *dst_blk, - const u64 *blk_em, - size_t val_cnt) +static inline void +kbase_hwcnt_dump_buffer_block_zero_non_enabled(u64 *dst_blk, const u64 *blk_em, + size_t val_cnt) { size_t val; @@ -941,10 +1021,9 @@ void kbase_hwcnt_dump_buffer_copy( * kbase_hwcnt_dump_buffer_block_instance. * @val_cnt: Number of values in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_copy( - u32 *dst_blk, - const u32 *src_blk, - size_t val_cnt) +static inline void kbase_hwcnt_dump_buffer_block_copy(u64 *dst_blk, + const u64 *src_blk, + size_t val_cnt) { /* Copy all the counters in the block instance. * Values of non-enabled counters are undefined. @@ -987,11 +1066,10 @@ void kbase_hwcnt_dump_buffer_copy_strict( * * After the copy, any disabled values in dst will be zero. */ -static inline void kbase_hwcnt_dump_buffer_block_copy_strict( - u32 *dst_blk, - const u32 *src_blk, - const u64 *blk_em, - size_t val_cnt) +static inline void kbase_hwcnt_dump_buffer_block_copy_strict(u64 *dst_blk, + const u64 *src_blk, + const u64 *blk_em, + size_t val_cnt) { size_t val; @@ -1032,11 +1110,10 @@ void kbase_hwcnt_dump_buffer_accumulate( * @hdr_cnt: Number of headers in the block. * @ctr_cnt: Number of counters in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_accumulate( - u32 *dst_blk, - const u32 *src_blk, - size_t hdr_cnt, - size_t ctr_cnt) +static inline void kbase_hwcnt_dump_buffer_block_accumulate(u64 *dst_blk, + const u64 *src_blk, + size_t hdr_cnt, + size_t ctr_cnt) { size_t ctr; /* Copy all the headers in the block instance. @@ -1047,21 +1124,8 @@ static inline void kbase_hwcnt_dump_buffer_block_accumulate( /* Accumulate all the counters in the block instance. * Values of non-enabled counters are undefined. 
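A concrete example of how the accumulation below changes with 64-bit storage: if dst already holds 0xFFFF0000 (4294901760) for a counter and the new sample in src is 0x00020000 (131072), the removed u32 path clamps the result to U32_MAX (4294967295), whereas the plain u64 addition that replaces it stores the exact sum 4295032832.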
*/ - for (ctr = hdr_cnt; ctr < ctr_cnt + hdr_cnt; ctr++) { - u32 *dst_ctr = dst_blk + ctr; - const u32 *src_ctr = src_blk + ctr; - - const u32 src_counter = *src_ctr; - const u32 dst_counter = *dst_ctr; - - /* Saturating add */ - u32 accumulated = src_counter + dst_counter; - - if (accumulated < src_counter) - accumulated = U32_MAX; - - *dst_ctr = accumulated; - } + for (ctr = hdr_cnt; ctr < ctr_cnt + hdr_cnt; ctr++) + dst_blk[ctr] += src_blk[ctr]; } /** @@ -1103,10 +1167,7 @@ void kbase_hwcnt_dump_buffer_accumulate_strict( * @ctr_cnt: Number of counters in the block. */ static inline void kbase_hwcnt_dump_buffer_block_accumulate_strict( - u32 *dst_blk, - const u32 *src_blk, - const u64 *blk_em, - size_t hdr_cnt, + u64 *dst_blk, const u64 *src_blk, const u64 *blk_em, size_t hdr_cnt, size_t ctr_cnt) { size_t ctr; @@ -1118,25 +1179,16 @@ static inline void kbase_hwcnt_dump_buffer_block_accumulate_strict( bool ctr_enabled = kbase_hwcnt_enable_map_block_value_enabled( blk_em, ctr); - u32 *dst_ctr = dst_blk + ctr; - const u32 *src_ctr = src_blk + ctr; - - const u32 src_counter = *src_ctr; - const u32 dst_counter = *dst_ctr; - - /* Saturating add */ - u32 accumulated = src_counter + dst_counter; - - if (accumulated < src_counter) - accumulated = U32_MAX; - - *dst_ctr = ctr_enabled ? accumulated : 0; + if (ctr_enabled) + dst_blk[ctr] += src_blk[ctr]; + else + dst_blk[ctr] = 0; } } -/* - * Iterate over each clock domain in the metadata. - * +/** + * kbase_hwcnt_metadata_for_each_clock() - Iterate over each clock domain in the + * metadata. * @md: Non-NULL pointer to metadata. * @clk: size_t variable used as clock iterator. */ diff --git a/mali_kbase/mali_kbase_jd.c b/mali_kbase/mali_kbase_jd.c index 2b071dd..c892455 100644 --- a/mali_kbase/mali_kbase_jd.c +++ b/mali_kbase/mali_kbase_jd.c @@ -76,6 +76,7 @@ static void jd_mark_atom_complete(struct kbase_jd_atom *katom) kbase_kinstr_jm_atom_complete(katom); dev_dbg(katom->kctx->kbdev->dev, "Atom %pK status to completed\n", (void *)katom); + KBASE_TLSTREAM_TL_JD_ATOM_COMPLETE(katom->kctx->kbdev, katom); } /* Runs an atom, either by handing to the JS or by immediately running it in the case of soft-jobs @@ -139,7 +140,13 @@ void kbase_jd_dep_clear_locked(struct kbase_jd_atom *katom) /* katom dep complete, attempt to run it */ bool resched = false; + KBASE_TLSTREAM_TL_RUN_ATOM_START( + katom->kctx->kbdev, katom, + kbase_jd_atom_id(katom->kctx, katom)); resched = jd_run_atom(katom); + KBASE_TLSTREAM_TL_RUN_ATOM_END(katom->kctx->kbdev, katom, + kbase_jd_atom_id(katom->kctx, + katom)); if (katom->status == KBASE_JD_ATOM_STATE_COMPLETED) { /* The atom has already finished */ @@ -715,6 +722,8 @@ bool jd_done_nolock(struct kbase_jd_atom *katom, bool need_to_try_schedule_context = false; int i; + KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_START(kctx->kbdev, katom); + INIT_LIST_HEAD(&completed_jobs); INIT_LIST_HEAD(&runnable_jobs); @@ -736,6 +745,7 @@ bool jd_done_nolock(struct kbase_jd_atom *katom, } jd_mark_atom_complete(katom); + list_add_tail(&katom->jd_item, &completed_jobs); while (!list_empty(&completed_jobs)) { @@ -767,7 +777,13 @@ bool jd_done_nolock(struct kbase_jd_atom *katom, if (node->status != KBASE_JD_ATOM_STATE_COMPLETED && !kbase_ctx_flag(kctx, KCTX_DYING)) { + KBASE_TLSTREAM_TL_RUN_ATOM_START( + kctx->kbdev, node, + kbase_jd_atom_id(kctx, node)); need_to_try_schedule_context |= jd_run_atom(node); + KBASE_TLSTREAM_TL_RUN_ATOM_END( + kctx->kbdev, node, + kbase_jd_atom_id(kctx, node)); } else { node->event_code = katom->event_code; @@ -811,7 +827,7 @@ 
bool jd_done_nolock(struct kbase_jd_atom *katom, */ wake_up(&kctx->jctx.zero_jobs_wait); } - + KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_END(kctx->kbdev, katom); return need_to_try_schedule_context; } @@ -984,7 +1000,6 @@ static bool jd_submit_atom(struct kbase_context *const kctx, * dependencies. */ jd_trace_atom_submit(kctx, katom, NULL); - return jd_done_nolock(katom, NULL); } } @@ -1049,7 +1064,6 @@ static bool jd_submit_atom(struct kbase_context *const kctx, if (err >= 0) kbase_finish_soft_job(katom); } - return jd_done_nolock(katom, NULL); } @@ -1378,10 +1392,10 @@ while (false) } mutex_lock(&jctx->lock); } - + KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_START(kbdev, katom); need_to_try_schedule_context |= jd_submit_atom(kctx, &user_atom, &user_jc_incr, katom); - + KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_END(kbdev, katom); /* Register a completed job as a disjoint event when the GPU is in a disjoint state * (ie. being reset). */ @@ -1479,7 +1493,6 @@ void kbase_jd_done_worker(struct work_struct *data) kbasep_js_remove_job(kbdev, kctx, katom); mutex_unlock(&js_kctx_info->ctx.jsctx_mutex); mutex_unlock(&js_devdata->queue_mutex); - katom->atom_flags &= ~KBASE_KATOM_FLAG_HOLDING_CTX_REF; /* jd_done_nolock() requires the jsctx_mutex lock to be dropped */ jd_done_nolock(katom, &kctx->completed_jobs); @@ -1498,22 +1511,23 @@ void kbase_jd_done_worker(struct work_struct *data) * drop our reference. But do not call kbase_jm_idle_ctx(), as * the context is active and fast-starting is allowed. * - * If an atom has been fast-started then kctx->atoms_pulled will - * be non-zero but KCTX_ACTIVE will still be false (as the - * previous pm reference has been inherited). Do NOT drop our - * reference, as it has been re-used, and leave the context as - * active. + * If an atom has been fast-started then + * kbase_jsctx_atoms_pulled(kctx) will return non-zero but + * KCTX_ACTIVE will still be false (as the previous pm + * reference has been inherited). Do NOT drop our reference, as + * it has been re-used, and leave the context as active. * - * If no new atoms have been started then KCTX_ACTIVE will still - * be false and atoms_pulled will be zero, so drop the reference - * and call kbase_jm_idle_ctx(). + * If no new atoms have been started then KCTX_ACTIVE will + * still be false and kbase_jsctx_atoms_pulled(kctx) will + * return zero, so drop the reference and call + * kbase_jm_idle_ctx(). * * As the checks are done under both the queue_mutex and * hwaccess_lock is should be impossible for this to race * with the scheduler code. */ if (kbase_ctx_flag(kctx, KCTX_ACTIVE) || - !atomic_read(&kctx->atoms_pulled)) { + !kbase_jsctx_atoms_pulled(kctx)) { /* Calling kbase_jm_idle_ctx() here will ensure that * atoms are not fast-started when we drop the * hwaccess_lock. 
This is not performed if diff --git a/mali_kbase/mali_kbase_jm.c b/mali_kbase/mali_kbase_jm.c index 6995050..898606b 100644 --- a/mali_kbase/mali_kbase_jm.c +++ b/mali_kbase/mali_kbase_jm.c @@ -132,6 +132,9 @@ struct kbase_jd_atom *kbase_jm_return_atom_to_js(struct kbase_device *kbdev, dev_dbg(kbdev->dev, "Atom %pK is returning with event code 0x%x\n", (void *)katom, katom->event_code); + KBASE_KTRACE_ADD_JM(kbdev, JM_RETURN_ATOM_TO_JS, katom->kctx, katom, + katom->jc, katom->event_code); + if (katom->event_code != BASE_JD_EVENT_STOPPED && katom->event_code != BASE_JD_EVENT_REMOVED_FROM_NEXT) { return kbase_js_complete_atom(katom, NULL); diff --git a/mali_kbase/mali_kbase_jm.h b/mali_kbase/mali_kbase_jm.h index c6b28f3..eeafcb6 100644 --- a/mali_kbase/mali_kbase_jm.h +++ b/mali_kbase/mali_kbase_jm.h @@ -84,7 +84,7 @@ void kbase_jm_try_kick_all(struct kbase_device *kbdev); * by kbase_js_use_ctx(). * * The context should have no atoms currently pulled from it - * (kctx->atoms_pulled == 0). + * (kbase_jsctx_atoms_pulled(kctx) == 0). * * Caller must hold the hwaccess_lock */ diff --git a/mali_kbase/mali_kbase_js.c b/mali_kbase/mali_kbase_js.c index 3682486..799c7e5 100644 --- a/mali_kbase/mali_kbase_js.c +++ b/mali_kbase/mali_kbase_js.c @@ -372,8 +372,6 @@ jsctx_rb_pull(struct kbase_context *kctx, struct kbase_jd_atom *katom) rb_erase(&katom->runnable_tree_node, &rb->runnable_tree); } -#define LESS_THAN_WRAP(a, b) ((s32)(a - b) < 0) - static void jsctx_tree_add(struct kbase_context *kctx, struct kbase_jd_atom *katom) { @@ -393,7 +391,7 @@ jsctx_tree_add(struct kbase_context *kctx, struct kbase_jd_atom *katom) struct kbase_jd_atom, runnable_tree_node); parent = *new; - if (LESS_THAN_WRAP(katom->age, entry->age)) + if (kbase_jd_atom_is_younger(katom, entry)) new = &((*new)->rb_left); else new = &((*new)->rb_right); @@ -421,6 +419,9 @@ jsctx_rb_unpull(struct kbase_context *kctx, struct kbase_jd_atom *katom) { lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + KBASE_KTRACE_ADD_JM(kctx->kbdev, JS_UNPULL_JOB, kctx, katom, katom->jc, + 0u); + jsctx_tree_add(kctx, katom); } @@ -434,6 +435,67 @@ static bool kbase_js_ctx_list_add_unpullable_nolock(struct kbase_device *kbdev, struct kbase_context *kctx, int js); +typedef bool(katom_ordering_func)(const struct kbase_jd_atom *, + const struct kbase_jd_atom *); + +bool kbase_js_atom_runs_before(struct kbase_device *kbdev, + const struct kbase_jd_atom *katom_a, + const struct kbase_jd_atom *katom_b, + const kbase_atom_ordering_flag_t order_flags) +{ + struct kbase_context *kctx_a = katom_a->kctx; + struct kbase_context *kctx_b = katom_b->kctx; + katom_ordering_func *samectxatomprio_ordering_func = + kbase_jd_atom_is_younger; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (order_flags & KBASE_ATOM_ORDERING_FLAG_SEQNR) + samectxatomprio_ordering_func = kbase_jd_atom_is_earlier; + + /* It only makes sense to make this test for atoms on the same slot */ + WARN_ON(katom_a->slot_nr != katom_b->slot_nr); + + if (kbdev->js_ctx_scheduling_mode == + KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE) { + /* In local priority mode, querying either way around for "a + * should run before b" and "b should run before a" should + * always be false when they're from different contexts + */ + if (kctx_a != kctx_b) + return false; + } else { + /* In system priority mode, ordering is done first strictly by + * context priority, even when katom_b might be lower priority + * than katom_a. 
This is due to scheduling of contexts in order + * of highest priority first, regardless of whether the atoms + * for a particular slot from such contexts have the highest + * priority or not. + */ + if (kctx_a != kctx_b) { + if (kctx_a->priority < kctx_b->priority) + return true; + if (kctx_a->priority > kctx_b->priority) + return false; + } + } + + /* For same contexts/contexts with the same context priority (in system + * priority mode), ordering is next done by atom priority + */ + if (katom_a->sched_priority < katom_b->sched_priority) + return true; + if (katom_a->sched_priority > katom_b->sched_priority) + return false; + /* For atoms of same priority on the same kctx, they are + * ordered by seq_nr/age (dependent on caller) + */ + if (kctx_a == kctx_b && samectxatomprio_ordering_func(katom_a, katom_b)) + return true; + + return false; +} + /* * Functions private to KBase ('Protected' functions) */ @@ -475,6 +537,7 @@ int kbasep_js_devdata_init(struct kbase_device * const kbdev) jsdd->hard_stop_ticks_dumping = DEFAULT_JS_HARD_STOP_TICKS_DUMPING; jsdd->gpu_reset_ticks_ss = DEFAULT_JS_RESET_TICKS_SS; jsdd->gpu_reset_ticks_cl = DEFAULT_JS_RESET_TICKS_CL; + jsdd->gpu_reset_ticks_dumping = DEFAULT_JS_RESET_TICKS_DUMPING; jsdd->ctx_timeslice_ns = DEFAULT_JS_CTX_TIMESLICE_NS; atomic_set(&jsdd->soft_job_timeout_ms, DEFAULT_JS_SOFT_JOB_TIMEOUT); @@ -662,6 +725,147 @@ void kbasep_js_kctx_term(struct kbase_context *kctx) } } +/* + * Priority blocking management functions + */ + +/* Should not normally use directly - use kbase_jsctx_slot_atom_pulled_dec() instead */ +static void kbase_jsctx_slot_prio_blocked_clear(struct kbase_context *kctx, + int js, int sched_prio) +{ + struct kbase_jsctx_slot_tracking *slot_tracking = + &kctx->slot_tracking[js]; + + lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + + slot_tracking->blocked &= ~(((kbase_js_prio_bitmap_t)1) << sched_prio); + KBASE_KTRACE_ADD_JM_SLOT_INFO(kctx->kbdev, JS_SLOT_PRIO_UNBLOCKED, kctx, + NULL, 0, js, (unsigned int)sched_prio); +} + +static int kbase_jsctx_slot_atoms_pulled(struct kbase_context *kctx, int js) +{ + return atomic_read(&kctx->slot_tracking[js].atoms_pulled); +} + +/* + * A priority level on a slot is blocked when: + * - that priority level is blocked + * - or, any higher priority level is blocked + */ +static bool kbase_jsctx_slot_prio_is_blocked(struct kbase_context *kctx, int js, + int sched_prio) +{ + struct kbase_jsctx_slot_tracking *slot_tracking = + &kctx->slot_tracking[js]; + kbase_js_prio_bitmap_t prio_bit, higher_prios_mask; + + lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + + /* done in two separate shifts to prevent future undefined behavior + * should the number of priority levels == (bit width of the type) + */ + prio_bit = (((kbase_js_prio_bitmap_t)1) << sched_prio); + /* all bits of sched_prio or higher, with sched_prio = 0 being the + * highest priority + */ + higher_prios_mask = (prio_bit << 1) - 1u; + return (slot_tracking->blocked & higher_prios_mask) != 0u; +} + +/** + * kbase_jsctx_slot_atom_pulled_inc - Increase counts of atoms that have being + * pulled for a slot from a ctx, based on + * this atom + * @kctx: kbase context + * @katom: atom pulled + * + * Manages counts of atoms pulled (including per-priority-level counts), for + * later determining when a ctx can become unblocked on a slot. + * + * Once a slot has been blocked at @katom's priority level, it should not be + * pulled from, hence this function should not be called in that case. 
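[Illustrative note] The blocked-level test used by kbase_jsctx_slot_prio_is_blocked() reduces to a single mask comparison: build a mask covering the bit for sched_prio and every numerically lower (i.e. higher-priority) bit, then test it against the slot's blocked bitmap. A standalone sketch of the same arithmetic, with an assumed 32-bit bitmap type:

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t prio_bitmap_t; /* stand-in for kbase_js_prio_bitmap_t */

/* Priority 0 is highest; a level is blocked if it or any higher level is. */
static bool prio_is_blocked(prio_bitmap_t blocked, unsigned int sched_prio)
{
	prio_bitmap_t prio_bit = (prio_bitmap_t)1 << sched_prio;
	/* All bits from bit 0 up to and including sched_prio. */
	prio_bitmap_t this_and_higher = (prio_bit << 1) - 1u;

	return (blocked & this_and_higher) != 0u;
}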
+ * + * The return value is to aid tracking of when @kctx becomes runnable. + * + * Return: new total count of atoms pulled from all slots on @kctx + */ +static int kbase_jsctx_slot_atom_pulled_inc(struct kbase_context *kctx, + const struct kbase_jd_atom *katom) +{ + int js = katom->slot_nr; + int sched_prio = katom->sched_priority; + struct kbase_jsctx_slot_tracking *slot_tracking = + &kctx->slot_tracking[js]; + int nr_atoms_pulled; + + lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + + WARN(kbase_jsctx_slot_prio_is_blocked(kctx, js, sched_prio), + "Should not have pulled atoms for slot %d from a context that is blocked at priority %d or higher", + js, sched_prio); + + nr_atoms_pulled = atomic_inc_return(&kctx->atoms_pulled_all_slots); + atomic_inc(&slot_tracking->atoms_pulled); + slot_tracking->atoms_pulled_pri[sched_prio]++; + + return nr_atoms_pulled; +} + +/** + * kbase_jsctx_slot_atom_pulled_dec- Decrease counts of atoms that have being + * pulled for a slot from a ctx, and + * re-evaluate whether a context is blocked + * on this slot + * @kctx: kbase context + * @katom: atom that has just been removed from a job slot + * + * @kctx can become unblocked on a slot for a priority level when it no longer + * has any pulled atoms at that priority level on that slot, and all higher + * (numerically lower) priority levels are also unblocked @kctx on that + * slot. The latter condition is to retain priority ordering within @kctx. + * + * Return: true if the slot was previously blocked but has now become unblocked + * at @katom's priority level, false otherwise. + */ +static bool kbase_jsctx_slot_atom_pulled_dec(struct kbase_context *kctx, + const struct kbase_jd_atom *katom) +{ + int js = katom->slot_nr; + int sched_prio = katom->sched_priority; + int atoms_pulled_pri; + struct kbase_jsctx_slot_tracking *slot_tracking = + &kctx->slot_tracking[js]; + bool slot_prio_became_unblocked = false; + + lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + + atomic_dec(&kctx->atoms_pulled_all_slots); + atomic_dec(&slot_tracking->atoms_pulled); + + atoms_pulled_pri = --(slot_tracking->atoms_pulled_pri[sched_prio]); + + /* We can safely clear this priority level's blocked status even if + * higher priority levels are still blocked: a subsequent query to + * kbase_jsctx_slot_prio_is_blocked() will still return true + */ + if (!atoms_pulled_pri && + kbase_jsctx_slot_prio_is_blocked(kctx, js, sched_prio)) { + kbase_jsctx_slot_prio_blocked_clear(kctx, js, sched_prio); + + if (!kbase_jsctx_slot_prio_is_blocked(kctx, js, sched_prio)) + slot_prio_became_unblocked = true; + } + + if (slot_prio_became_unblocked) + KBASE_KTRACE_ADD_JM_SLOT_INFO(kctx->kbdev, + JS_SLOT_PRIO_AND_HIGHER_UNBLOCKED, + kctx, katom, katom->jc, js, + (unsigned int)sched_prio); + + return slot_prio_became_unblocked; +} + /** * kbase_js_ctx_list_add_pullable_nolock - Variant of * kbase_jd_ctx_list_add_pullable() @@ -694,7 +898,7 @@ static bool kbase_js_ctx_list_add_pullable_nolock(struct kbase_device *kbdev, if (!kctx->slots_pullable) { kbdev->js_data.nr_contexts_pullable++; ret = true; - if (!atomic_read(&kctx->atoms_pulled)) { + if (!kbase_jsctx_atoms_pulled(kctx)) { WARN_ON(kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_set(kctx, KCTX_RUNNABLE_REF); atomic_inc(&kbdev->js_data.nr_contexts_runnable); @@ -736,7 +940,7 @@ static bool kbase_js_ctx_list_add_pullable_head_nolock( if (!kctx->slots_pullable) { kbdev->js_data.nr_contexts_pullable++; ret = true; - if (!atomic_read(&kctx->atoms_pulled)) { + if 
(!kbase_jsctx_atoms_pulled(kctx)) { WARN_ON(kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_set(kctx, KCTX_RUNNABLE_REF); atomic_inc(&kbdev->js_data.nr_contexts_runnable); @@ -809,7 +1013,7 @@ static bool kbase_js_ctx_list_add_unpullable_nolock(struct kbase_device *kbdev, if (kctx->slots_pullable == (1 << js)) { kbdev->js_data.nr_contexts_pullable--; ret = true; - if (!atomic_read(&kctx->atoms_pulled)) { + if (!kbase_jsctx_atoms_pulled(kctx)) { WARN_ON(!kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_clear(kctx, KCTX_RUNNABLE_REF); atomic_dec(&kbdev->js_data.nr_contexts_runnable); @@ -851,7 +1055,7 @@ static bool kbase_js_ctx_list_remove_nolock(struct kbase_device *kbdev, if (kctx->slots_pullable == (1 << js)) { kbdev->js_data.nr_contexts_pullable--; ret = true; - if (!atomic_read(&kctx->atoms_pulled)) { + if (!kbase_jsctx_atoms_pulled(kctx)) { WARN_ON(!kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_clear(kctx, KCTX_RUNNABLE_REF); atomic_dec(&kbdev->js_data.nr_contexts_runnable); @@ -958,9 +1162,12 @@ static bool kbase_js_ctx_pullable(struct kbase_context *kctx, int js, (void *)kctx, js); return false; /* No pullable atoms */ } - if (kctx->blocked_js[js][katom->sched_priority]) { + if (kbase_jsctx_slot_prio_is_blocked(kctx, js, katom->sched_priority)) { + KBASE_KTRACE_ADD_JM_SLOT_INFO( + kctx->kbdev, JS_SLOT_PRIO_IS_BLOCKED, kctx, katom, + katom->jc, js, (unsigned int)katom->sched_priority); dev_dbg(kbdev->dev, - "JS: kctx %pK is blocked from submitting atoms at priority %d (s:%d)\n", + "JS: kctx %pK is blocked from submitting atoms at priority %d and lower (s:%d)\n", (void *)kctx, katom->sched_priority, js); return false; } @@ -2493,9 +2700,9 @@ struct kbase_jd_atom *kbase_js_pull(struct kbase_context *kctx, int js) (void *)kctx, js); return NULL; } - if (kctx->blocked_js[js][katom->sched_priority]) { + if (kbase_jsctx_slot_prio_is_blocked(kctx, js, katom->sched_priority)) { dev_dbg(kbdev->dev, - "JS: kctx %pK is blocked from submitting atoms at priority %d (s:%d)\n", + "JS: kctx %pK is blocked from submitting atoms at priority %d and lower (s:%d)\n", (void *)kctx, katom->sched_priority, js); return NULL; } @@ -2509,7 +2716,7 @@ struct kbase_jd_atom *kbase_js_pull(struct kbase_context *kctx, int js) * not allow multiple runs of fail-dep atoms from the same context to be * present on the same slot */ - if (katom->pre_dep && atomic_read(&kctx->atoms_pulled_slot[js])) { + if (katom->pre_dep && kbase_jsctx_slot_atoms_pulled(kctx, js)) { struct kbase_jd_atom *prev_atom = kbase_backend_inspect_tail(kbdev, js); @@ -2535,23 +2742,21 @@ struct kbase_jd_atom *kbase_js_pull(struct kbase_context *kctx, int js) } } + KBASE_KTRACE_ADD_JM_SLOT_INFO(kbdev, JS_PULL_JOB, kctx, katom, + katom->jc, js, katom->sched_priority); kbase_ctx_flag_set(kctx, KCTX_PULLED); kbase_ctx_flag_set(kctx, (KCTX_PULLED_SINCE_ACTIVE_JS0 << js)); - pulled = atomic_inc_return(&kctx->atoms_pulled); + pulled = kbase_jsctx_slot_atom_pulled_inc(kctx, katom); if (pulled == 1 && !kctx->slots_pullable) { WARN_ON(kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_set(kctx, KCTX_RUNNABLE_REF); atomic_inc(&kbdev->js_data.nr_contexts_runnable); } - atomic_inc(&kctx->atoms_pulled_slot[katom->slot_nr]); - kctx->atoms_pulled_slot_pri[katom->slot_nr][katom->sched_priority]++; jsctx_rb_pull(kctx, katom); kbase_ctx_sched_retain_ctx_refcount(kctx); - katom->atom_flags |= KBASE_KATOM_FLAG_HOLDING_CTX_REF; - katom->ticks = 0; dev_dbg(kbdev->dev, "JS: successfully pulled atom %pK from kctx %pK (s:%d)\n", @@ -2773,15 
+2978,18 @@ static void js_return_worker(struct work_struct *data) struct kbasep_js_kctx_info *js_kctx_info = &kctx->jctx.sched_info; struct kbasep_js_atom_retained_state retained_state; int js = katom->slot_nr; - int prio = katom->sched_priority; + bool slot_became_unblocked; bool timer_sync = false; bool context_idle = false; unsigned long flags; base_jd_core_req core_req = katom->core_req; + u64 cache_jc = katom->jc; dev_dbg(kbdev->dev, "%s for atom %pK with event code 0x%x\n", __func__, (void *)katom, katom->event_code); + KBASE_KTRACE_ADD_JM(kbdev, JS_RETURN_WORKER, kctx, katom, katom->jc, 0); + if (katom->event_code != BASE_JD_EVENT_END_RP_DONE) KBASE_TLSTREAM_TL_EVENT_ATOM_SOFTSTOP_EX(kbdev, katom); @@ -2792,37 +3000,27 @@ static void js_return_worker(struct work_struct *data) mutex_lock(&js_devdata->queue_mutex); mutex_lock(&js_kctx_info->ctx.jsctx_mutex); - atomic_dec(&kctx->atoms_pulled); - atomic_dec(&kctx->atoms_pulled_slot[js]); - if (katom->event_code != BASE_JD_EVENT_END_RP_DONE) atomic_dec(&katom->blocked); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kctx->atoms_pulled_slot_pri[js][katom->sched_priority]--; + slot_became_unblocked = kbase_jsctx_slot_atom_pulled_dec(kctx, katom); - if (!atomic_read(&kctx->atoms_pulled_slot[js]) && - jsctx_rb_none_to_pull(kctx, js)) + if (!kbase_jsctx_slot_atoms_pulled(kctx, js) && + jsctx_rb_none_to_pull(kctx, js)) timer_sync |= kbase_js_ctx_list_remove_nolock(kbdev, kctx, js); - /* If this slot has been blocked due to soft-stopped atoms, and all - * atoms have now been processed, then unblock the slot + /* If the context is now unblocked on this slot after soft-stopped + * atoms, then only mark it as pullable on this slot if it is not + * idle */ - if (!kctx->atoms_pulled_slot_pri[js][prio] && - kctx->blocked_js[js][prio]) { - kctx->blocked_js[js][prio] = false; + if (slot_became_unblocked && kbase_jsctx_atoms_pulled(kctx) && + kbase_js_ctx_pullable(kctx, js, true)) + timer_sync |= + kbase_js_ctx_list_add_pullable_nolock(kbdev, kctx, js); - /* Only mark the slot as pullable if the context is not idle - - * that case is handled below - */ - if (atomic_read(&kctx->atoms_pulled) && - kbase_js_ctx_pullable(kctx, js, true)) - timer_sync |= kbase_js_ctx_list_add_pullable_nolock( - kbdev, kctx, js); - } - - if (!atomic_read(&kctx->atoms_pulled)) { + if (!kbase_jsctx_atoms_pulled(kctx)) { dev_dbg(kbdev->dev, "No atoms currently pulled from context %pK\n", (void *)kctx); @@ -2890,7 +3088,6 @@ static void js_return_worker(struct work_struct *data) mutex_unlock(&kctx->jctx.lock); } - katom->atom_flags &= ~KBASE_KATOM_FLAG_HOLDING_CTX_REF; dev_dbg(kbdev->dev, "JS: retained state %s finished", kbasep_js_has_atom_finished(&retained_state) ? 
"has" : "hasn't"); @@ -2904,6 +3101,9 @@ static void js_return_worker(struct work_struct *data) kbase_backend_complete_wq_post_sched(kbdev, core_req); + KBASE_KTRACE_ADD_JM(kbdev, JS_RETURN_WORKER_END, kctx, NULL, cache_jc, + 0); + dev_dbg(kbdev->dev, "Leaving %s for atom %pK\n", __func__, (void *)katom); } @@ -3113,15 +3313,16 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, spin_lock_irqsave(&kbdev->hwaccess_lock, flags); if (katom->atom_flags & KBASE_KATOM_FLAG_JSCTX_IN_TREE) { + bool slot_became_unblocked; + dev_dbg(kbdev->dev, "Atom %pK is in runnable_tree\n", (void *)katom); - context_idle = !atomic_dec_return(&kctx->atoms_pulled); - atomic_dec(&kctx->atoms_pulled_slot[atom_slot]); - kctx->atoms_pulled_slot_pri[atom_slot][prio]--; + slot_became_unblocked = + kbase_jsctx_slot_atom_pulled_dec(kctx, katom); + context_idle = !kbase_jsctx_atoms_pulled(kctx); - if (!atomic_read(&kctx->atoms_pulled) && - !kctx->slots_pullable) { + if (!kbase_jsctx_atoms_pulled(kctx) && !kctx->slots_pullable) { WARN_ON(!kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_clear(kctx, KCTX_RUNNABLE_REF); atomic_dec(&kbdev->js_data.nr_contexts_runnable); @@ -3129,15 +3330,14 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, } /* If this slot has been blocked due to soft-stopped atoms, and - * all atoms have now been processed, then unblock the slot + * all atoms have now been processed at this priority level and + * higher, then unblock the slot */ - if (!kctx->atoms_pulled_slot_pri[atom_slot][prio] - && kctx->blocked_js[atom_slot][prio]) { + if (slot_became_unblocked) { dev_dbg(kbdev->dev, - "kctx %pK is no longer blocked from submitting on slot %d at priority %d\n", + "kctx %pK is no longer blocked from submitting on slot %d at priority %d or higher\n", (void *)kctx, atom_slot, prio); - kctx->blocked_js[atom_slot][prio] = false; if (kbase_js_ctx_pullable(kctx, atom_slot, true)) timer_sync |= kbase_js_ctx_list_add_pullable_nolock( @@ -3146,8 +3346,8 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, } WARN_ON(!(katom->atom_flags & KBASE_KATOM_FLAG_JSCTX_IN_TREE)); - if (!atomic_read(&kctx->atoms_pulled_slot[atom_slot]) && - jsctx_rb_none_to_pull(kctx, atom_slot)) { + if (!kbase_jsctx_slot_atoms_pulled(kctx, atom_slot) && + jsctx_rb_none_to_pull(kctx, atom_slot)) { if (!list_empty( &kctx->jctx.sched_info.ctx.ctx_list_entry[atom_slot])) timer_sync |= kbase_js_ctx_list_remove_nolock( @@ -3160,8 +3360,8 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, * re-enable submission so that context can be scheduled again. 
*/ if (!kbasep_js_is_submit_allowed(js_devdata, kctx) && - !atomic_read(&kctx->atoms_pulled) && - !kbase_ctx_flag(kctx, KCTX_DYING)) { + !kbase_jsctx_atoms_pulled(kctx) && + !kbase_ctx_flag(kctx, KCTX_DYING)) { int js; kbasep_js_set_submit_allowed(js_devdata, kctx); @@ -3297,7 +3497,9 @@ struct kbase_jd_atom *kbase_js_complete_atom(struct kbase_jd_atom *katom, trace_sysgraph_gpu(SGR_COMPLETE, kctx->id, kbase_jd_atom_id(katom->kctx, katom), katom->slot_nr); + KBASE_TLSTREAM_TL_JD_DONE_START(kbdev, katom); kbase_jd_done(katom, katom->slot_nr, end_timestamp, 0); + KBASE_TLSTREAM_TL_JD_DONE_END(kbdev, katom); /* Unblock cross dependency if present */ if (x_dep && (katom->event_code == BASE_JD_EVENT_DONE || @@ -3405,6 +3607,8 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) bool ctx_waiting[BASE_JM_MAX_NR_SLOTS]; int js; + KBASE_TLSTREAM_TL_JS_SCHED_START(kbdev, 0); + dev_dbg(kbdev->dev, "%s kbdev %pK mask 0x%x\n", __func__, (void *)kbdev, (unsigned int)js_mask); @@ -3460,6 +3664,8 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) &kctx->jctx.sched_info.ctx.jsctx_mutex); mutex_unlock(&js_devdata->queue_mutex); up(&js_devdata->schedule_sem); + KBASE_TLSTREAM_TL_JS_SCHED_END(kbdev, + 0); return; } kbase_ctx_flag_set(kctx, KCTX_ACTIVE); @@ -3604,6 +3810,7 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) mutex_unlock(&js_devdata->queue_mutex); up(&js_devdata->schedule_sem); + KBASE_TLSTREAM_TL_JS_SCHED_END(kbdev, 0); } void kbase_js_zap_context(struct kbase_context *kctx) diff --git a/mali_kbase/mali_kbase_kinstr_prfcnt.c b/mali_kbase/mali_kbase_kinstr_prfcnt.c new file mode 100644 index 0000000..ce996ca --- /dev/null +++ b/mali_kbase/mali_kbase_kinstr_prfcnt.c @@ -0,0 +1,1184 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + */ + +#include "mali_kbase_kinstr_prfcnt.h" +#include "mali_kbase_hwcnt_virtualizer.h" +#include "mali_kbase_hwcnt_types.h" +#include <uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h> +#include "mali_kbase_hwcnt_gpu.h" +#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> +#include "mali_malisw.h" +#include "mali_kbase_debug.h" + +#include <linux/anon_inodes.h> +#include <linux/fcntl.h> +#include <linux/fs.h> +#include <linux/hrtimer.h> +#include <linux/log2.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/poll.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +/* The minimum allowed interval between dumps, in nanoseconds + * (equivalent to 10KHz) + */ +#define DUMP_INTERVAL_MIN_NS (100 * NSEC_PER_USEC) + +/* The minimum allowed interval between dumps, in microseconds + * (equivalent to 10KHz) + */ +#define DUMP_INTERVAL_MIN_US (DUMP_INTERVAL_MIN_NS / 1000) + +/* The maximum allowed buffers per client */ +#define MAX_BUFFER_COUNT 32 + +/** + * struct kbase_kinstr_prfcnt_context - IOCTL interface for userspace hardware + * counters. + * @hvirt: Hardware counter virtualizer used by kinstr_prfcnt. + * @info_item_count: Number of metadata elements. + * @metadata: Hardware counter metadata provided by virtualizer. + * @lock: Lock protecting kinstr_prfcnt state. + * @suspend_count: Suspend reference count. If non-zero, timer and worker + * are prevented from being re-scheduled. + * @client_count: Number of kinstr_prfcnt clients. + * @clients: List of kinstr_prfcnt clients. + * @dump_timer: Timer that enqueues dump_work to a workqueue. + * @dump_work: Worker for performing periodic counter dumps. + */ +struct kbase_kinstr_prfcnt_context { + struct kbase_hwcnt_virtualizer *hvirt; + u32 info_item_count; + const struct kbase_hwcnt_metadata *metadata; + struct mutex lock; + size_t suspend_count; + size_t client_count; + struct list_head clients; + struct hrtimer dump_timer; + struct work_struct dump_work; +}; + +/** + * struct kbase_kinstr_prfcnt_sample - Buffer and descriptor for sample data. + * @sample_meta: Pointer to samle metadata. + * @dump_buf: Dump buffer containing sample data. + */ +struct kbase_kinstr_prfcnt_sample { + u64 *sample_meta; + struct kbase_hwcnt_dump_buffer dump_buf; +}; + +/** + * struct kbase_kinstr_prfcnt_sample_array - Array of sample data. + * @page_addr: Address of allocated pages. A single allocation is used + * for all Dump Buffers in the array. + * @page_order: The allocation order of the pages. + * @sample_count: Number of allocated samples. + * @samples: Non-NULL pointer to the array of Dump Buffers. + */ +struct kbase_kinstr_prfcnt_sample_array { + u64 page_addr; + unsigned int page_order; + size_t sample_count; + struct kbase_kinstr_prfcnt_sample *samples; +}; + +/** + * struct kbase_kinstr_prfcnt_client_config - Client session configuration. + * @prfcnt_mode: Sampling mode: either manual or periodic. + * @counter_set: Set of performance counter blocks. + * @buffer_count: Number of buffers used to store samples. + * @period_us: Sampling period, in microseconds, or 0 if manual mode. + * @phys_em: Enable map used by the GPU. + */ +struct kbase_kinstr_prfcnt_client_config { + u8 prfcnt_mode; + u8 counter_set; + u16 buffer_count; + u64 period_us; + struct kbase_hwcnt_physical_enable_map phys_em; +}; + +/** + * struct kbase_kinstr_prfcnt_client - A kinstr_prfcnt client attached + * to a kinstr_prfcnt context. + * @kinstr_ctx: kinstr_prfcnt context client is attached to. + * @hvcli: Hardware counter virtualizer client. 
+ * @node: Node used to attach this client to list in kinstr_prfcnt + * context. + * @next_dump_time_ns: Time in ns when this client's next periodic dump must + * occur. If 0, not a periodic client. + * @dump_interval_ns: Interval between periodic dumps. If 0, not a periodic + * client. + * @config: Configuration of the client session. + * @enable_map: Counters enable map. + * @tmp_buf: Temporary buffer to use before handing over dump to + * client. + * @sample_arr: Array of dump buffers allocated by this client. + * @dump_bufs_meta: Metadata of dump buffers. + * @meta_idx: Index of metadata being accessed by userspace. + * @read_idx: Index of buffer read by userspace. + * @write_idx: Index of buffer being written by dump worker. + * @waitq: Client's notification queue. + * @sample_size: Size of the data required for one sample, in bytes. + * @sample_count: Number of samples the client is able to capture. + */ +struct kbase_kinstr_prfcnt_client { + struct kbase_kinstr_prfcnt_context *kinstr_ctx; + struct kbase_hwcnt_virtualizer_client *hvcli; + struct list_head node; + u64 next_dump_time_ns; + u32 dump_interval_ns; + struct kbase_kinstr_prfcnt_client_config config; + struct kbase_hwcnt_enable_map enable_map; + struct kbase_hwcnt_dump_buffer tmp_buf; + struct kbase_kinstr_prfcnt_sample_array sample_arr; + struct kbase_hwcnt_reader_metadata *dump_bufs_meta; + atomic_t meta_idx; + atomic_t read_idx; + atomic_t write_idx; + wait_queue_head_t waitq; + size_t sample_size; + size_t sample_count; +}; + +static struct prfcnt_enum_item kinstr_prfcnt_supported_requests[] = { + { + /* Request description for MODE request */ + .hdr = { + .item_type = PRFCNT_ENUM_TYPE_REQUEST, + .item_version = PRFCNT_READER_API_VERSION, + }, + .u.request = { + .request_item_type = PRFCNT_REQUEST_MODE, + .versions_mask = 0x1, + }, + }, + { + /* Request description for ENABLE request */ + .hdr = { + .item_type = PRFCNT_ENUM_TYPE_REQUEST, + .item_version = PRFCNT_READER_API_VERSION, + }, + .u.request = { + .request_item_type = PRFCNT_REQUEST_ENABLE, + .versions_mask = 0x1, + }, + }, +}; + +/** + * kbasep_kinstr_prfcnt_hwcnt_reader_buffer_ready() - Check if client has ready + * buffers. + * @cli: Non-NULL pointer to kinstr_prfcnt client. + * + * Return: Non-zero if client has at least one dumping buffer filled that was + * not notified to user yet. + */ +static int kbasep_kinstr_prfcnt_hwcnt_reader_buffer_ready( + struct kbase_kinstr_prfcnt_client *cli) +{ + WARN_ON(!cli); + return atomic_read(&cli->write_idx) != atomic_read(&cli->meta_idx); +} + +/** + * kbasep_kinstr_prfcnt_hwcnt_reader_poll() - hwcnt reader's poll. + * @filp: Non-NULL pointer to file structure. + * @wait: Non-NULL pointer to poll table. + * + * Return: POLLIN if data can be read without blocking, 0 if data can not be + * read without blocking, else error code. + */ +static unsigned int kbasep_kinstr_prfcnt_hwcnt_reader_poll(struct file *filp, + poll_table *wait) +{ + struct kbase_kinstr_prfcnt_client *cli; + + if (!filp || !wait) + return -EINVAL; + + cli = filp->private_data; + + if (!cli) + return -EINVAL; + + poll_wait(filp, &cli->waitq, wait); + + if (kbasep_kinstr_prfcnt_hwcnt_reader_buffer_ready(cli)) + return POLLIN; + + return 0; +} + +/** + * kbasep_kinstr_prfcnt_hwcnt_reader_ioctl() - hwcnt reader's ioctl. + * @filp: Non-NULL pointer to file structure. + * @cmd: User command. + * @arg: Command's argument. + * + * Return: 0 on success, else error code. 
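[Illustrative note] The poll path above only reports readiness when the dump worker has produced a sample that userspace has not yet consumed, i.e. the write index has advanced past the metadata index. A simplified sketch of that ring-index comparison; the driver uses atomic counters, plain integers are used here for brevity:

#include <stdbool.h>
#include <stdint.h>

struct sample_ring {
	uint32_t write_idx; /* advanced by the dump worker                */
	uint32_t meta_idx;  /* advanced as userspace fetches metadata     */
	uint32_t read_idx;  /* advanced when userspace releases a buffer  */
};

/* At least one completed sample has not yet been handed to userspace. */
static bool sample_ready(const struct sample_ring *r)
{
	return r->write_idx != r->meta_idx;
}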
+ */ +static long kbasep_kinstr_prfcnt_hwcnt_reader_ioctl(struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + long rcode; + struct kbase_kinstr_prfcnt_client *cli; + + if (!filp || (_IOC_TYPE(cmd) != KBASE_HWCNT_READER)) + return -EINVAL; + + cli = filp->private_data; + + if (!cli) + return -EINVAL; + + switch (_IOC_NR(cmd)) { + default: + pr_warn("Unknown HWCNT ioctl 0x%x nr:%d", cmd, _IOC_NR(cmd)); + rcode = -EINVAL; + break; + } + + return rcode; +} + +/** + * kbasep_kinstr_prfcnt_hwcnt_reader_mmap() - hwcnt reader's mmap. + * @filp: Non-NULL pointer to file structure. + * @vma: Non-NULL pointer to vma structure. + * + * Return: 0 on success, else error code. + */ +static int kbasep_kinstr_prfcnt_hwcnt_reader_mmap(struct file *filp, + struct vm_area_struct *vma) +{ + struct kbase_kinstr_prfcnt_client *cli; + unsigned long vm_size, size, addr, pfn, offset; + + if (!filp || !vma) + return -EINVAL; + + cli = filp->private_data; + + if (!cli) + return -EINVAL; + + vm_size = vma->vm_end - vma->vm_start; + + /* The mapping is allowed to span the entirety of the page allocation, + * not just the chunk where the dump buffers are allocated. + * This accommodates the corner case where the combined size of the + * dump buffers is smaller than a single page. + * This does not pose a security risk as the pages are zeroed on + * allocation, and anything out of bounds of the dump buffers is never + * written to. + */ + size = (1ull << cli->sample_arr.page_order) * PAGE_SIZE; + + if (vma->vm_pgoff > (size >> PAGE_SHIFT)) + return -EINVAL; + + offset = vma->vm_pgoff << PAGE_SHIFT; + + if (vm_size > size - offset) + return -EINVAL; + + addr = __pa(cli->sample_arr.page_addr + offset); + pfn = addr >> PAGE_SHIFT; + + return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, + vma->vm_page_prot); +} + +static void kbasep_kinstr_prfcnt_sample_array_free( + struct kbase_kinstr_prfcnt_sample_array *sample_arr) +{ + if (!sample_arr) + return; + + kfree((void *)sample_arr->samples); + kfree((void *)(size_t)sample_arr->page_addr); + memset(sample_arr, 0, sizeof(*sample_arr)); +} + +/** + * kbasep_kinstr_prfcnt_client_destroy() - Destroy a kinstr_prfcnt client. + * @cli: kinstr_prfcnt client. Must not be attached to a kinstr_prfcnt context. + */ +static void +kbasep_kinstr_prfcnt_client_destroy(struct kbase_kinstr_prfcnt_client *cli) +{ + if (!cli) + return; + + kbase_hwcnt_virtualizer_client_destroy(cli->hvcli); + kfree(cli->dump_bufs_meta); + kbasep_kinstr_prfcnt_sample_array_free(&cli->sample_arr); + kbase_hwcnt_dump_buffer_free(&cli->tmp_buf); + kbase_hwcnt_enable_map_free(&cli->enable_map); + kfree(cli); +} + +/** + * kbasep_kinstr_prfcnt_hwcnt_reader_release() - hwcnt reader's release. + * @inode: Non-NULL pointer to inode structure. + * @filp: Non-NULL pointer to file structure. + * + * Return: 0 always. 
+ */ +static int kbasep_kinstr_prfcnt_hwcnt_reader_release(struct inode *inode, + struct file *filp) +{ + struct kbase_kinstr_prfcnt_client *cli = filp->private_data; + + mutex_lock(&cli->kinstr_ctx->lock); + + WARN_ON(cli->kinstr_ctx->client_count == 0); + if (cli->kinstr_ctx->client_count > 0) + cli->kinstr_ctx->client_count--; + list_del(&cli->node); + + mutex_unlock(&cli->kinstr_ctx->lock); + + kbasep_kinstr_prfcnt_client_destroy(cli); + + return 0; +} + +/* kinstr_prfcnt client file operations */ +static const struct file_operations kinstr_prfcnt_client_fops = { + .owner = THIS_MODULE, + .poll = kbasep_kinstr_prfcnt_hwcnt_reader_poll, + .unlocked_ioctl = kbasep_kinstr_prfcnt_hwcnt_reader_ioctl, + .compat_ioctl = kbasep_kinstr_prfcnt_hwcnt_reader_ioctl, + .mmap = kbasep_kinstr_prfcnt_hwcnt_reader_mmap, + .release = kbasep_kinstr_prfcnt_hwcnt_reader_release, +}; + +static size_t kbasep_kinstr_prfcnt_get_sample_size( + const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_dump_buffer *dump_buf) +{ + size_t dump_buf_bytes; + size_t clk_cnt_buf_bytes; + size_t sample_meta_bytes; + size_t block_count = 0; + size_t grp, blk, blk_inst; + + if (!metadata) + return 0; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + block_count++; + + /* Reserve one for last sentinel item. */ + block_count++; + + sample_meta_bytes = sizeof(struct prfcnt_metadata) * block_count; + dump_buf_bytes = metadata->dump_buf_bytes; + clk_cnt_buf_bytes = sizeof(*dump_buf->clk_cnt_buf) * metadata->clk_cnt; + + return (sample_meta_bytes + dump_buf_bytes + clk_cnt_buf_bytes); +} + +/** + * kbasep_kinstr_prfcnt_dump_worker()- Dump worker, that dumps all periodic + * clients that need to be dumped, then + * reschedules itself. + * @work: Work structure. + */ +static void kbasep_kinstr_prfcnt_dump_worker(struct work_struct *work) +{ + /* Do nothing. */ +} + +/** + * kbasep_kinstr_prfcnt_dump_timer() - Dump timer that schedules the dump worker for + * execution as soon as possible. + * @timer: Timer structure. 
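[Illustrative note] kbasep_kinstr_prfcnt_get_sample_size() above sizes one sample as a metadata record per block instance plus a trailing sentinel record, followed by the raw dump buffer and one cycle counter per clock domain. A standalone restatement of that arithmetic; the record type is a placeholder for prfcnt_metadata:

#include <stddef.h>
#include <stdint.h>

struct meta_record { uint64_t hdr[4]; }; /* stand-in for prfcnt_metadata */

static size_t sample_size(size_t block_count, size_t dump_buf_bytes,
			  size_t clk_cnt)
{
	/* One metadata record per block instance plus a sentinel record. */
	size_t meta_bytes = sizeof(struct meta_record) * (block_count + 1);
	size_t clk_bytes = sizeof(uint64_t) * clk_cnt;

	return meta_bytes + dump_buf_bytes + clk_bytes;
}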
+ */ +static enum hrtimer_restart +kbasep_kinstr_prfcnt_dump_timer(struct hrtimer *timer) +{ + return HRTIMER_NORESTART; +} + +int kbase_kinstr_prfcnt_init(struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_kinstr_prfcnt_context **out_kinstr_ctx) +{ + struct kbase_kinstr_prfcnt_context *kinstr_ctx; + const struct kbase_hwcnt_metadata *metadata; + + if (!hvirt || !out_kinstr_ctx) + return -EINVAL; + + metadata = kbase_hwcnt_virtualizer_metadata(hvirt); + + if (!metadata) + return -EINVAL; + + kinstr_ctx = kzalloc(sizeof(*kinstr_ctx), GFP_KERNEL); + + if (!kinstr_ctx) + return -ENOMEM; + + kinstr_ctx->hvirt = hvirt; + kinstr_ctx->metadata = metadata; + + mutex_init(&kinstr_ctx->lock); + INIT_LIST_HEAD(&kinstr_ctx->clients); + hrtimer_init(&kinstr_ctx->dump_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + kinstr_ctx->dump_timer.function = kbasep_kinstr_prfcnt_dump_timer; + INIT_WORK(&kinstr_ctx->dump_work, kbasep_kinstr_prfcnt_dump_worker); + + *out_kinstr_ctx = kinstr_ctx; + return 0; +} + +void kbase_kinstr_prfcnt_term(struct kbase_kinstr_prfcnt_context *kinstr_ctx) +{ + if (!kinstr_ctx) + return; + + cancel_work_sync(&kinstr_ctx->dump_work); + + /* Non-zero client count implies client leak */ + if (WARN_ON(kinstr_ctx->client_count > 0)) { + struct kbase_kinstr_prfcnt_client *pos, *n; + + list_for_each_entry_safe(pos, n, &kinstr_ctx->clients, node) { + list_del(&pos->node); + kinstr_ctx->client_count--; + kbasep_kinstr_prfcnt_client_destroy(pos); + } + } + + WARN_ON(kinstr_ctx->client_count > 0); + kfree(kinstr_ctx); +} + +void kbase_kinstr_prfcnt_suspend(struct kbase_kinstr_prfcnt_context *kinstr_ctx) +{ + if (WARN_ON(!kinstr_ctx)) + return; + + mutex_lock(&kinstr_ctx->lock); + + if (!WARN_ON(kinstr_ctx->suspend_count == SIZE_MAX)) + kinstr_ctx->suspend_count++; + + mutex_unlock(&kinstr_ctx->lock); + + /* Always sync cancel the timer and then the worker, regardless of the + * new suspend count. + * + * This ensures concurrent calls to kbase_kinstr_prfcnt_suspend() always block + * until kinstr_prfcnt is fully suspended. + * + * The timer is canceled before the worker, as the timer + * unconditionally re-enqueues the worker, but the worker checks the + * suspend_count that we just incremented before rescheduling the timer. + * + * Therefore if we cancel the worker first, the timer might re-enqueue + * the worker before we cancel the timer, but the opposite is not + * possible. + */ + hrtimer_cancel(&kinstr_ctx->dump_timer); + cancel_work_sync(&kinstr_ctx->dump_work); +} + +void kbase_kinstr_prfcnt_resume(struct kbase_kinstr_prfcnt_context *kinstr_ctx) +{ + if (WARN_ON(!kinstr_ctx)) + return; + + mutex_lock(&kinstr_ctx->lock); + + if (!WARN_ON(kinstr_ctx->suspend_count == 0)) { + kinstr_ctx->suspend_count--; + + /* Last resume, so re-enqueue the worker if we have any periodic + * clients. 
+ */ + if (kinstr_ctx->suspend_count == 0) { + struct kbase_kinstr_prfcnt_client *pos; + bool has_periodic_clients = false; + + list_for_each_entry(pos, &kinstr_ctx->clients, node) { + if (pos->dump_interval_ns != 0) { + has_periodic_clients = true; + break; + } + } + + if (has_periodic_clients) + kbase_hwcnt_virtualizer_queue_work( + kinstr_ctx->hvirt, + &kinstr_ctx->dump_work); + } + } + + mutex_unlock(&kinstr_ctx->lock); +} + +static int kbasep_kinstr_prfcnt_sample_array_alloc( + const struct kbase_hwcnt_metadata *metadata, size_t n, + struct kbase_kinstr_prfcnt_sample_array *sample_arr) +{ + struct kbase_kinstr_prfcnt_sample *samples; + size_t sample_idx; + u64 addr; + unsigned int order; + size_t dump_buf_bytes; + size_t clk_cnt_buf_bytes; + size_t sample_meta_bytes; + size_t block_count = 0; + size_t sample_size; + size_t grp, blk, blk_inst; + + if (!metadata || !sample_arr) + return -EINVAL; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + block_count++; + + /* Reserve one for last sentinel item. */ + block_count++; + + sample_meta_bytes = sizeof(struct prfcnt_metadata) * block_count; + dump_buf_bytes = metadata->dump_buf_bytes; + clk_cnt_buf_bytes = + sizeof(*samples->dump_buf.clk_cnt_buf) * metadata->clk_cnt; + sample_size = sample_meta_bytes + dump_buf_bytes + clk_cnt_buf_bytes; + + samples = kmalloc_array(n, sizeof(*samples), GFP_KERNEL); + + if (!samples) + return -ENOMEM; + + order = get_order(sample_size * n); + addr = (u64)(uintptr_t)kzalloc(sample_size * n, GFP_KERNEL); + + if (!addr) { + kfree((void *)samples); + return -ENOMEM; + } + + sample_arr->page_addr = addr; + sample_arr->page_order = order; + sample_arr->sample_count = n; + sample_arr->samples = samples; + + for (sample_idx = 0; sample_idx < n; sample_idx++) { + const size_t sample_meta_offset = sample_size * sample_idx; + const size_t dump_buf_offset = + sample_meta_offset + sample_meta_bytes; + const size_t clk_cnt_buf_offset = + dump_buf_offset + dump_buf_bytes; + + /* Internal layout in a sample buffer: [sample metadata, dump_buf, clk_cnt_buf]. */ + samples[sample_idx].dump_buf.metadata = metadata; + samples[sample_idx].sample_meta = + (u64 *)(uintptr_t)(addr + sample_meta_offset); + samples[sample_idx].dump_buf.dump_buf = + (u64 *)(uintptr_t)(addr + dump_buf_offset); + samples[sample_idx].dump_buf.clk_cnt_buf = + (u64 *)(uintptr_t)(addr + clk_cnt_buf_offset); + } + + return 0; +} + +static bool prfcnt_mode_supported(u8 mode) +{ + return (mode == PRFCNT_MODE_MANUAL) || (mode == PRFCNT_MODE_PERIODIC); +} + +static void +kbasep_kinstr_prfcnt_block_enable_to_physical(uint32_t *phys_em, + const uint64_t *enable_mask) +{ + *phys_em |= kbase_hwcnt_backend_gpu_block_map_to_physical( + enable_mask[0], enable_mask[1]); +} + +/** + * kbasep_kinstr_prfcnt_parse_request_enable - Parse an enable request + * @req_enable: Performance counters enable request to parse. + * @config: Client object the session configuration should be written to. + * + * This function parses a performance counters enable request. + * This type of request specifies a bitmask of HW counters to enable + * for one performance counters block type. In addition to that, + * a performance counters enable request may also set "global" + * configuration properties that affect the whole session, like the + * performance counters set, which shall be compatible with the same value + * set by other performance request items. + * + * Return: 0 on success, else error code. 
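[Illustrative note] Each sample in the array allocated above is laid out back to back inside one allocation as [sample metadata | dump buffer | clock counter buffer]. The sketch below shows how the three pointers for sample i are derived from the base address and the per-region sizes; the struct is illustrative only:

#include <stddef.h>
#include <stdint.h>

struct sample_ptrs {
	uint64_t *meta;
	uint64_t *dump_buf;
	uint64_t *clk_cnt_buf;
};

static struct sample_ptrs sample_layout(uintptr_t base, size_t sample_size,
					size_t meta_bytes,
					size_t dump_buf_bytes, size_t idx)
{
	uintptr_t meta_off = sample_size * idx;
	struct sample_ptrs p = {
		.meta        = (uint64_t *)(base + meta_off),
		.dump_buf    = (uint64_t *)(base + meta_off + meta_bytes),
		.clk_cnt_buf = (uint64_t *)(base + meta_off + meta_bytes +
					    dump_buf_bytes),
	};
	return p;
}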
+ */ +static int kbasep_kinstr_prfcnt_parse_request_enable( + const struct prfcnt_request_enable *req_enable, + struct kbase_kinstr_prfcnt_client_config *config) +{ + int err = 0; + u8 req_set = KBASE_HWCNT_SET_UNDEFINED, default_set; + + switch (req_enable->set) { + case PRFCNT_SET_PRIMARY: + req_set = KBASE_HWCNT_SET_PRIMARY; + break; + case PRFCNT_SET_SECONDARY: + req_set = KBASE_HWCNT_SET_SECONDARY; + break; + case PRFCNT_SET_TERTIARY: + req_set = KBASE_HWCNT_SET_TERTIARY; + break; + default: + err = -EINVAL; + break; + } + + /* The performance counter set is a "global" property that affects + * the whole session. Either this is the first request that sets + * the value, or it shall be identical to all previous requests. + */ + if (!err) { + if (config->counter_set == KBASE_HWCNT_SET_UNDEFINED) + config->counter_set = req_set; + else if (config->counter_set != req_set) + err = -EINVAL; + } + + /* Temporarily, the requested set cannot be different from the default + * set because it's the only one to be supported. This will change in + * the future. + */ +#if defined(CONFIG_MALI_PRFCNT_SET_SECONDARY) + default_set = KBASE_HWCNT_SET_SECONDARY; +#elif defined(CONFIG_MALI_PRFCNT_SET_TERTIARY) + default_set = KBASE_HWCNT_SET_TERTIARY; +#else + /* Default to primary */ + default_set = KBASE_HWCNT_SET_PRIMARY; +#endif + + if (req_set != default_set) + err = -EINVAL; + + if (err < 0) + return err; + + /* Enable the performance counters based on the bitmask provided + * by the user space client. + * It is possible to receive multiple requests for the same counter + * block, in which case the bitmask will be a logical OR of all the + * bitmasks given by the client. + */ + switch (req_enable->block_type) { + case PRFCNT_BLOCK_TYPE_FE: + kbasep_kinstr_prfcnt_block_enable_to_physical( + &config->phys_em.fe_bm, req_enable->enable_mask); + break; + case PRFCNT_BLOCK_TYPE_TILER: + kbasep_kinstr_prfcnt_block_enable_to_physical( + &config->phys_em.tiler_bm, req_enable->enable_mask); + break; + case PRFCNT_BLOCK_TYPE_MEMORY: + kbasep_kinstr_prfcnt_block_enable_to_physical( + &config->phys_em.mmu_l2_bm, req_enable->enable_mask); + break; + case PRFCNT_BLOCK_TYPE_SHADER_CORE: + kbasep_kinstr_prfcnt_block_enable_to_physical( + &config->phys_em.shader_bm, req_enable->enable_mask); + break; + default: + err = -EINVAL; + break; + } + + return err; +} + +/** + * kbasep_kinstr_prfcnt_parse_setup - Parse session setup + * @kinstr_ctx: Pointer to the kinstr_prfcnt context. + * @setup: Session setup information to parse. + * @config: Client object the session configuration should be written to. + * + * This function parses the list of "request" items sent by the user space + * client, and writes the configuration for the new client to be created + * for the session. + * + * Return: 0 on success, else error code. 
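[Illustrative note] The enable-request parser above enforces two rules: a session-wide property (such as the counter set) is fixed by the first request that specifies it and every later request must repeat the same value, while per-block enable bitmasks from repeated requests are OR-ed together. A minimal sketch of both rules with illustrative names and a single block type:

#include <stdint.h>

#define SET_UNDEFINED 0xffu

struct session_cfg {
	uint8_t counter_set;   /* starts as SET_UNDEFINED           */
	uint64_t fe_enable_bm; /* accumulated front-end enable mask */
};

/* Returns 0 on success, -1 if the request conflicts with an earlier one. */
static int apply_enable_request(struct session_cfg *cfg, uint8_t req_set,
				uint64_t req_fe_mask)
{
	if (cfg->counter_set == SET_UNDEFINED)
		cfg->counter_set = req_set;   /* first request decides */
	else if (cfg->counter_set != req_set)
		return -1;                    /* conflicting value     */

	cfg->fe_enable_bm |= req_fe_mask;     /* masks accumulate      */
	return 0;
}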
+ */ +static int kbasep_kinstr_prfcnt_parse_setup( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + union kbase_ioctl_kinstr_prfcnt_setup *setup, + struct kbase_kinstr_prfcnt_client_config *config) +{ + uint32_t i; + struct prfcnt_request_item *req_arr; + int err = 0; + + if (!setup->in.requests_ptr || (setup->in.request_item_count == 0) || + (setup->in.request_item_size == 0)) { + return -EINVAL; + } + + req_arr = + (struct prfcnt_request_item *)(uintptr_t)setup->in.requests_ptr; + + if (req_arr[setup->in.request_item_count - 1].hdr.item_type != + FLEX_LIST_TYPE_NONE) { + return -EINVAL; + } + + if (req_arr[setup->in.request_item_count - 1].hdr.item_version != 0) + return -EINVAL; + + /* The session configuration can only feature one value for some + * properties (like capture mode and block counter set), but the client + * may potential issue multiple requests and try to set more than one + * value for those properties. While issuing multiple requests for the + * same property is allowed by the protocol, asking for different values + * is illegal. Leaving these properties as undefined is illegal, too. + */ + config->prfcnt_mode = PRFCNT_MODE_RESERVED; + config->counter_set = KBASE_HWCNT_SET_UNDEFINED; + + for (i = 0; i < setup->in.request_item_count - 1; i++) { + if (req_arr[i].hdr.item_version > PRFCNT_READER_API_VERSION) { + err = -EINVAL; + break; + } + + switch (req_arr[i].hdr.item_type) { + /* Capture mode is initialized as undefined. + * The first request of this type sets the capture mode. + * The protocol allows the client to send redundant requests, + * but only if they replicate the same value that has already + * been set by the first request. + */ + case PRFCNT_REQUEST_TYPE_MODE: + if (!prfcnt_mode_supported(req_arr[i].u.req_mode.mode)) + err = -EINVAL; + else if (config->prfcnt_mode == PRFCNT_MODE_RESERVED) + config->prfcnt_mode = + req_arr[i].u.req_mode.mode; + else if (req_arr[i].u.req_mode.mode != + config->prfcnt_mode) + err = -EINVAL; + + if (err < 0) + break; + + if (config->prfcnt_mode == PRFCNT_MODE_PERIODIC) { + config->period_us = + req_arr[i] + .u.req_mode.mode_config.periodic + .period_us; + + if ((config->period_us != 0) && + (config->period_us < + DUMP_INTERVAL_MIN_US)) { + config->period_us = + DUMP_INTERVAL_MIN_US; + } + } + break; + + case PRFCNT_REQUEST_TYPE_ENABLE: + err = kbasep_kinstr_prfcnt_parse_request_enable( + &req_arr[i].u.req_enable, config); + break; + + default: + err = -EINVAL; + break; + } + + if (err < 0) + break; + } + + /* Verify that properties (like capture mode and block counter set) + * have been defined by the user space client. + */ + if (config->prfcnt_mode == PRFCNT_MODE_RESERVED) + err = -EINVAL; + + if (config->counter_set == KBASE_HWCNT_SET_UNDEFINED) + err = -EINVAL; + + return err; +} + +/** + * kbasep_kinstr_prfcnt_client_create() - Create a kinstr_prfcnt client. + * Does not attach to the kinstr_prfcnt + * context. + * @kinstr_ctx: Non-NULL pointer to kinstr_prfcnt context. + * @setup: Non-NULL pointer to hardware counter ioctl setup structure. + * @out_vcli: Non-NULL pointer to where created client will be stored on + * success. + * + * Return: 0 on success, else error code. 
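[Illustrative note] The setup parser above expects the request array to end with a sentinel item and clamps any non-zero periodic interval up to the minimum supported period. A compact sketch of that walk; the item types and the minimum period constant are placeholders for the uapi definitions:

#include <stdint.h>

#define ITEM_TYPE_NONE 0u    /* sentinel terminating the request list    */
#define ITEM_TYPE_MODE 1u    /* request item carrying the sampling mode  */
#define MIN_PERIOD_US  100u  /* stand-in for the minimum sampling period */

struct request_item {
	uint32_t item_type;
	uint64_t period_us;  /* used only when item_type == ITEM_TYPE_MODE */
};

/* Validate the trailing sentinel and clamp a periodic interval upwards. */
static int parse_requests(const struct request_item *arr, uint32_t count,
			  uint64_t *period_out)
{
	uint32_t i;

	if (count == 0 || arr[count - 1].item_type != ITEM_TYPE_NONE)
		return -1;                /* list must end with a sentinel */

	for (i = 0; i < count - 1; i++) { /* the sentinel itself is skipped */
		if (arr[i].item_type != ITEM_TYPE_MODE)
			continue;
		*period_out = arr[i].period_us;
		if (*period_out != 0 && *period_out < MIN_PERIOD_US)
			*period_out = MIN_PERIOD_US; /* clamp too-fast dumps */
	}
	return 0;
}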
+ */ +static int kbasep_kinstr_prfcnt_client_create( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + union kbase_ioctl_kinstr_prfcnt_setup *setup, + struct kbase_kinstr_prfcnt_client **out_vcli) +{ + int err; + struct kbase_kinstr_prfcnt_client *cli; + struct kbase_hwcnt_physical_enable_map phys_em; + + WARN_ON(!kinstr_ctx); + WARN_ON(!setup); + + cli = kzalloc(sizeof(*cli), GFP_KERNEL); + + if (!cli) + return -ENOMEM; + + cli->kinstr_ctx = kinstr_ctx; + err = kbasep_kinstr_prfcnt_parse_setup(kinstr_ctx, setup, &cli->config); + + if (err < 0) + goto error; + + cli->config.buffer_count = MAX_BUFFER_COUNT; + cli->dump_interval_ns = cli->config.period_us * NSEC_PER_USEC; + cli->next_dump_time_ns = 0; + err = kbase_hwcnt_enable_map_alloc(kinstr_ctx->metadata, + &cli->enable_map); + + if (err < 0) + goto error; + + phys_em.fe_bm = 0; + phys_em.shader_bm = 0; + phys_em.tiler_bm = 0; + phys_em.mmu_l2_bm = 0; + + kbase_hwcnt_gpu_enable_map_from_physical(&cli->enable_map, &phys_em); + + cli->sample_count = cli->config.buffer_count; + cli->sample_size = kbasep_kinstr_prfcnt_get_sample_size( + kinstr_ctx->metadata, &cli->tmp_buf); + + /* Use virtualizer's metadata to alloc tmp buffer which interacts with + * the HWC virtualizer. + */ + err = kbase_hwcnt_dump_buffer_alloc(kinstr_ctx->metadata, + &cli->tmp_buf); + + if (err < 0) + goto error; + + /* Enable all the available clk_enable_map. */ + cli->enable_map.clk_enable_map = + (1ull << kinstr_ctx->metadata->clk_cnt) - 1; + + /* Use metadata from virtualizer to allocate dump buffers if + * kinstr_prfcnt doesn't have the truncated metadata. + */ + err = kbasep_kinstr_prfcnt_sample_array_alloc(kinstr_ctx->metadata, + cli->config.buffer_count, + &cli->sample_arr); + + if (err < 0) + goto error; + + err = -ENOMEM; + + cli->dump_bufs_meta = + kmalloc_array(cli->config.buffer_count, + sizeof(*cli->dump_bufs_meta), GFP_KERNEL); + + if (!cli->dump_bufs_meta) + goto error; + + err = kbase_hwcnt_virtualizer_client_create( + kinstr_ctx->hvirt, &cli->enable_map, &cli->hvcli); + + if (err < 0) + goto error; + + init_waitqueue_head(&cli->waitq); + *out_vcli = cli; + + return 0; + +error: + kbasep_kinstr_prfcnt_client_destroy(cli); + return err; +} + +static size_t kbasep_kinstr_prfcnt_get_block_info_count( + const struct kbase_hwcnt_metadata *metadata) +{ + size_t grp; + size_t block_info_count = 0; + + if (!metadata) + return 0; + + for (grp = 0; grp < kbase_hwcnt_metadata_group_count(metadata); grp++) { + block_info_count += + kbase_hwcnt_metadata_block_count(metadata, grp); + } + + return block_info_count; +} + +static void kbasep_kinstr_prfcnt_get_request_info_list( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + struct prfcnt_enum_item *item_arr, size_t *arr_idx) +{ + memcpy(&item_arr[*arr_idx], kinstr_prfcnt_supported_requests, + sizeof(kinstr_prfcnt_supported_requests)); + *arr_idx += ARRAY_SIZE(kinstr_prfcnt_supported_requests); +} + +static enum prfcnt_block_type +kbase_hwcnt_metadata_block_type_to_prfcnt_block_type(u64 type) +{ + enum prfcnt_block_type block_type; + + switch (type) { + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3: + block_type = PRFCNT_BLOCK_TYPE_FE; + break; + + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER: + block_type = PRFCNT_BLOCK_TYPE_TILER; + break; + + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3: + block_type = 
PRFCNT_BLOCK_TYPE_SHADER_CORE; + break; + + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2: + block_type = PRFCNT_BLOCK_TYPE_MEMORY; + break; + + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_UNDEFINED: + default: + block_type = PRFCNT_BLOCK_TYPE_RESERVED; + break; + } + + return block_type; +} + +static int kbasep_kinstr_prfcnt_get_block_info_list( + const struct kbase_hwcnt_metadata *metadata, size_t block_set, + struct prfcnt_enum_item *item_arr, size_t *arr_idx) +{ + size_t grp; + size_t blk; + + if (!metadata || !item_arr || !arr_idx) + return -EINVAL; + + for (grp = 0; grp < kbase_hwcnt_metadata_group_count(metadata); grp++) { + for (blk = 0; + blk < kbase_hwcnt_metadata_block_count(metadata, grp); + blk++, (*arr_idx)++) { + item_arr[*arr_idx].hdr.item_type = + PRFCNT_ENUM_TYPE_BLOCK; + item_arr[*arr_idx].hdr.item_version = + PRFCNT_READER_API_VERSION; + item_arr[*arr_idx].u.block_counter.set = block_set; + + item_arr[*arr_idx].u.block_counter.block_type = + kbase_hwcnt_metadata_block_type_to_prfcnt_block_type( + kbase_hwcnt_metadata_block_type( + metadata, grp, blk)); + item_arr[*arr_idx].u.block_counter.num_instances = + kbase_hwcnt_metadata_block_instance_count( + metadata, grp, blk); + item_arr[*arr_idx].u.block_counter.num_values = + kbase_hwcnt_metadata_block_values_count( + metadata, grp, blk); + + /* The bitmask of available counters should be dynamic. + * Temporarily, it is set to U64_MAX, waiting for the + * required functionality to be available in the future. + */ + item_arr[*arr_idx].u.block_counter.counter_mask[0] = + U64_MAX; + item_arr[*arr_idx].u.block_counter.counter_mask[1] = + U64_MAX; + } + } + + return 0; +} + +static int kbasep_kinstr_prfcnt_enum_info_count( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + struct kbase_ioctl_kinstr_prfcnt_enum_info *enum_info) +{ + int err = 0; + uint32_t count = 0; + size_t block_info_count = 0; + const struct kbase_hwcnt_metadata *metadata; + + count = ARRAY_SIZE(kinstr_prfcnt_supported_requests); + metadata = kbase_hwcnt_virtualizer_metadata(kinstr_ctx->hvirt); + block_info_count = kbasep_kinstr_prfcnt_get_block_info_count(metadata); + count += block_info_count; + + /* Reserve one for the last sentinel item. 
*/ + count++; + enum_info->info_item_count = count; + enum_info->info_item_size = sizeof(struct prfcnt_enum_item); + kinstr_ctx->info_item_count = count; + + return err; +} + +static int kbasep_kinstr_prfcnt_enum_info_list( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + struct kbase_ioctl_kinstr_prfcnt_enum_info *enum_info) +{ + struct prfcnt_enum_item *prfcnt_item_arr; + size_t arr_idx = 0; + int err = 0; + size_t block_info_count = 0; + const struct kbase_hwcnt_metadata *metadata; + + if ((enum_info->info_item_size == 0) || + (enum_info->info_item_count == 0) || !enum_info->info_list_ptr) + return -EINVAL; + + if (enum_info->info_item_count != kinstr_ctx->info_item_count) + return -EINVAL; + + prfcnt_item_arr = + (struct prfcnt_enum_item *)(uintptr_t)enum_info->info_list_ptr; + kbasep_kinstr_prfcnt_get_request_info_list(kinstr_ctx, prfcnt_item_arr, + &arr_idx); + metadata = kbase_hwcnt_virtualizer_metadata(kinstr_ctx->hvirt); + block_info_count = kbasep_kinstr_prfcnt_get_block_info_count(metadata); + + if (arr_idx + block_info_count >= enum_info->info_item_count) + err = -EINVAL; + + if (!err) { + size_t counter_set; + +#if defined(CONFIG_MALI_PRFCNT_SET_SECONDARY) + counter_set = KBASE_HWCNT_SET_SECONDARY; +#elif defined(CONFIG_MALI_PRFCNT_SET_TERTIARY) + counter_set = KBASE_HWCNT_SET_TERTIARY; +#else + /* Default to primary */ + counter_set = KBASE_HWCNT_SET_PRIMARY; +#endif + kbasep_kinstr_prfcnt_get_block_info_list( + metadata, counter_set, prfcnt_item_arr, &arr_idx); + if (arr_idx != enum_info->info_item_count - 1) + err = -EINVAL; + } + + /* The last sentinel item. */ + prfcnt_item_arr[enum_info->info_item_count - 1].hdr.item_type = + FLEX_LIST_TYPE_NONE; + prfcnt_item_arr[enum_info->info_item_count - 1].hdr.item_version = 0; + + return err; +} + +int kbase_kinstr_prfcnt_enum_info( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + struct kbase_ioctl_kinstr_prfcnt_enum_info *enum_info) +{ + int err; + + if (!kinstr_ctx || !enum_info) + return -EINVAL; + + if (!enum_info->info_list_ptr) + err = kbasep_kinstr_prfcnt_enum_info_count(kinstr_ctx, + enum_info); + else + err = kbasep_kinstr_prfcnt_enum_info_list(kinstr_ctx, + enum_info); + + return err; +} + +int kbase_kinstr_prfcnt_setup(struct kbase_kinstr_prfcnt_context *kinstr_ctx, + union kbase_ioctl_kinstr_prfcnt_setup *setup) +{ + int err; + struct kbase_kinstr_prfcnt_client *cli = NULL; + + if (!kinstr_ctx || !setup) + return -EINVAL; + + err = kbasep_kinstr_prfcnt_client_create(kinstr_ctx, setup, &cli); + + if (err < 0) + goto error; + + mutex_lock(&kinstr_ctx->lock); + kinstr_ctx->client_count++; + list_add(&cli->node, &kinstr_ctx->clients); + mutex_unlock(&kinstr_ctx->lock); + + setup->out.prfcnt_metadata_item_size = sizeof(struct prfcnt_metadata); + setup->out.prfcnt_mmap_size_bytes = + cli->sample_size * cli->sample_count; + + /* Expose to user-space only once the client is fully initialized */ + err = anon_inode_getfd("[mali_kinstr_prfcnt_desc]", + &kinstr_prfcnt_client_fops, cli, + O_RDONLY | O_CLOEXEC); + + if (err < 0) + goto client_installed_error; + + return err; + +client_installed_error: + mutex_lock(&kinstr_ctx->lock); + kinstr_ctx->client_count--; + list_del(&cli->node); + mutex_unlock(&kinstr_ctx->lock); +error: + kbasep_kinstr_prfcnt_client_destroy(cli); + return err; +} diff --git a/mali_kbase/mali_kbase_kinstr_prfcnt.h b/mali_kbase/mali_kbase_kinstr_prfcnt.h new file mode 100644 index 0000000..83d76be --- /dev/null +++ b/mali_kbase/mali_kbase_kinstr_prfcnt.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: 
GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * Kinstr_prfcnt, used to provide an ioctl for userspace access to + * performance counters. + */ +#ifndef _KBASE_KINSTR_PRFCNT_H_ +#define _KBASE_KINSTR_PRFCNT_H_ + +struct kbase_kinstr_prfcnt_context; +struct kbase_hwcnt_virtualizer; +struct kbase_ioctl_hwcnt_reader_setup; +struct kbase_ioctl_kinstr_prfcnt_enum_info; +union kbase_ioctl_kinstr_prfcnt_setup; + +/** + * kbase_kinstr_prfcnt_init() - Initialize a kinstr_prfcnt context. + * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * @out_kinstr_ctx: Non-NULL pointer to where the pointer to the created + * kinstr_prfcnt context will be stored on success. + * + * On creation, the suspend count of the context will be 0. + * + * Return: 0 on success, else error code. + */ +int kbase_kinstr_prfcnt_init( + struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_kinstr_prfcnt_context **out_kinstr_ctx); + +/** + * kbase_kinstr_prfcnt_term() - Terminate a kinstr_prfcnt context. + * @kinstr_ctx: Pointer to the kinstr_prfcnt context to be terminated. + */ +void kbase_kinstr_prfcnt_term(struct kbase_kinstr_prfcnt_context *kinstr_ctx); + +/** + * kbase_kinstr_prfcnt_suspend() - Increment the suspend count of the context. + * @kinstr_ctx: Non-NULL pointer to the kinstr_prfcnt context to be suspended. + * + * After this function call returns, it is guaranteed that all timers and + * workers in kinstr_prfcnt will be canceled, and will not be re-triggered until + * after the context has been resumed. In effect, this means no new counter + * dumps will occur for any existing or subsequently added periodic clients. + */ +void kbase_kinstr_prfcnt_suspend(struct kbase_kinstr_prfcnt_context *kinstr_ctx); + +/** + * kbase_kinstr_prfcnt_resume() - Decrement the suspend count of the context. + * @kinstr_ctx: Non-NULL pointer to the kinstr_prfcnt context to be resumed. + * + * If a call to this function decrements the suspend count from 1 to 0, then + * normal operation of kinstr_prfcnt will be resumed (i.e. counter dumps will once + * again be automatically triggered for all periodic clients). + * + * It is only valid to call this function one time for each prior returned call + * to kbase_kinstr_prfcnt_suspend. + */ +void kbase_kinstr_prfcnt_resume(struct kbase_kinstr_prfcnt_context *kinstr_ctx); + +/** + * kbase_kinstr_prfcnt_enum_info - Enumerate performance counter information. + * @kinstr_ctx: Non-NULL pointer to the kinstr_prfcnt context. + * @enum_info: Non-NULL pointer to the enumeration information. + * + * Enumerate which counter blocks and banks exist, and what counters are + * available within them. + * + * Return: 0 on success, else error code. 
+ */ +int kbase_kinstr_prfcnt_enum_info( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + struct kbase_ioctl_kinstr_prfcnt_enum_info *enum_info); + +/** + * kbase_kinstr_prfcnt_setup() - Set up a new hardware counter reader client. + * @kinstr_ctx: Non-NULL pointer to the kinstr_prfcnt context. + * @setup: Non-NULL pointer to the hwcnt reader configuration. + * + * Start a session between a user client and the kinstr_prfcnt component. + * A file descriptor shall be provided to the client as a handle to the + * hardware counter reader client that represents the session. + * + * Return: file descriptor on success, else error code. + */ +int kbase_kinstr_prfcnt_setup(struct kbase_kinstr_prfcnt_context *kinstr_ctx, + union kbase_ioctl_kinstr_prfcnt_setup *setup); + +#endif /* _KBASE_KINSTR_PRFCNT_H_ */ diff --git a/mali_kbase/mali_kbase_mem.c b/mali_kbase/mali_kbase_mem.c index a68e4ea..320ffef 100644 --- a/mali_kbase/mali_kbase_mem.c +++ b/mali_kbase/mali_kbase_mem.c @@ -351,6 +351,7 @@ static struct kbase_va_region *kbase_region_tracker_find_region_meeting_reqs( /** * Remove a region object from the global list. + * @kbdev: The kbase device * @reg: Region object to remove * * The region reg is removed, possibly by merging with other free and @@ -358,7 +359,8 @@ static struct kbase_va_region *kbase_region_tracker_find_region_meeting_reqs( * region lock held. The associated memory is not released (see * kbase_free_alloced_region). Internal use only. */ -int kbase_remove_va_region(struct kbase_va_region *reg) +void kbase_remove_va_region(struct kbase_device *kbdev, + struct kbase_va_region *reg) { struct rb_node *rbprev; struct kbase_va_region *prev = NULL; @@ -368,20 +370,26 @@ int kbase_remove_va_region(struct kbase_va_region *reg) int merged_front = 0; int merged_back = 0; - int err = 0; reg_rbtree = reg->rbtree; + if (WARN_ON(RB_EMPTY_ROOT(reg_rbtree))) + return; + /* Try to merge with the previous block first */ rbprev = rb_prev(&(reg->rblink)); if (rbprev) { prev = rb_entry(rbprev, struct kbase_va_region, rblink); if (prev->flags & KBASE_REG_FREE) { /* We're compatible with the previous VMA, merge with - * it + * it, handling any gaps for robustness. */ + u64 prev_end_pfn = prev->start_pfn + prev->nr_pages; + WARN_ON((prev->flags & KBASE_REG_ZONE_MASK) != (reg->flags & KBASE_REG_ZONE_MASK)); + if (!WARN_ON(reg->start_pfn < prev_end_pfn)) + prev->nr_pages += reg->start_pfn - prev_end_pfn; prev->nr_pages += reg->nr_pages; rb_erase(&(reg->rblink), reg_rbtree); reg = prev; @@ -393,11 +401,17 @@ int kbase_remove_va_region(struct kbase_va_region *reg) /* Note we do the lookup here as the tree may have been rebalanced. */ rbnext = rb_next(&(reg->rblink)); if (rbnext) { - /* We're compatible with the next VMA, merge with it */ next = rb_entry(rbnext, struct kbase_va_region, rblink); if (next->flags & KBASE_REG_FREE) { + /* We're compatible with the next VMA, merge with it, + * handling any gaps for robustness. + */ + u64 reg_end_pfn = reg->start_pfn + reg->nr_pages; + WARN_ON((next->flags & KBASE_REG_ZONE_MASK) != (reg->flags & KBASE_REG_ZONE_MASK)); + if (!WARN_ON(next->start_pfn < reg_end_pfn)) + next->nr_pages += next->start_pfn - reg_end_pfn; next->start_pfn = reg->start_pfn; next->nr_pages += reg->nr_pages; rb_erase(&(reg->rblink), reg_rbtree); @@ -412,8 +426,8 @@ int kbase_remove_va_region(struct kbase_va_region *reg) /* If we failed to merge then we need to add a new block */ if (!(merged_front || merged_back)) { /* - * We didn't merge anything. 
Add a new free - * placeholder and remove the original one. + * We didn't merge anything. Try to add a new free + * placeholder, and in any case, remove the original one. */ struct kbase_va_region *free_reg; @@ -421,14 +435,37 @@ int kbase_remove_va_region(struct kbase_va_region *reg) reg->start_pfn, reg->nr_pages, reg->flags & KBASE_REG_ZONE_MASK); if (!free_reg) { - err = -ENOMEM; + /* In case of failure, we cannot allocate a replacement + * free region, so we will be left with a 'gap' in the + * region tracker's address range (though, the rbtree + * will itself still be correct after erasing + * 'reg'). + * + * The gap will be rectified when an adjacent region is + * removed by one of the above merging paths. Other + * paths will gracefully fail to allocate if they try + * to allocate in the gap. + * + * There is nothing that the caller can do, since free + * paths must not fail. The existing 'reg' cannot be + * repurposed as the free region as callers must have + * freedom of use with it by virtue of it being owned + * by them, not the region tracker insert/remove code. + */ + dev_warn( + kbdev->dev, + "Could not alloc a replacement free region for 0x%.16llx..0x%.16llx", + (unsigned long long)reg->start_pfn << PAGE_SHIFT, + (unsigned long long)(reg->start_pfn + reg->nr_pages) << PAGE_SHIFT); + rb_erase(&(reg->rblink), reg_rbtree); + goto out; } rb_replace_node(&(reg->rblink), &(free_reg->rblink), reg_rbtree); } - out: - return err; +out: + return; } KBASE_EXPORT_TEST_API(kbase_remove_va_region); @@ -456,6 +493,9 @@ static int kbase_insert_va_region_nolock(struct kbase_va_region *new_reg, KBASE_DEBUG_ASSERT((start_pfn >= at_reg->start_pfn) && (start_pfn < at_reg->start_pfn + at_reg->nr_pages)); /* at least nr_pages from start_pfn should be contained within at_reg */ KBASE_DEBUG_ASSERT(start_pfn + nr_pages <= at_reg->start_pfn + at_reg->nr_pages); + /* having at_reg means the rb_tree should not be empty */ + if (WARN_ON(RB_EMPTY_ROOT(reg_rbtree))) + return -ENOMEM; new_reg->start_pfn = start_pfn; new_reg->nr_pages = nr_pages; @@ -862,6 +902,8 @@ static bool kbase_region_tracker_has_allocs(struct kbase_context *kctx) unsigned long zone_bits = KBASE_REG_ZONE(zone_idx); unsigned long reg_zone; + if (!kbase_is_ctx_reg_zone(zone_bits)) + continue; zone = kbase_ctx_reg_zone_get(kctx, zone_bits); zone_base_addr = zone->base_pfn << PAGE_SHIFT; @@ -1457,7 +1499,9 @@ void kbase_free_alloced_region(struct kbase_va_region *reg) KBASE_EXPORT_TEST_API(kbase_free_alloced_region); -int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 addr, size_t nr_pages, size_t align) +int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, + u64 addr, size_t nr_pages, size_t align, + enum kbase_caller_mmu_sync_info mmu_sync_info) { int err; size_t i = 0; @@ -1494,14 +1538,16 @@ int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 KBASE_DEBUG_ASSERT(alloc->imported.alias.aliased); for (i = 0; i < alloc->imported.alias.nents; i++) { if (alloc->imported.alias.aliased[i].alloc) { - err = kbase_mmu_insert_pages(kctx->kbdev, - &kctx->mmu, - reg->start_pfn + (i * stride), - alloc->imported.alias.aliased[i].alloc->pages + alloc->imported.alias.aliased[i].offset, - alloc->imported.alias.aliased[i].length, - reg->flags & gwt_mask, - kctx->as_nr, - group_id); + err = kbase_mmu_insert_pages( + kctx->kbdev, &kctx->mmu, + reg->start_pfn + (i * stride), + alloc->imported.alias.aliased[i] + .alloc->pages + + alloc->imported.alias.aliased[i] + .offset, + 
alloc->imported.alias.aliased[i].length, + reg->flags & gwt_mask, kctx->as_nr, + group_id, mmu_sync_info); if (err) goto bad_insert; @@ -1509,26 +1555,24 @@ int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 * creation time */ } else { - err = kbase_mmu_insert_single_page(kctx, - reg->start_pfn + i * stride, + err = kbase_mmu_insert_single_page( + kctx, reg->start_pfn + i * stride, kctx->aliasing_sink_page, alloc->imported.alias.aliased[i].length, (reg->flags & mask & gwt_mask) | attr, - group_id); + group_id, mmu_sync_info); if (err) goto bad_insert; } } } else { - err = kbase_mmu_insert_pages(kctx->kbdev, - &kctx->mmu, - reg->start_pfn, - kbase_get_gpu_phy_pages(reg), - kbase_reg_current_backed_size(reg), - reg->flags & gwt_mask, - kctx->as_nr, - group_id); + err = kbase_mmu_insert_pages(kctx->kbdev, &kctx->mmu, + reg->start_pfn, + kbase_get_gpu_phy_pages(reg), + kbase_reg_current_backed_size(reg), + reg->flags & gwt_mask, kctx->as_nr, + group_id, mmu_sync_info); if (err) goto bad_insert; kbase_mem_phy_alloc_gpu_mapped(alloc); @@ -1548,13 +1592,12 @@ int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 * Assume reg->gpu_alloc->nents is the number of actual pages * in the dma-buf memory. */ - err = kbase_mmu_insert_single_page(kctx, - reg->start_pfn + reg->gpu_alloc->nents, - kctx->aliasing_sink_page, - reg->nr_pages - reg->gpu_alloc->nents, - (reg->flags | KBASE_REG_GPU_RD) & - ~KBASE_REG_GPU_WR, - KBASE_MEM_GROUP_SINK); + err = kbase_mmu_insert_single_page( + kctx, reg->start_pfn + reg->gpu_alloc->nents, + kctx->aliasing_sink_page, + reg->nr_pages - reg->gpu_alloc->nents, + (reg->flags | KBASE_REG_GPU_RD) & ~KBASE_REG_GPU_WR, + KBASE_MEM_GROUP_SINK, mmu_sync_info); if (err) goto bad_insert; } @@ -1566,7 +1609,7 @@ bad_insert: reg->start_pfn, reg->nr_pages, kctx->as_nr); - kbase_remove_va_region(reg); + kbase_remove_va_region(kctx->kbdev, reg); return err; } @@ -1588,7 +1631,28 @@ int kbase_gpu_munmap(struct kbase_context *kctx, struct kbase_va_region *reg) /* Tear down GPU page tables, depending on memory type. */ switch (reg->gpu_alloc->type) { - case KBASE_MEM_TYPE_ALIAS: /* Fall-through */ + case KBASE_MEM_TYPE_ALIAS: { + size_t i = 0; + struct kbase_mem_phy_alloc *alloc = reg->gpu_alloc; + + /* Due to the way the number of valid PTEs and ATEs are tracked + * currently, only the GPU virtual range that is backed & mapped + * should be passed to the kbase_mmu_teardown_pages() function, + * hence individual aliased regions need to be unmapped + * separately. 
+ */ + for (i = 0; i < alloc->imported.alias.nents; i++) { + if (alloc->imported.alias.aliased[i].alloc) { + err = kbase_mmu_teardown_pages( + kctx->kbdev, &kctx->mmu, + reg->start_pfn + + (i * + alloc->imported.alias.stride), + alloc->imported.alias.aliased[i].length, + kctx->as_nr); + } + } + } break; case KBASE_MEM_TYPE_IMPORTED_UMM: err = kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn, reg->nr_pages, kctx->as_nr); @@ -1622,7 +1686,7 @@ int kbase_gpu_munmap(struct kbase_context *kctx, struct kbase_va_region *reg) } } } - /* Fall-through */ + fallthrough; default: kbase_mem_phy_alloc_gpu_unmapped(reg->gpu_alloc); break; @@ -3698,7 +3762,8 @@ static size_t kbase_mem_jit_trim_pages(struct kbase_context *kctx, static int kbase_jit_grow(struct kbase_context *kctx, const struct base_jit_alloc_info *info, struct kbase_va_region *reg, - struct kbase_sub_alloc **prealloc_sas) + struct kbase_sub_alloc **prealloc_sas, + enum kbase_caller_mmu_sync_info mmu_sync_info) { size_t delta; size_t pages_required; @@ -3795,7 +3860,7 @@ static int kbase_jit_grow(struct kbase_context *kctx, spin_unlock(&kctx->mem_partials_lock); ret = kbase_mem_grow_gpu_mapping(kctx, reg, info->commit_pages, - old_size); + old_size, mmu_sync_info); /* * The grow failed so put the allocation back in the * pool and return failure. @@ -4010,6 +4075,11 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, struct kbase_sub_alloc *prealloc_sas[2] = { NULL, NULL }; int i; + /* Calls to this function are inherently synchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; + #if MALI_USE_CSF lockdep_assert_held(&kctx->csf.kcpu_queues.lock); #else @@ -4102,7 +4172,8 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, * so any state protected by that lock might need to be * re-evaluated if more code is added here in future. */ - ret = kbase_jit_grow(kctx, info, reg, prealloc_sas); + ret = kbase_jit_grow(kctx, info, reg, prealloc_sas, + mmu_sync_info); #if MALI_JIT_PRESSURE_LIMIT_BASE if (!ignore_pressure_limit) @@ -4150,7 +4221,7 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, flags |= BASE_MEM_TILER_ALIGN_TOP; #endif /* !MALI_USE_CSF */ - flags |= base_mem_group_id_set(kctx->jit_group_id); + flags |= kbase_mem_group_id_set(kctx->jit_group_id); #if MALI_JIT_PRESSURE_LIMIT_BASE if (!ignore_pressure_limit) { flags |= BASEP_MEM_PERFORM_JIT_TRIM; @@ -4166,7 +4237,8 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, kbase_gpu_vm_unlock(kctx); reg = kbase_mem_alloc(kctx, info->va_pages, info->commit_pages, - info->extension, &flags, &gpu_addr); + info->extension, &flags, &gpu_addr, + mmu_sync_info); if (!reg) { /* Most likely not enough GPU virtual space left for * the new JIT allocation. 
@@ -4455,6 +4527,15 @@ void kbase_jit_report_update_pressure(struct kbase_context *kctx, } #endif /* MALI_JIT_PRESSURE_LIMIT_BASE */ +void kbase_unpin_user_buf_page(struct page *page) +{ +#if KERNEL_VERSION(5, 9, 0) > LINUX_VERSION_CODE + put_page(page); +#else + unpin_user_page(page); +#endif +} + #if MALI_USE_CSF static void kbase_jd_user_buf_unpin_pages(struct kbase_mem_phy_alloc *alloc) { @@ -4465,7 +4546,7 @@ static void kbase_jd_user_buf_unpin_pages(struct kbase_mem_phy_alloc *alloc) WARN_ON(alloc->nents != alloc->imported.user_buf.nr_pages); for (i = 0; i < alloc->nents; i++) - put_page(pages[i]); + kbase_unpin_user_buf_page(pages[i]); } } #endif @@ -4524,11 +4605,10 @@ KERNEL_VERSION(4, 5, 0) > LINUX_VERSION_CODE reg->flags & KBASE_REG_GPU_WR ? FOLL_WRITE : 0, pages, NULL, NULL); #else - pinned_pages = get_user_pages_remote(mm, - address, - alloc->imported.user_buf.nr_pages, - reg->flags & KBASE_REG_GPU_WR ? FOLL_WRITE : 0, - pages, NULL, NULL); + pinned_pages = pin_user_pages_remote( + mm, address, alloc->imported.user_buf.nr_pages, + reg->flags & KBASE_REG_GPU_WR ? FOLL_WRITE : 0, pages, NULL, + NULL); #endif if (pinned_pages <= 0) @@ -4536,7 +4616,7 @@ KERNEL_VERSION(4, 5, 0) > LINUX_VERSION_CODE if (pinned_pages != alloc->imported.user_buf.nr_pages) { for (i = 0; i < pinned_pages; i++) - put_page(pages[i]); + kbase_unpin_user_buf_page(pages[i]); return -ENOMEM; } @@ -4560,6 +4640,11 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx, unsigned long gwt_mask = ~0; int err = kbase_jd_user_buf_pin_pages(kctx, reg); + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + if (err) return err; @@ -4596,9 +4681,9 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx, #endif err = kbase_mmu_insert_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn, - pa, kbase_reg_current_backed_size(reg), - reg->flags & gwt_mask, kctx->as_nr, - alloc->group_id); + pa, kbase_reg_current_backed_size(reg), + reg->flags & gwt_mask, kctx->as_nr, + alloc->group_id, mmu_sync_info); if (err == 0) return 0; @@ -4612,7 +4697,7 @@ unwind: } while (++i < pinned_pages) { - put_page(pages[i]); + kbase_unpin_user_buf_page(pages[i]); pages[i] = NULL; } @@ -4642,7 +4727,7 @@ static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, if (writeable) set_page_dirty_lock(pages[i]); #if !MALI_USE_CSF - put_page(pages[i]); + kbase_unpin_user_buf_page(pages[i]); pages[i] = NULL; #endif diff --git a/mali_kbase/mali_kbase_mem.h b/mali_kbase/mali_kbase_mem.h index e9ac809..95533f5 100644 --- a/mali_kbase/mali_kbase_mem.h +++ b/mali_kbase/mali_kbase_mem.h @@ -506,6 +506,21 @@ struct kbase_va_region { int va_refcnt; }; +/** + * kbase_is_ctx_reg_zone - determine whether a KBASE_REG_ZONE_<...> is for a + * context or for a device + * @zone_bits: A KBASE_REG_ZONE_<...> to query + * + * Return: True if the zone for @zone_bits is a context zone, False otherwise + */ +static inline bool kbase_is_ctx_reg_zone(unsigned long zone_bits) +{ + WARN_ON((zone_bits & KBASE_REG_ZONE_MASK) != zone_bits); + return (zone_bits == KBASE_REG_ZONE_SAME_VA || + zone_bits == KBASE_REG_ZONE_CUSTOM_VA || + zone_bits == KBASE_REG_ZONE_EXEC_VA); +} + /* Special marker for failed JIT allocations that still must be marked as * in-use */ @@ -529,12 +544,14 @@ static inline bool kbase_is_region_invalid_or_free(struct kbase_va_region *reg) return (kbase_is_region_invalid(reg) || kbase_is_region_free(reg)); } -int 
kbase_remove_va_region(struct kbase_va_region *reg); -static inline void kbase_region_refcnt_free(struct kbase_va_region *reg) +void kbase_remove_va_region(struct kbase_device *kbdev, + struct kbase_va_region *reg); +static inline void kbase_region_refcnt_free(struct kbase_device *kbdev, + struct kbase_va_region *reg) { /* If region was mapped then remove va region*/ if (reg->start_pfn) - kbase_remove_va_region(reg); + kbase_remove_va_region(kbdev, reg); /* To detect use-after-free in debug builds */ KBASE_DEBUG_CODE(reg->flags |= KBASE_REG_FREE); @@ -569,7 +586,7 @@ static inline struct kbase_va_region *kbase_va_region_alloc_put( dev_dbg(kctx->kbdev->dev, "va_refcnt %d after put %pK\n", region->va_refcnt, (void *)region); if (!region->va_refcnt) - kbase_region_refcnt_free(region); + kbase_region_refcnt_free(kctx->kbdev, region); return NULL; } @@ -1167,10 +1184,13 @@ int kbase_alloc_phy_pages(struct kbase_va_region *reg, size_t vsize, size_t size * @addr: the address to insert the region at * @nr_pages: the number of pages in the region * @align: the minimum alignment in pages + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. * * Call kbase_add_va_region() and map the region on the GPU. */ -int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 addr, size_t nr_pages, size_t align); +int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, + u64 addr, size_t nr_pages, size_t align, + enum kbase_caller_mmu_sync_info mmu_sync_info); /** * Remove the region from the GPU and unregister it. @@ -1798,6 +1818,11 @@ struct kbase_mem_phy_alloc *kbase_map_external_resource( void kbase_unmap_external_resource(struct kbase_context *kctx, struct kbase_va_region *reg, struct kbase_mem_phy_alloc *alloc); +/** + * kbase_unpin_user_buf_page - Unpin a page of a user buffer. + * @page: page to unpin + */ +void kbase_unpin_user_buf_page(struct page *page); /** * kbase_jd_user_buf_pin_pages - Pin the pages of a user buffer. 
@@ -2025,7 +2050,7 @@ int kbase_mem_copy_to_pinned_user_pages(struct page **dest_pages, unsigned int *target_page_nr, size_t offset); /** - * kbase_ctx_reg_zone_end_pfn - return the end Page Frame Number of @zone + * kbase_reg_zone_end_pfn - return the end Page Frame Number of @zone * @zone: zone to query * * Return: The end of the zone corresponding to @zone @@ -2050,7 +2075,7 @@ static inline void kbase_ctx_reg_zone_init(struct kbase_context *kctx, struct kbase_reg_zone *zone; lockdep_assert_held(&kctx->reg_lock); - WARN_ON((zone_bits & KBASE_REG_ZONE_MASK) != zone_bits); + WARN_ON(!kbase_is_ctx_reg_zone(zone_bits)); zone = &kctx->reg_zone[KBASE_REG_ZONE_IDX(zone_bits)]; *zone = (struct kbase_reg_zone){ @@ -2073,7 +2098,7 @@ static inline struct kbase_reg_zone * kbase_ctx_reg_zone_get_nolock(struct kbase_context *kctx, unsigned long zone_bits) { - WARN_ON((zone_bits & KBASE_REG_ZONE_MASK) != zone_bits); + WARN_ON(!kbase_is_ctx_reg_zone(zone_bits)); return &kctx->reg_zone[KBASE_REG_ZONE_IDX(zone_bits)]; } @@ -2091,9 +2116,60 @@ static inline struct kbase_reg_zone * kbase_ctx_reg_zone_get(struct kbase_context *kctx, unsigned long zone_bits) { lockdep_assert_held(&kctx->reg_lock); - WARN_ON((zone_bits & KBASE_REG_ZONE_MASK) != zone_bits); + WARN_ON(!kbase_is_ctx_reg_zone(zone_bits)); return &kctx->reg_zone[KBASE_REG_ZONE_IDX(zone_bits)]; } +/** + * kbase_mem_allow_alloc - Check if allocation of GPU memory is allowed + * @kctx: Pointer to kbase context + * + * Don't allow the allocation of GPU memory if user space has not set up the + * tracking page (which sets kctx->process_mm), or if the ioctl has been issued + * from a forked child process using the mali device file fd inherited from + * the parent process. + */ +static inline bool kbase_mem_allow_alloc(struct kbase_context *kctx) +{ + bool allow_alloc = true; + + rcu_read_lock(); + allow_alloc = (rcu_dereference(kctx->process_mm) == current->mm); + rcu_read_unlock(); + + return allow_alloc; +} + +/** + * kbase_mem_group_id_get - Get group ID from flags + * @flags: Flags to pass to base_mem_alloc + * + * This inline function extracts the encoded group ID from flags + * and converts it into a numeric value (0~15). + * + * Return: group ID (0~15) extracted from the parameter + */ +static inline int kbase_mem_group_id_get(base_mem_alloc_flags flags) +{ + KBASE_DEBUG_ASSERT((flags & ~BASE_MEM_FLAGS_INPUT_MASK) == 0); + return (int)BASE_MEM_GROUP_ID_GET(flags); +} + +/** + * kbase_mem_group_id_set - Set group ID into base_mem_alloc_flags + * @id: group ID (0~15) you want to encode + * + * This inline function encodes a specific group ID into base_mem_alloc_flags. + * Parameter 'id' should lie between 0 and 15. + * + * Return: base_mem_alloc_flags with the group ID (id) encoded + * + * The return value can be combined with other flags against base_mem_alloc + * to identify a specific memory group. 
+ */ +static inline base_mem_alloc_flags kbase_mem_group_id_set(int id) +{ + return BASE_MEM_GROUP_ID_SET(id); +} #endif /* _KBASE_MEM_H_ */ diff --git a/mali_kbase/mali_kbase_mem_linux.c b/mali_kbase/mali_kbase_mem_linux.c index 21302c1..527bec4 100644 --- a/mali_kbase/mali_kbase_mem_linux.c +++ b/mali_kbase/mali_kbase_mem_linux.c @@ -291,9 +291,10 @@ void kbase_phy_alloc_mapping_put(struct kbase_context *kctx, */ } -struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, - u64 va_pages, u64 commit_pages, - u64 extension, u64 *flags, u64 *gpu_va) +struct kbase_va_region * +kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages, u64 commit_pages, + u64 extension, u64 *flags, u64 *gpu_va, + enum kbase_caller_mmu_sync_info mmu_sync_info) { int zone; struct kbase_va_region *reg; @@ -387,7 +388,7 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, goto invalid_flags; if (kbase_reg_prepare_native(reg, kctx, - base_mem_group_id_get(*flags)) != 0) { + kbase_mem_group_id_get(*flags)) != 0) { dev_err(dev, "Failed to prepare region"); goto prepare_failed; } @@ -469,7 +470,8 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, *gpu_va = (u64) cookie; } else /* we control the VA */ { - if (kbase_gpu_mmap(kctx, reg, *gpu_va, va_pages, 1) != 0) { + if (kbase_gpu_mmap(kctx, reg, *gpu_va, va_pages, 1, + mmu_sync_info) != 0) { dev_warn(dev, "Failed to map memory on GPU"); kbase_gpu_vm_unlock(kctx); goto no_mmap; @@ -604,7 +606,7 @@ int kbase_mem_query(struct kbase_context *kctx, if (KBASE_REG_GPU_VA_SAME_4GB_PAGE & reg->flags) *out |= BASE_MEM_GPU_VA_SAME_4GB_PAGE; - *out |= base_mem_group_id_set(reg->cpu_alloc->group_id); + *out |= kbase_mem_group_id_set(reg->cpu_alloc->group_id); WARN(*out & ~BASE_MEM_FLAGS_QUERYABLE, "BASE_MEM_FLAGS_QUERYABLE needs updating\n"); @@ -827,6 +829,11 @@ bool kbase_mem_evictable_unmake(struct kbase_mem_phy_alloc *gpu_alloc) struct kbase_context *kctx = gpu_alloc->imported.native.kctx; int err = 0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + lockdep_assert_held(&kctx->reg_lock); mutex_lock(&kctx->jit_evict_lock); @@ -856,9 +863,9 @@ bool kbase_mem_evictable_unmake(struct kbase_mem_phy_alloc *gpu_alloc) * pre-eviction size. */ if (!err) - err = kbase_mem_grow_gpu_mapping(kctx, - gpu_alloc->reg, - gpu_alloc->evicted, 0); + err = kbase_mem_grow_gpu_mapping( + kctx, gpu_alloc->reg, + gpu_alloc->evicted, 0, mmu_sync_info); gpu_alloc->evicted = 0; } @@ -1215,6 +1222,11 @@ int kbase_mem_umm_map(struct kbase_context *kctx, struct kbase_mem_phy_alloc *alloc; unsigned long gwt_mask = ~0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + lockdep_assert_held(&kctx->reg_lock); alloc = reg->gpu_alloc; @@ -1241,14 +1253,11 @@ int kbase_mem_umm_map(struct kbase_context *kctx, gwt_mask = ~KBASE_REG_GPU_WR; #endif - err = kbase_mmu_insert_pages(kctx->kbdev, - &kctx->mmu, - reg->start_pfn, + err = kbase_mmu_insert_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn, kbase_get_gpu_phy_pages(reg), kbase_reg_current_backed_size(reg), - reg->flags & gwt_mask, - kctx->as_nr, - alloc->group_id); + reg->flags & gwt_mask, kctx->as_nr, + alloc->group_id, mmu_sync_info); if (err) goto bad_insert; @@ -1261,13 +1270,11 @@ int kbase_mem_umm_map(struct kbase_context *kctx, * Assume alloc->nents is the number of actual pages in the * dma-buf memory. */ - err = kbase_mmu_insert_single_page(kctx, - reg->start_pfn + alloc->nents, - kctx->aliasing_sink_page, - reg->nr_pages - alloc->nents, - (reg->flags | KBASE_REG_GPU_RD) & - ~KBASE_REG_GPU_WR, - KBASE_MEM_GROUP_SINK); + err = kbase_mmu_insert_single_page( + kctx, reg->start_pfn + alloc->nents, + kctx->aliasing_sink_page, reg->nr_pages - alloc->nents, + (reg->flags | KBASE_REG_GPU_RD) & ~KBASE_REG_GPU_WR, + KBASE_MEM_GROUP_SINK, mmu_sync_info); if (err) goto bad_pad_insert; } @@ -1640,9 +1647,12 @@ KERNEL_VERSION(4, 5, 0) > LINUX_VERSION_CODE #elif KERNEL_VERSION(4, 9, 0) > LINUX_VERSION_CODE faulted_pages = get_user_pages(address, *va_pages, write, 0, pages, NULL); -#else +#elif KERNEL_VERSION(5, 9, 0) > LINUX_VERSION_CODE faulted_pages = get_user_pages(address, *va_pages, write ? FOLL_WRITE : 0, pages, NULL); +#else + faulted_pages = pin_user_pages(address, *va_pages, + write ? FOLL_WRITE : 0, pages, NULL); #endif up_read(kbase_mem_get_process_mmap_lock()); @@ -1694,7 +1704,7 @@ unwind_dma_map: fault_mismatch: if (pages) { for (i = 0; i < faulted_pages; i++) - put_page(pages[i]); + kbase_unpin_user_buf_page(pages[i]); } no_page_array: invalid_flags: @@ -1718,6 +1728,11 @@ u64 kbase_mem_alias(struct kbase_context *kctx, u64 *flags, u64 stride, size_t i; bool coherent; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + KBASE_DEBUG_ASSERT(kctx); KBASE_DEBUG_ASSERT(flags); KBASE_DEBUG_ASSERT(ai); @@ -1891,7 +1906,8 @@ u64 kbase_mem_alias(struct kbase_context *kctx, u64 *flags, u64 stride, #else if (1) { #endif - if (kbase_gpu_mmap(kctx, reg, 0, *num_pages, 1) != 0) { + if (kbase_gpu_mmap(kctx, reg, 0, *num_pages, 1, + mmu_sync_info) != 0) { dev_warn(kctx->kbdev->dev, "Failed to map memory on GPU"); goto no_mmap; } @@ -1936,6 +1952,11 @@ int kbase_mem_import(struct kbase_context *kctx, enum base_mem_import_type type, { struct kbase_va_region *reg; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + KBASE_DEBUG_ASSERT(kctx); KBASE_DEBUG_ASSERT(gpu_va); KBASE_DEBUG_ASSERT(va_pages); @@ -2035,7 +2056,8 @@ int kbase_mem_import(struct kbase_context *kctx, enum base_mem_import_type type, } else if (*flags & KBASE_MEM_IMPORT_HAVE_PAGES) { /* we control the VA, mmap now to the GPU */ - if (kbase_gpu_mmap(kctx, reg, 0, *va_pages, 1) != 0) + if (kbase_gpu_mmap(kctx, reg, 0, *va_pages, 1, mmu_sync_info) != + 0) goto no_gpu_va; /* return real GPU VA */ *gpu_va = reg->start_pfn << PAGE_SHIFT; @@ -2069,8 +2091,9 @@ bad_flags: } int kbase_mem_grow_gpu_mapping(struct kbase_context *kctx, - struct kbase_va_region *reg, - u64 new_pages, u64 old_pages) + struct kbase_va_region *reg, u64 new_pages, + u64 old_pages, + enum kbase_caller_mmu_sync_info mmu_sync_info) { struct tagged_addr *phy_pages; u64 delta = new_pages - old_pages; @@ -2081,8 +2104,10 @@ int kbase_mem_grow_gpu_mapping(struct kbase_context *kctx, /* Map the new pages into the GPU */ phy_pages = kbase_get_gpu_phy_pages(reg); ret = kbase_mmu_insert_pages(kctx->kbdev, &kctx->mmu, - reg->start_pfn + old_pages, phy_pages + old_pages, delta, - reg->flags, kctx->as_nr, reg->gpu_alloc->group_id); + reg->start_pfn + old_pages, + phy_pages + old_pages, delta, reg->flags, + kctx->as_nr, reg->gpu_alloc->group_id, + mmu_sync_info); return ret; } @@ -2136,6 +2161,11 @@ int kbase_mem_commit(struct kbase_context *kctx, u64 gpu_addr, u64 new_pages) struct kbase_va_region *reg; bool read_locked = false; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + KBASE_DEBUG_ASSERT(kctx); KBASE_DEBUG_ASSERT(gpu_addr != 0); @@ -2227,8 +2257,8 @@ int kbase_mem_commit(struct kbase_context *kctx, u64 gpu_addr, u64 new_pages) /* No update required for CPU mappings, that's done on fault. */ /* Update GPU mapping. */ - res = kbase_mem_grow_gpu_mapping(kctx, reg, - new_pages, old_pages); + res = kbase_mem_grow_gpu_mapping(kctx, reg, new_pages, + old_pages, mmu_sync_info); /* On error free the new pages */ if (res) { @@ -2647,6 +2677,11 @@ static int kbasep_reg_mmap(struct kbase_context *kctx, struct kbase_va_region *reg; int err = 0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + *aligned_offset = 0; dev_dbg(kctx->kbdev->dev, "in kbasep_reg_mmap\n"); @@ -2681,7 +2716,7 @@ static int kbasep_reg_mmap(struct kbase_context *kctx, *nr_pages = kbase_reg_current_backed_size(reg); if (kbase_gpu_mmap(kctx, reg, vma->vm_start + *aligned_offset, - reg->nr_pages, 1) != 0) { + reg->nr_pages, 1, mmu_sync_info) != 0) { dev_err(kctx->kbdev->dev, "%s:%d\n", __FILE__, __LINE__); /* Unable to map in GPU space. 
*/ WARN_ON(1); @@ -2747,17 +2782,10 @@ int kbase_context_mmap(struct kbase_context *const kctx, goto out_unlock; } - /* if not the MTP, verify that the MTP has been mapped */ - rcu_read_lock(); - /* catches both when the special page isn't present or - * when we've forked - */ - if (rcu_dereference(kctx->process_mm) != current->mm) { + if (!kbase_mem_allow_alloc(kctx)) { err = -EINVAL; - rcu_read_unlock(); goto out_unlock; } - rcu_read_unlock(); switch (vma->vm_pgoff) { case PFN_DOWN(BASEP_MEM_INVALID_HANDLE): diff --git a/mali_kbase/mali_kbase_mem_linux.h b/mali_kbase/mali_kbase_mem_linux.h index 36159c1..f123d17 100644 --- a/mali_kbase/mali_kbase_mem_linux.h +++ b/mali_kbase/mali_kbase_mem_linux.h @@ -45,12 +45,14 @@ struct kbase_hwc_dma_mapping { * properties for the new allocation. * @gpu_va: Start address of the memory region which was allocated from GPU * virtual address space. + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. * * Return: 0 on success or error code */ -struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, - u64 va_pages, u64 commit_pages, - u64 extension, u64 *flags, u64 *gpu_va); +struct kbase_va_region * +kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages, u64 commit_pages, + u64 extension, u64 *flags, u64 *gpu_va, + enum kbase_caller_mmu_sync_info mmu_sync_info); /** * kbase_mem_query - Query properties of a GPU memory region @@ -169,6 +171,7 @@ void kbase_mem_evictable_deinit(struct kbase_context *kctx); * @reg: The GPU region * @new_pages: The number of pages after the grow * @old_pages: The number of pages before the grow + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. * * Return: 0 on success, -errno on error. * @@ -178,8 +181,9 @@ void kbase_mem_evictable_deinit(struct kbase_context *kctx); * Note: Caller must be holding the region lock. 
*/ int kbase_mem_grow_gpu_mapping(struct kbase_context *kctx, - struct kbase_va_region *reg, - u64 new_pages, u64 old_pages); + struct kbase_va_region *reg, u64 new_pages, + u64 old_pages, + enum kbase_caller_mmu_sync_info mmu_sync_info); /** * kbase_mem_evictable_make - Make a physical allocation eligible for eviction diff --git a/mali_kbase/mali_kbase_mem_profile_debugfs.c b/mali_kbase/mali_kbase_mem_profile_debugfs.c index 201ff51..7e77963 100644 --- a/mali_kbase/mali_kbase_mem_profile_debugfs.c +++ b/mali_kbase/mali_kbase_mem_profile_debugfs.c @@ -84,9 +84,9 @@ int kbasep_mem_profile_debugfs_insert(struct kbase_context *kctx, char *data, if (!kbase_ctx_flag(kctx, KCTX_MEM_PROFILE_INITIALIZED)) { if (IS_ERR_OR_NULL(kctx->kctx_dentry)) { err = -ENOMEM; - } else if (!debugfs_create_file("mem_profile", mode, - kctx->kctx_dentry, kctx, - &kbasep_mem_profile_debugfs_fops)) { + } else if (IS_ERR_OR_NULL(debugfs_create_file("mem_profile", + mode, kctx->kctx_dentry, kctx, + &kbasep_mem_profile_debugfs_fops))) { err = -EAGAIN; } else { kbase_ctx_flag_set(kctx, diff --git a/mali_kbase/mali_kbase_mem_profile_debugfs_buf_size.h b/mali_kbase/mali_kbase_mem_profile_debugfs_buf_size.h index 3184a98..1210ed5 100644 --- a/mali_kbase/mali_kbase_mem_profile_debugfs_buf_size.h +++ b/mali_kbase/mali_kbase_mem_profile_debugfs_buf_size.h @@ -30,8 +30,7 @@ * The size of the buffer to accumulate the histogram report text in * @see @ref CCTXP_HIST_BUF_SIZE_MAX_LENGTH_REPORT */ -#define KBASE_MEM_PROFILE_MAX_BUF_SIZE \ - ((size_t) (64 + ((80 + (56 * 64)) * 53) + 56)) +#define KBASE_MEM_PROFILE_MAX_BUF_SIZE ((size_t)(64 + ((80 + (56 * 64)) * 54) + 56)) #endif /*_KBASE_MEM_PROFILE_DEBUGFS_BUF_SIZE_H_*/ diff --git a/mali_kbase/mali_kbase_pbha.c b/mali_kbase/mali_kbase_pbha.c new file mode 100644 index 0000000..3e58a7b --- /dev/null +++ b/mali_kbase/mali_kbase_pbha.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + */ + +#include "mali_kbase_pbha.h" + +#include <device/mali_kbase_device.h> +#include <mali_kbase.h> +#define DTB_SET_SIZE 2 + +static bool read_setting_valid(unsigned int id, unsigned int read_setting) +{ + switch (id) { + /* Valid ID - fall through all */ + case SYSC_ALLOC_ID_R_OTHER: + case SYSC_ALLOC_ID_R_CSF: + case SYSC_ALLOC_ID_R_MMU: + case SYSC_ALLOC_ID_R_TILER_VERT: + case SYSC_ALLOC_ID_R_TILER_PTR: + case SYSC_ALLOC_ID_R_TILER_INDEX: + case SYSC_ALLOC_ID_R_TILER_OTHER: + case SYSC_ALLOC_ID_R_IC: + case SYSC_ALLOC_ID_R_ATTR: + case SYSC_ALLOC_ID_R_SCM: + case SYSC_ALLOC_ID_R_FSDC: + case SYSC_ALLOC_ID_R_VL: + case SYSC_ALLOC_ID_R_PLR: + case SYSC_ALLOC_ID_R_TEX: + case SYSC_ALLOC_ID_R_LSC: + switch (read_setting) { + /* Valid setting value - fall through all */ + case SYSC_ALLOC_L2_ALLOC: + case SYSC_ALLOC_NEVER_ALLOC: + case SYSC_ALLOC_ALWAYS_ALLOC: + case SYSC_ALLOC_PTL_ALLOC: + case SYSC_ALLOC_L2_PTL_ALLOC: + return true; + default: + return false; + } + default: + return false; + } + + /* Unreachable */ + return false; +} + +static bool write_setting_valid(unsigned int id, unsigned int write_setting) +{ + switch (id) { + /* Valid ID - fall through all */ + case SYSC_ALLOC_ID_W_OTHER: + case SYSC_ALLOC_ID_W_CSF: + case SYSC_ALLOC_ID_W_PCB: + case SYSC_ALLOC_ID_W_TILER_PTR: + case SYSC_ALLOC_ID_W_TILER_VERT_PLIST: + case SYSC_ALLOC_ID_W_TILER_OTHER: + case SYSC_ALLOC_ID_W_L2_EVICT: + case SYSC_ALLOC_ID_W_L2_FLUSH: + case SYSC_ALLOC_ID_W_TIB_COLOR: + case SYSC_ALLOC_ID_W_TIB_COLOR_AFBCH: + case SYSC_ALLOC_ID_W_TIB_COLOR_AFBCB: + case SYSC_ALLOC_ID_W_TIB_CRC: + case SYSC_ALLOC_ID_W_TIB_DS: + case SYSC_ALLOC_ID_W_TIB_DS_AFBCH: + case SYSC_ALLOC_ID_W_TIB_DS_AFBCB: + case SYSC_ALLOC_ID_W_LSC: + switch (write_setting) { + /* Valid setting value - fall through all */ + case SYSC_ALLOC_L2_ALLOC: + case SYSC_ALLOC_NEVER_ALLOC: + case SYSC_ALLOC_ALWAYS_ALLOC: + case SYSC_ALLOC_PTL_ALLOC: + case SYSC_ALLOC_L2_PTL_ALLOC: + return true; + default: + return false; + } + default: + return false; + } + + /* Unreachable */ + return false; +} + +static bool settings_valid(unsigned int id, unsigned int read_setting, + unsigned int write_setting) +{ + bool settings_valid = false; + + if (id < SYSC_ALLOC_COUNT * sizeof(u32)) { + settings_valid = read_setting_valid(id, read_setting) && + write_setting_valid(id, write_setting); + } + + return settings_valid; +} + +bool kbasep_pbha_supported(struct kbase_device *kbdev) +{ + const u32 arch_maj_rev = + ARCH_MAJOR_REV_REG(kbdev->gpu_props.props.raw_props.gpu_id); + + return (arch_maj_rev >= GPU_ID2_ARCH_MAJOR_REV_MAKE(11, 3)); +} + +int kbase_pbha_record_settings(struct kbase_device *kbdev, bool runtime, + unsigned int id, unsigned int read_setting, + unsigned int write_setting) +{ + bool const valid = settings_valid(id, read_setting, write_setting); + + if (valid) { + unsigned int const sysc_alloc_num = id / sizeof(u32); + u32 modified_reg; + if (runtime) { + int i; + + kbase_pm_context_active(kbdev); + /* Ensure host copy of SYSC_ALLOC is up to date */ + for (i = 0; i < SYSC_ALLOC_COUNT; i++) + kbdev->sysc_alloc[i] = kbase_reg_read( + kbdev, GPU_CONTROL_REG(SYSC_ALLOC(i))); + kbase_pm_context_idle(kbdev); + } + + modified_reg = kbdev->sysc_alloc[sysc_alloc_num]; + + switch (id % sizeof(u32)) { + case 0: + modified_reg = SYSC_ALLOC_R_SYSC_ALLOC0_SET( + modified_reg, read_setting); + modified_reg = SYSC_ALLOC_W_SYSC_ALLOC0_SET( + modified_reg, write_setting); + break; + case 1: + modified_reg = SYSC_ALLOC_R_SYSC_ALLOC1_SET( + modified_reg, 
read_setting); + modified_reg = SYSC_ALLOC_W_SYSC_ALLOC1_SET( + modified_reg, write_setting); + break; + case 2: + modified_reg = SYSC_ALLOC_R_SYSC_ALLOC2_SET( + modified_reg, read_setting); + modified_reg = SYSC_ALLOC_W_SYSC_ALLOC2_SET( + modified_reg, write_setting); + break; + case 3: + modified_reg = SYSC_ALLOC_R_SYSC_ALLOC3_SET( + modified_reg, read_setting); + modified_reg = SYSC_ALLOC_W_SYSC_ALLOC3_SET( + modified_reg, write_setting); + break; + } + + kbdev->sysc_alloc[sysc_alloc_num] = modified_reg; + } + + return valid ? 0 : -EINVAL; +} + +void kbase_pbha_write_settings(struct kbase_device *kbdev) +{ + if (kbasep_pbha_supported(kbdev)) { + int i; + for (i = 0; i < SYSC_ALLOC_COUNT; ++i) + kbase_reg_write(kbdev, GPU_CONTROL_REG(SYSC_ALLOC(i)), + kbdev->sysc_alloc[i]); + } +} + +int kbase_pbha_read_dtb(struct kbase_device *kbdev) +{ + u32 dtb_data[SYSC_ALLOC_COUNT * sizeof(u32) * DTB_SET_SIZE]; + const struct device_node *pbha_node; + int sz, i; + bool valid = true; + + if (!kbasep_pbha_supported(kbdev)) + return 0; + + pbha_node = of_get_child_by_name(kbdev->dev->of_node, "pbha"); + if (!pbha_node) + return 0; + + sz = of_property_count_elems_of_size(pbha_node, "int_id_override", + sizeof(u32)); + if (sz <= 0 || (sz % DTB_SET_SIZE != 0)) { + dev_err(kbdev->dev, "Bad DTB format: pbha.int_id_override\n"); + return -EINVAL; + } + if (of_property_read_u32_array(pbha_node, "int_id_override", dtb_data, + sz) != 0) { + dev_err(kbdev->dev, + "Failed to read DTB pbha.int_id_override\n"); + return -EINVAL; + } + + for (i = 0; valid && i < sz; i = i + DTB_SET_SIZE) { + unsigned int rdset = + SYSC_ALLOC_R_SYSC_ALLOC0_GET(dtb_data[i + 1]); + unsigned int wrset = + SYSC_ALLOC_W_SYSC_ALLOC0_GET(dtb_data[i + 1]); + valid = valid && + (kbase_pbha_record_settings(kbdev, false, dtb_data[i], + rdset, wrset) == 0); + if (valid) + dev_info(kbdev->dev, + "pbha.int_id_override 0x%x r0x%x w0x%x\n", + dtb_data[i], rdset, wrset); + } + if (i != sz || (!valid)) { + dev_err(kbdev->dev, + "Failed recording DTB data (pbha.int_id_override)\n"); + return -EINVAL; + } + return 0; +} diff --git a/mali_kbase/mali_kbase_pbha.h b/mali_kbase/mali_kbase_pbha.h new file mode 100644 index 0000000..6861773 --- /dev/null +++ b/mali_kbase/mali_kbase_pbha.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _KBASE_PBHA_H +#define _KBASE_PBHA_H + +#include <mali_kbase.h> + +/** + * kbasep_pbha_supported - check whether PBHA registers are + * available + * + * Should only be used in mali_kbase_pbha* files - thus the + * kbase[p] prefix. 
+ * + * @kbdev: Device pointer + * + * Return: True if pbha is supported, false otherwise + */ +bool kbasep_pbha_supported(struct kbase_device *kbdev); + +/** + * kbase_pbha_record_settings - record PBHA settings to be applied when + * L2 is powered down + * + * @kbdev: Device pointer + * @runtime: true if it's called at runtime and false if it's called on init. + * @id: memory access source ID + * @read_setting: Read setting + * @write_setting: Write setting + * + * Return: 0 on success, otherwise error code. + */ +int kbase_pbha_record_settings(struct kbase_device *kbdev, bool runtime, + unsigned int id, unsigned int read_setting, + unsigned int write_setting); + +/** + * kbase_pbha_write_settings - write recorded PBHA settings to GPU + * registers + * + * Only valid to call this function when L2 is powered down, otherwise + * this will not affect PBHA settings. + * + * @kbdev: Device pointer + */ +void kbase_pbha_write_settings(struct kbase_device *kbdev); + +/** + * kbase_pbha_read_dtb - read PBHA settings from DTB and record it to be + * applied when L2 is powered down + * + * @kbdev: Device pointer + * + * Return: 0 on success, otherwise error code. + */ +int kbase_pbha_read_dtb(struct kbase_device *kbdev); + +#endif /* _KBASE_PBHA_H */ diff --git a/mali_kbase/mali_kbase_pbha_debugfs.c b/mali_kbase/mali_kbase_pbha_debugfs.c new file mode 100644 index 0000000..47eab63 --- /dev/null +++ b/mali_kbase/mali_kbase_pbha_debugfs.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + */ + +#include "mali_kbase_pbha_debugfs.h" + +#include "mali_kbase_pbha.h" + +#include <device/mali_kbase_device.h> +#include <mali_kbase_reset_gpu.h> +#include <mali_kbase.h> + +static int int_id_overrides_show(struct seq_file *sfile, void *data) +{ + struct kbase_device *kbdev = sfile->private; + int i; + + kbase_pm_context_active(kbdev); + + /* Minimal header for readability */ + seq_puts(sfile, "// R W\n"); + for (i = 0; i < SYSC_ALLOC_COUNT; ++i) { + int j; + u32 reg = kbase_reg_read(kbdev, GPU_CONTROL_REG(SYSC_ALLOC(i))); + + for (j = 0; j < sizeof(u32); ++j) { + u8 r_val; + u8 w_val; + + switch (j) { + case 0: + r_val = SYSC_ALLOC_R_SYSC_ALLOC0_GET(reg); + w_val = SYSC_ALLOC_W_SYSC_ALLOC0_GET(reg); + break; + case 1: + r_val = SYSC_ALLOC_R_SYSC_ALLOC1_GET(reg); + w_val = SYSC_ALLOC_W_SYSC_ALLOC1_GET(reg); + break; + case 2: + r_val = SYSC_ALLOC_R_SYSC_ALLOC2_GET(reg); + w_val = SYSC_ALLOC_W_SYSC_ALLOC2_GET(reg); + break; + case 3: + r_val = SYSC_ALLOC_R_SYSC_ALLOC3_GET(reg); + w_val = SYSC_ALLOC_W_SYSC_ALLOC3_GET(reg); + break; + } + seq_printf(sfile, "%2zu 0x%x 0x%x\n", + (i * sizeof(u32)) + j, r_val, w_val); + } + } + kbase_pm_context_idle(kbdev); + + return 0; +} + +static ssize_t int_id_overrides_write(struct file *file, + const char __user *ubuf, size_t count, + loff_t *ppos) +{ + struct seq_file *sfile = file->private_data; + struct kbase_device *kbdev = sfile->private; + char raw_str[128]; + unsigned int id; + unsigned int r_val; + unsigned int w_val; + + if (count >= sizeof(raw_str)) + return -E2BIG; + if (copy_from_user(raw_str, ubuf, count)) + return -EINVAL; + raw_str[count] = '\0'; + + if (sscanf(raw_str, "%u %x %x", &id, &r_val, &w_val) != 3) + return -EINVAL; + + if (kbase_pbha_record_settings(kbdev, true, id, r_val, w_val)) + return -EINVAL; + + /* This is a debugfs config write, so reset GPU such that changes take effect ASAP */ + kbase_pm_context_active(kbdev); + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) + kbase_reset_gpu(kbdev); + kbase_pm_context_idle(kbdev); + + return count; +} + +static int int_id_overrides_open(struct inode *in, struct file *file) +{ + return single_open(file, int_id_overrides_show, in->i_private); +} + +static const struct file_operations pbha_int_id_overrides_fops = { + .owner = THIS_MODULE, + .open = int_id_overrides_open, + .read = seq_read, + .write = int_id_overrides_write, + .llseek = seq_lseek, + .release = single_release, +}; + +void kbase_pbha_debugfs_init(struct kbase_device *kbdev) +{ + if (kbasep_pbha_supported(kbdev)) { +#if (KERNEL_VERSION(4, 7, 0) <= LINUX_VERSION_CODE) + /* only for newer kernel version debug file system is safe */ + const mode_t mode = 0644; +#else + const mode_t mode = 0600; +#endif + struct dentry *debugfs_pbha_dir = debugfs_create_dir( + "pbha", kbdev->mali_debugfs_directory); + if (IS_ERR_OR_NULL(debugfs_pbha_dir)) { + dev_err(kbdev->dev, + "Couldn't create mali debugfs page-based hardware attributes directory\n"); + return; + } + + debugfs_create_file("int_id_overrides", mode, debugfs_pbha_dir, + kbdev, &pbha_int_id_overrides_fops); + } +} diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_control_registers.h b/mali_kbase/mali_kbase_pbha_debugfs.h index b62a8b0..3f477b4 100644 --- a/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_control_registers.h +++ b/mali_kbase/mali_kbase_pbha_debugfs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. 
+ * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -19,14 +19,16 @@ * */ -/* - * This header was autogenerated, it should not be edited. - */ +#ifndef _KBASE_PBHA_DEBUGFS_H +#define _KBASE_PBHA_DEBUGFS_H -#ifndef _UAPI_GPU_CSF_CONTROL_REGISTERS_H_ -#define _UAPI_GPU_CSF_CONTROL_REGISTERS_H_ +#include <mali_kbase.h> -/* GPU_REGISTERS register offsets */ -#define GPU_CONTROL_MCU 0x3000 /* () MCU control registers */ +/** + * kbase_pbha_debugfs_init - Initialize pbha debugfs directory + * + * @kbdev: Device pointer + */ +void kbase_pbha_debugfs_init(struct kbase_device *kbdev); -#endif /* _UAPI_GPU_CSF_CONTROL_REGISTERS_H_ */ +#endif /* _KBASE_PBHA_DEBUGFS_H */ diff --git a/mali_kbase/mali_kbase_pm.c b/mali_kbase/mali_kbase_pm.c index de100dd..4078da1 100644 --- a/mali_kbase/mali_kbase_pm.c +++ b/mali_kbase/mali_kbase_pm.c @@ -26,6 +26,7 @@ #include <mali_kbase.h> #include <gpu/mali_kbase_gpu_regmap.h> #include <mali_kbase_vinstr.h> +#include <mali_kbase_kinstr_prfcnt.h> #include <mali_kbase_hwcnt_context.h> #include <mali_kbase_pm.h> @@ -76,13 +77,13 @@ int kbase_pm_context_active_handle_suspend(struct kbase_device *kbdev, case KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE: if (kbdev->pm.active_count != 0) break; - /* FALLTHROUGH */ + fallthrough; case KBASE_PM_SUSPEND_HANDLER_DONT_INCREASE: kbase_pm_unlock(kbdev); return 1; case KBASE_PM_SUSPEND_HANDLER_NOT_POSSIBLE: - /* FALLTHROUGH */ + fallthrough; default: KBASE_DEBUG_ASSERT_MSG(false, "unreachable"); break; @@ -147,10 +148,11 @@ void kbase_pm_driver_suspend(struct kbase_device *kbdev) { KBASE_DEBUG_ASSERT(kbdev); - /* Suspend vinstr. This blocks until the vinstr worker and timer are - * no longer running. + /* Suspend HW counter intermediaries. This blocks until workers and timers + * are no longer running. */ kbase_vinstr_suspend(kbdev->vinstr_ctx); + kbase_kinstr_prfcnt_suspend(kbdev->kinstr_prfcnt_ctx); /* Disable GPU hardware counters. * This call will block until counters are disabled. @@ -266,8 +268,9 @@ void kbase_pm_driver_resume(struct kbase_device *kbdev, bool arb_gpu_start) spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); #endif - /* Resume vinstr */ + /* Resume HW counter intermediaries. */ kbase_vinstr_resume(kbdev->vinstr_ctx); + kbase_kinstr_prfcnt_resume(kbdev->kinstr_prfcnt_ctx); } void kbase_pm_suspend(struct kbase_device *kbdev) diff --git a/mali_kbase/mali_kbase_regs_history_debugfs.h b/mali_kbase/mali_kbase_regs_history_debugfs.h index 3b181d3..26decb4 100644 --- a/mali_kbase/mali_kbase_regs_history_debugfs.h +++ b/mali_kbase/mali_kbase_regs_history_debugfs.h @@ -70,6 +70,15 @@ void kbase_io_history_dump(struct kbase_device *kbdev); void kbasep_regs_history_debugfs_init(struct kbase_device *kbdev); #else /* defined(CONFIG_DEBUG_FS) && !IS_ENABLED(CONFIG_MALI_NO_MALI) */ + +#define kbase_io_history_init(...) 
((int)0) + +#define kbase_io_history_term CSTD_NOP + +#define kbase_io_history_dump CSTD_NOP + +#define kbasep_regs_history_debugfs_init CSTD_NOP + #endif /* defined(CONFIG_DEBUG_FS) && !IS_ENABLED(CONFIG_MALI_NO_MALI) */ #endif /*_KBASE_REGS_HISTORY_DEBUGFS_H*/ diff --git a/mali_kbase/mali_kbase_reset_gpu.h b/mali_kbase/mali_kbase_reset_gpu.h index 897b732..7502fe8 100644 --- a/mali_kbase/mali_kbase_reset_gpu.h +++ b/mali_kbase/mali_kbase_reset_gpu.h @@ -91,7 +91,8 @@ int kbase_reset_gpu_prevent_and_wait(struct kbase_device *kbdev); * Refer to kbase_reset_gpu_prevent_and_wait() for more information. * * Return: 0 on success. -EAGAIN if a reset is currently happening. Other - * negative error codes on failure. + * negative error codes on failure, where -ENOMEM indicates that GPU reset + * had failed. */ int kbase_reset_gpu_try_prevent(struct kbase_device *kbdev); diff --git a/mali_kbase/mali_kbase_vinstr.c b/mali_kbase/mali_kbase_vinstr.c index d00bc00..6a1e782 100644 --- a/mali_kbase/mali_kbase_vinstr.c +++ b/mali_kbase/mali_kbase_vinstr.c @@ -24,6 +24,7 @@ #include "mali_kbase_hwcnt_types.h" #include <uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h> #include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_hwcnt_gpu_narrow.h" #include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> #include "mali_malisw.h" #include "mali_kbase_debug.h" @@ -55,8 +56,8 @@ * @metadata: Hardware counter metadata provided by virtualizer. * @metadata_user: API compatible hardware counter metadata provided by vinstr. * For compatibility with the user driver interface, this - * contains a "truncated" version of the HWCNT metadata limited - * to 64 entries per block. NULL when not required. + * contains a narrowed version of the HWCNT metadata limited + * to 64 entries per block of 32 bits each. * @lock: Lock protecting all vinstr state. * @suspend_count: Suspend reference count. If non-zero, timer and worker are * prevented from being re-scheduled. @@ -68,7 +69,7 @@ struct kbase_vinstr_context { struct kbase_hwcnt_virtualizer *hvirt; const struct kbase_hwcnt_metadata *metadata; - const struct kbase_hwcnt_metadata *metadata_user; + const struct kbase_hwcnt_metadata_narrow *metadata_user; struct mutex lock; size_t suspend_count; size_t client_count; @@ -89,8 +90,8 @@ struct kbase_vinstr_context { * occur. If 0, not a periodic client. * @enable_map: Counters enable map. * @tmp_buf: Temporary buffer to use before handing dump to client. - * @dump_bufs: Array of dump buffers allocated by this client. - * @dump_bufs_meta: Metadata of dump buffers. + * @dump_bufs: Array of narrow dump buffers allocated by this client. + * @dump_bufs_meta: Metadata of hwcnt reader client buffers. * @meta_idx: Index of metadata being accessed by userspace. * @read_idx: Index of buffer read by userspace. * @write_idx: Index of buffer being written by dump worker. 
@@ -104,7 +105,7 @@ struct kbase_vinstr_client { u32 dump_interval_ns; struct kbase_hwcnt_enable_map enable_map; struct kbase_hwcnt_dump_buffer tmp_buf; - struct kbase_hwcnt_dump_buffer_array dump_bufs; + struct kbase_hwcnt_dump_buffer_narrow_array dump_bufs; struct kbase_hwcnt_reader_metadata *dump_bufs_meta; atomic_t meta_idx; atomic_t read_idx; @@ -190,7 +191,7 @@ static int kbasep_vinstr_client_dump( unsigned int write_idx; unsigned int read_idx; struct kbase_hwcnt_dump_buffer *tmp_buf; - struct kbase_hwcnt_dump_buffer *dump_buf; + struct kbase_hwcnt_dump_buffer_narrow *dump_buf; struct kbase_hwcnt_reader_metadata *meta; u8 clk_cnt; @@ -223,17 +224,11 @@ static int kbasep_vinstr_client_dump( * variant will explicitly zero any non-enabled counters to ensure * nothing except exactly what the user asked for is made visible. * - * If the metadata in vinstr (vctx->metadata_user) is not NULL, it means - * vinstr has the truncated metadata, so do a narrow copy since - * virtualizer has a bigger buffer but user only needs part of it. - * otherwise we do a full copy. + * A narrow copy is required since virtualizer has a bigger buffer + * but user only needs part of it. */ - if (vcli->vctx->metadata_user) - kbase_hwcnt_dump_buffer_copy_strict_narrow(dump_buf, tmp_buf, - &vcli->enable_map); - else - kbase_hwcnt_dump_buffer_copy_strict(dump_buf, tmp_buf, - &vcli->enable_map); + kbase_hwcnt_dump_buffer_copy_strict_narrow(dump_buf, tmp_buf, + &vcli->enable_map); clk_cnt = vcli->vctx->metadata->clk_cnt; @@ -388,7 +383,7 @@ static void kbasep_vinstr_client_destroy(struct kbase_vinstr_client *vcli) kbase_hwcnt_virtualizer_client_destroy(vcli->hvcli); kfree(vcli->dump_bufs_meta); - kbase_hwcnt_dump_buffer_array_free(&vcli->dump_bufs); + kbase_hwcnt_dump_buffer_narrow_array_free(&vcli->dump_bufs); kbase_hwcnt_dump_buffer_free(&vcli->tmp_buf); kbase_hwcnt_enable_map_free(&vcli->enable_map); kfree(vcli); @@ -446,20 +441,11 @@ static int kbasep_vinstr_client_create( /* Enable all the available clk_enable_map. */ vcli->enable_map.clk_enable_map = (1ull << vctx->metadata->clk_cnt) - 1; - if (vctx->metadata_user) - /* Use vinstr's truncated metadata to alloc dump buffers which - * interact with clients. - */ - errcode = - kbase_hwcnt_dump_buffer_array_alloc(vctx->metadata_user, - setup->buffer_count, - &vcli->dump_bufs); - else - /* Use metadata from virtualizer to allocate dump buffers if - * vinstr doesn't have the truncated metadata. - */ - errcode = kbase_hwcnt_dump_buffer_array_alloc( - vctx->metadata, setup->buffer_count, &vcli->dump_bufs); + /* Use vinstr's narrowed metadata to alloc narrow dump buffers which + * interact with clients. 
+ */ + errcode = kbase_hwcnt_dump_buffer_narrow_array_alloc( + vctx->metadata_user, setup->buffer_count, &vcli->dump_bufs); if (errcode) goto error; @@ -504,9 +490,8 @@ int kbase_vinstr_init( vctx->hvirt = hvirt; vctx->metadata = metadata; - vctx->metadata_user = NULL; - errcode = kbase_hwcnt_gpu_metadata_create_truncate_64( - &vctx->metadata_user, metadata); + errcode = kbase_hwcnt_gpu_metadata_narrow_create(&vctx->metadata_user, + metadata); if (errcode) goto err_metadata_create; @@ -543,8 +528,7 @@ void kbase_vinstr_term(struct kbase_vinstr_context *vctx) } } - if (vctx->metadata_user) - kbase_hwcnt_metadata_destroy(vctx->metadata_user); + kbase_hwcnt_gpu_metadata_narrow_destroy(vctx->metadata_user); WARN_ON(vctx->client_count != 0); kfree(vctx); @@ -1007,14 +991,8 @@ static long kbasep_vinstr_hwcnt_reader_ioctl( cli, (u32 __user *)arg); break; case _IOC_NR(KBASE_HWCNT_READER_GET_BUFFER_SIZE): - if (cli->vctx->metadata_user) - rcode = put_user( - (u32)cli->vctx->metadata_user->dump_buf_bytes, - (u32 __user *)arg); - else - rcode = put_user( - (u32)cli->vctx->metadata->dump_buf_bytes, - (u32 __user *)arg); + rcode = put_user((u32)cli->vctx->metadata_user->dump_buf_bytes, + (u32 __user *)arg); break; case _IOC_NR(KBASE_HWCNT_READER_DUMP): rcode = kbasep_vinstr_hwcnt_reader_ioctl_dump(cli); diff --git a/mali_kbase/mali_malisw.h b/mali_kbase/mali_malisw.h index c0649f2..3ddfcd9 100644 --- a/mali_kbase/mali_malisw.h +++ b/mali_kbase/mali_malisw.h @@ -96,4 +96,9 @@ */ #define CSTD_STR2(x) CSTD_STR1(x) +/* LINUX_VERSION_CODE < 5.4 */ +#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) +#define fallthrough CSTD_NOP(...) /* fallthrough */ +#endif + #endif /* _MALISW_H_ */ diff --git a/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c b/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c index 05253ae..c9ba3fc 100644 --- a/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c +++ b/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c @@ -130,6 +130,7 @@ void kbase_mmu_report_mcu_as_fault_and_reset(struct kbase_device *kbdev, if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); + } KBASE_EXPORT_TEST_API(kbase_mmu_report_mcu_as_fault_and_reset); @@ -482,8 +483,6 @@ static void kbase_mmu_gpu_fault_worker(struct work_struct *data) kbase_csf_ctx_handle_fault(kctx, fault); kbase_ctx_sched_release_ctx_lock(kctx); - atomic_dec(&kbdev->faults_pending); - /* A work for GPU fault is complete. * Till reaching here, no further GPU fault will be reported. * Now clear the GPU fault to allow next GPU fault interrupt report. 
@@ -492,6 +491,8 @@ static void kbase_mmu_gpu_fault_worker(struct work_struct *data) kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), GPU_COMMAND_CLEAR_FAULT); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + atomic_dec(&kbdev->faults_pending); } /** diff --git a/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c b/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c index 01ca419..b050be8 100644 --- a/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c +++ b/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c @@ -185,6 +185,7 @@ void kbase_mmu_report_fault_and_kill(struct kbase_context *kctx, KBASE_MMU_FAULT_TYPE_PAGE_UNEXPECTED); kbase_mmu_hw_enable_fault(kbdev, as, KBASE_MMU_FAULT_TYPE_PAGE_UNEXPECTED); + } /** diff --git a/mali_kbase/mmu/mali_kbase_mmu.c b/mali_kbase/mmu/mali_kbase_mmu.c index e3c5b15..5f6cc7a 100644 --- a/mali_kbase/mmu/mali_kbase_mmu.c +++ b/mali_kbase/mmu/mali_kbase_mmu.c @@ -43,7 +43,6 @@ #include <device/mali_kbase_device.h> #include <mali_kbase_trace_gpu_mem.h> -#define KBASE_MMU_PAGE_ENTRIES 512 /** * kbase_mmu_flush_invalidate() - Flush and invalidate the GPU caches. @@ -62,9 +61,12 @@ * If sync is set then accesses in the flushed region will be drained * before data is flush and invalidated through L1, L2 and into memory, * after which point this function will return. + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. */ -static void kbase_mmu_flush_invalidate(struct kbase_context *kctx, - u64 vpfn, size_t nr, bool sync); +static void +kbase_mmu_flush_invalidate(struct kbase_context *kctx, u64 vpfn, size_t nr, + bool sync, + enum kbase_caller_mmu_sync_info mmu_sync_info); /** * kbase_mmu_flush_invalidate_no_ctx() - Flush and invalidate the GPU caches. @@ -73,11 +75,13 @@ static void kbase_mmu_flush_invalidate(struct kbase_context *kctx, * @nr: The number of pages to flush. * @sync: Set if the operation should be synchronous or not. * @as_nr: GPU address space number for which flush + invalidate is required. + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. * * This is used for MMU tables which do not belong to a user space context. */ -static void kbase_mmu_flush_invalidate_no_ctx(struct kbase_device *kbdev, - u64 vpfn, size_t nr, bool sync, int as_nr); +static void kbase_mmu_flush_invalidate_no_ctx( + struct kbase_device *kbdev, u64 vpfn, size_t nr, bool sync, int as_nr, + enum kbase_caller_mmu_sync_info mmu_sync_info); /** * kbase_mmu_sync_pgd() - sync page directory to memory when needed. @@ -112,6 +116,31 @@ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, unsigned long flags, int group_id); /** + * kbase_mmu_update_and_free_parent_pgds() - Update number of valid entries and + * free memory of the page directories + * + * @kbdev: Device pointer. + * @mmut: GPU MMU page table. + * @pgds: Physical addresses of page directories to be freed. + * @vpfn: The virtual page frame number. + * @level: The level of MMU page table. + */ +static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, + struct kbase_mmu_table *mmut, + phys_addr_t *pgds, u64 vpfn, + int level); +/** + * kbase_mmu_free_pgd() - Free memory of the page directory + * + * @kbdev: Device pointer. + * @mmut: GPU MMU page table. + * @pgd: Physical address of page directory to be freed. + * @dirty: Flag to indicate whether the page may be dirty in the cache. 
+ */ +static void kbase_mmu_free_pgd(struct kbase_device *kbdev, + struct kbase_mmu_table *mmut, phys_addr_t pgd, + bool dirty); +/** * reg_grow_calc_extra_pages() - Calculate the number of backed pages to add to * a region on a GPU page fault * @kbdev: KBase device @@ -191,17 +220,31 @@ static size_t reg_grow_calc_extra_pages(struct kbase_device *kbdev, } #ifdef CONFIG_MALI_CINSTR_GWT -static void kbase_gpu_mmu_handle_write_faulting_as( - struct kbase_device *kbdev, - struct kbase_as *faulting_as, - u64 start_pfn, size_t nr, u32 op) +static void kbase_gpu_mmu_handle_write_faulting_as(struct kbase_device *kbdev, + struct kbase_as *faulting_as, + u64 start_pfn, size_t nr, + u32 kctx_id) { + /* Calls to this function are inherently synchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; + struct kbase_mmu_hw_op_param op_param; + mutex_lock(&kbdev->mmu_hw_mutex); kbase_mmu_hw_clear_fault(kbdev, faulting_as, KBASE_MMU_FAULT_TYPE_PAGE); - kbase_mmu_hw_do_operation(kbdev, faulting_as, start_pfn, - nr, op, 1); + + /* flush L2 and unlock the VA (resumes the MMU) */ + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = start_pfn, + .nr = nr, + .op = KBASE_MMU_OP_FLUSH_PT, + .kctx_id = kctx_id, + .mmu_sync_info = mmu_sync_info, + }; + kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); mutex_unlock(&kbdev->mmu_hw_mutex); @@ -217,7 +260,6 @@ static void kbase_gpu_mmu_handle_write_fault(struct kbase_context *kctx, struct kbase_device *kbdev; struct kbase_fault *fault; u64 fault_pfn, pfn_offset; - u32 op; int ret; int as_no; @@ -280,11 +322,8 @@ static void kbase_gpu_mmu_handle_write_fault(struct kbase_context *kctx, &kbase_get_gpu_phy_pages(region)[pfn_offset], 1, region->flags, region->gpu_alloc->group_id); - /* flush L2 and unlock the VA (resumes the MMU) */ - op = AS_COMMAND_FLUSH_PT; - - kbase_gpu_mmu_handle_write_faulting_as(kbdev, faulting_as, - fault_pfn, 1, op); + kbase_gpu_mmu_handle_write_faulting_as(kbdev, faulting_as, fault_pfn, 1, + kctx->id); kbase_gpu_vm_unlock(kctx); } @@ -554,6 +593,11 @@ void kbase_mmu_page_fault_worker(struct work_struct *data) size_t pages_trimmed = 0; #endif + /* Calls to this function are inherently synchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; + faulting_as = container_of(data, struct kbase_as, work_pagefault); fault = &faulting_as->pf_data; fault_pfn = fault->addr >> PAGE_SHIFT; @@ -720,6 +764,8 @@ page_fault_retry: current_backed_size = kbase_reg_current_backed_size(region); if (fault_rel_pfn < current_backed_size) { + struct kbase_mmu_hw_op_param op_param; + dev_dbg(kbdev->dev, "Page fault @ 0x%llx in allocated region 0x%llx-0x%llx of growable TMEM: Ignoring", fault->addr, region->start_pfn, @@ -738,8 +784,14 @@ page_fault_retry: * transaction (which should cause the other page fault to be * raised again). 
*/ - kbase_mmu_hw_do_operation(kbdev, faulting_as, 0, 0, - AS_COMMAND_UNLOCK, 1); + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = 0, + .nr = 0, + .op = KBASE_MMU_OP_UNLOCK, + .kctx_id = kctx->id, + .mmu_sync_info = mmu_sync_info, + }; + kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); mutex_unlock(&kbdev->mmu_hw_mutex); @@ -758,14 +810,23 @@ page_fault_retry: new_pages); if (new_pages == 0) { + struct kbase_mmu_hw_op_param op_param; + mutex_lock(&kbdev->mmu_hw_mutex); /* Duplicate of a fault we've already handled, nothing to do */ kbase_mmu_hw_clear_fault(kbdev, faulting_as, KBASE_MMU_FAULT_TYPE_PAGE); + /* See comment [1] about UNLOCK usage */ - kbase_mmu_hw_do_operation(kbdev, faulting_as, 0, 0, - AS_COMMAND_UNLOCK, 1); + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = 0, + .nr = 0, + .op = KBASE_MMU_OP_UNLOCK, + .kctx_id = kctx->id, + .mmu_sync_info = mmu_sync_info, + }; + kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); mutex_unlock(&kbdev->mmu_hw_mutex); @@ -791,7 +852,7 @@ page_fault_retry: if (grown) { u64 pfn_offset; - u32 op; + struct kbase_mmu_hw_op_param op_param; /* alloc success */ WARN_ON(kbase_reg_current_backed_size(region) > @@ -854,9 +915,6 @@ page_fault_retry: /* AS transaction begin */ mutex_lock(&kbdev->mmu_hw_mutex); - /* flush L2 and unlock the VA (resumes the MMU) */ - op = AS_COMMAND_FLUSH_PT; - /* clear MMU interrupt - this needs to be done after updating * the page tables but before issuing a FLUSH command. The * FLUSH cmd has a side effect that it restarts stalled memory @@ -868,9 +926,15 @@ page_fault_retry: kbase_mmu_hw_clear_fault(kbdev, faulting_as, KBASE_MMU_FAULT_TYPE_PAGE); - kbase_mmu_hw_do_operation(kbdev, faulting_as, - fault->addr >> PAGE_SHIFT, - new_pages, op, 1); + /* flush L2 and unlock the VA (resumes the MMU) */ + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = fault->addr >> PAGE_SHIFT, + .nr = new_pages, + .op = KBASE_MMU_OP_FLUSH_PT, + .kctx_id = kctx->id, + .mmu_sync_info = mmu_sync_info, + }; + kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); mutex_unlock(&kbdev->mmu_hw_mutex); /* AS transaction end */ @@ -1073,7 +1137,7 @@ static int mmu_get_next_pgd(struct kbase_device *kbdev, return -ENOMEM; } - kbdev->mmu_mode->entry_set_pte(&page[vpfn], target_pgd); + kbdev->mmu_mode->entry_set_pte(page, vpfn, target_pgd); kbase_mmu_sync_pgd(kbdev, kbase_dma_addr(p), PAGE_SIZE); /* Rely on the caller to update the address space flags. 
*/ @@ -1149,6 +1213,8 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, unsigned int left = to_vpfn - vpfn; int level; u64 *page; + register unsigned int num_of_valid_entries; + phys_addr_t pgds[MIDGARD_MMU_BOTTOMLEVEL + 1]; if (count > left) count = left; @@ -1159,6 +1225,7 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, for (level = MIDGARD_MMU_TOPLEVEL; level <= MIDGARD_MMU_BOTTOMLEVEL; level++) { idx = (vpfn >> ((3 - level) * 9)) & 0x1FF; + pgds[level] = pgd; page = kmap(phys_to_page(pgd)); if (mmu_mode->ate_is_valid(page[idx], level)) break; /* keep the mapping */ @@ -1181,15 +1248,33 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, goto next; } + num_of_valid_entries = mmu_mode->get_num_valid_entries(page); + if (WARN_ON_ONCE(num_of_valid_entries < pcount)) + num_of_valid_entries = 0; + else + num_of_valid_entries -= pcount; + + if (!num_of_valid_entries) { + kunmap(phys_to_page(pgd)); + + kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + + kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, + vpfn, level); + vpfn += count; + continue; + } + /* Invalidate the entries we added */ for (i = 0; i < pcount; i++) mmu_mode->entry_invalidate(&page[idx + i]); + mmu_mode->set_num_valid_entries(page, num_of_valid_entries); + kbase_mmu_sync_pgd(kbdev, kbase_dma_addr(phys_to_page(pgd)) + 8 * idx, 8 * pcount); kunmap(phys_to_page(pgd)); - next: vpfn += count; } @@ -1199,8 +1284,9 @@ next: * Map the single page 'phys' 'nr' of times, starting at GPU PFN 'vpfn' */ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, - struct tagged_addr phys, size_t nr, - unsigned long flags, int const group_id) + struct tagged_addr phys, size_t nr, + unsigned long flags, int const group_id, + enum kbase_caller_mmu_sync_info mmu_sync_info) { phys_addr_t pgd; u64 *pgd_page; @@ -1233,12 +1319,13 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, unsigned int index = vpfn & 0x1FF; unsigned int count = KBASE_MMU_PAGE_ENTRIES - index; struct page *p; + register unsigned int num_of_valid_entries; if (count > remain) count = remain; /* - * Repeatedly calling mmu_get_bottom_pte() is clearly + * Repeatedly calling mmu_get_bottom_pgd() is clearly * suboptimal. We don't have to re-parse the whole tree * each time (just cache the l0-l2 sequence). 
* On the other hand, it's only a gain when we map more than @@ -1264,7 +1351,8 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, mutex_lock(&kctx->mmu.mmu_lock); } while (!err); if (err) { - dev_warn(kbdev->dev, "kbase_mmu_insert_pages: mmu_get_bottom_pgd failure\n"); + dev_warn(kbdev->dev, "%s: mmu_get_bottom_pgd failure\n", + __func__); if (recover_required) { /* Invalidate the pages we have partially * completed @@ -1280,7 +1368,7 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, p = pfn_to_page(PFN_DOWN(pgd)); pgd_page = kmap(p); if (!pgd_page) { - dev_warn(kbdev->dev, "kbase_mmu_insert_pages: kmap failure\n"); + dev_warn(kbdev->dev, "%s: kmap failure\n", __func__); if (recover_required) { /* Invalidate the pages we have partially * completed @@ -1294,6 +1382,9 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, goto fail_unlock; } + num_of_valid_entries = + kbdev->mmu_mode->get_num_valid_entries(pgd_page); + for (i = 0; i < count; i++) { unsigned int ofs = index + i; @@ -1304,6 +1395,9 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, phys, flags, MIDGARD_MMU_BOTTOMLEVEL, group_id); } + kbdev->mmu_mode->set_num_valid_entries( + pgd_page, num_of_valid_entries + count); + vpfn += count; remain -= count; @@ -1320,38 +1414,41 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, recover_count += count; } mutex_unlock(&kctx->mmu.mmu_lock); - kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false); + kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false, mmu_sync_info); return 0; fail_unlock: mutex_unlock(&kctx->mmu.mmu_lock); - kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false); + kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false, mmu_sync_info); return err; } -static inline void cleanup_empty_pte(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, u64 *pte) +static void kbase_mmu_free_pgd(struct kbase_device *kbdev, + struct kbase_mmu_table *mmut, phys_addr_t pgd, + bool dirty) { - phys_addr_t tmp_pgd; - struct page *tmp_p; + struct page *p; + + lockdep_assert_held(&mmut->mmu_lock); + + p = pfn_to_page(PFN_DOWN(pgd)); - tmp_pgd = kbdev->mmu_mode->pte_to_phy_addr(*pte); - tmp_p = phys_to_page(tmp_pgd); #ifdef CONFIG_MALI_2MB_ALLOC kbase_mem_pool_free(&kbdev->mem_pools.large[mmut->group_id], #else kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], #endif - tmp_p, false); + p, dirty); + + atomic_sub(1, &kbdev->memdev.used_pages); - /* If the MMU tables belong to a context then we accounted the memory - * usage to that context, so decrement here. + /* If MMU tables belong to a context then pages will have been accounted + * against it, so we must decrement the usage counts here. 
*/ if (mmut->kctx) { kbase_process_page_usage_dec(mmut->kctx, 1); atomic_sub(1, &mmut->kctx->used_pages); } - atomic_sub(1, &kbdev->memdev.used_pages); kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1); } @@ -1399,6 +1496,7 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, unsigned int count = KBASE_MMU_PAGE_ENTRIES - vindex; struct page *p; int cur_level; + register unsigned int num_of_valid_entries; if (count > remain) count = remain; @@ -1463,14 +1561,25 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, goto fail_unlock; } + num_of_valid_entries = + mmu_mode->get_num_valid_entries(pgd_page); + if (cur_level == MIDGARD_MMU_LEVEL(2)) { int level_index = (insert_vpfn >> 9) & 0x1FF; u64 *target = &pgd_page[level_index]; - if (mmu_mode->pte_is_valid(*target, cur_level)) - cleanup_empty_pte(kbdev, mmut, target); + if (mmu_mode->pte_is_valid(*target, cur_level)) { + kbase_mmu_free_pgd( + kbdev, mmut, + kbdev->mmu_mode->pte_to_phy_addr( + *target), + false); + num_of_valid_entries--; + } *target = kbase_mmu_create_ate(kbdev, *phys, flags, cur_level, group_id); + + num_of_valid_entries++; } else { for (i = 0; i < count; i++) { unsigned int ofs = vindex + i; @@ -1488,8 +1597,11 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, *target = kbase_mmu_create_ate(kbdev, phys[i], flags, cur_level, group_id); } + num_of_valid_entries += count; } + mmu_mode->set_num_valid_entries(pgd_page, num_of_valid_entries); + phys += count; insert_vpfn += count; remain -= count; @@ -1513,9 +1625,10 @@ fail_unlock: * number 'as_nr'. */ int kbase_mmu_insert_pages(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, u64 vpfn, - struct tagged_addr *phys, size_t nr, - unsigned long flags, int as_nr, int const group_id) + struct kbase_mmu_table *mmut, u64 vpfn, + struct tagged_addr *phys, size_t nr, + unsigned long flags, int as_nr, int const group_id, + enum kbase_caller_mmu_sync_info mmu_sync_info) { int err; @@ -1523,10 +1636,11 @@ int kbase_mmu_insert_pages(struct kbase_device *kbdev, phys, nr, flags, group_id); if (mmut->kctx) - kbase_mmu_flush_invalidate(mmut->kctx, vpfn, nr, false); + kbase_mmu_flush_invalidate(mmut->kctx, vpfn, nr, false, + mmu_sync_info); else - kbase_mmu_flush_invalidate_no_ctx(kbdev, vpfn, nr, false, - as_nr); + kbase_mmu_flush_invalidate_no_ctx(kbdev, vpfn, nr, false, as_nr, + mmu_sync_info); return err; } @@ -1539,30 +1653,36 @@ KBASE_EXPORT_TEST_API(kbase_mmu_insert_pages); * @kctx: The KBase context. * @vpfn: The virtual page frame number to start the flush on. * @nr: The number of pages to flush. - * @sync: Set if the operation should be synchronous or not. * * As per kbase_mmu_flush_invalidate but doesn't retain the kctx or do any * other locking. */ static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx, - u64 vpfn, size_t nr, bool sync) + u64 vpfn, size_t nr) { struct kbase_device *kbdev = kctx->kbdev; + struct kbase_mmu_hw_op_param op_param; int err; - u32 op; + + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; /* Early out if there is nothing to do */ if (nr == 0) return; - if (sync) - op = AS_COMMAND_FLUSH_MEM; - else - op = AS_COMMAND_FLUSH_PT; - - err = kbase_mmu_hw_do_operation(kbdev, - &kbdev->as[kctx->as_nr], - vpfn, nr, op, 0); + /* flush L2 and unlock the VA (resumes the MMU) */ + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = vpfn, + .nr = nr, + .op = KBASE_MMU_OP_FLUSH_MEM, + .kctx_id = kctx->id, + .mmu_sync_info = mmu_sync_info, + }; + err = kbase_mmu_hw_do_operation(kbdev, &kbdev->as[kctx->as_nr], + &op_param); if (err) { /* Flush failed to complete, assume the * GPU has hung and perform a reset to recover @@ -1576,14 +1696,15 @@ static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx, /* Perform a flush/invalidate on a particular address space */ -static void kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, - struct kbase_as *as, - u64 vpfn, size_t nr, bool sync) +static void +kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, struct kbase_as *as, + u64 vpfn, size_t nr, bool sync, u32 kctx_id, + enum kbase_caller_mmu_sync_info mmu_sync_info) { int err; - u32 op; bool gpu_powered; unsigned long flags; + struct kbase_mmu_hw_op_param op_param; spin_lock_irqsave(&kbdev->hwaccess_lock, flags); gpu_powered = kbdev->pm.backend.gpu_powered; @@ -1611,13 +1732,19 @@ static void kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, /* AS transaction begin */ mutex_lock(&kbdev->mmu_hw_mutex); + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = vpfn, + .nr = nr, + .kctx_id = kctx_id, + .mmu_sync_info = mmu_sync_info, + }; + if (sync) - op = AS_COMMAND_FLUSH_MEM; + op_param.op = KBASE_MMU_OP_FLUSH_MEM; else - op = AS_COMMAND_FLUSH_PT; + op_param.op = KBASE_MMU_OP_FLUSH_PT; - err = kbase_mmu_hw_do_operation(kbdev, - as, vpfn, nr, op, 0); + err = kbase_mmu_hw_do_operation(kbdev, as, &op_param); if (err) { /* Flush failed to complete, assume the GPU has hung and @@ -1636,18 +1763,23 @@ static void kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, kbase_pm_context_idle(kbdev); } -static void kbase_mmu_flush_invalidate_no_ctx(struct kbase_device *kbdev, - u64 vpfn, size_t nr, bool sync, int as_nr) +static void +kbase_mmu_flush_invalidate_no_ctx(struct kbase_device *kbdev, u64 vpfn, + size_t nr, bool sync, int as_nr, + enum kbase_caller_mmu_sync_info mmu_sync_info) { /* Skip if there is nothing to do */ if (nr) { kbase_mmu_flush_invalidate_as(kbdev, &kbdev->as[as_nr], vpfn, - nr, sync); + nr, sync, 0xFFFFFFFF, + mmu_sync_info); } } -static void kbase_mmu_flush_invalidate(struct kbase_context *kctx, - u64 vpfn, size_t nr, bool sync) +static void +kbase_mmu_flush_invalidate(struct kbase_context *kctx, u64 vpfn, size_t nr, + bool sync, + enum kbase_caller_mmu_sync_info mmu_sync_info) { struct kbase_device *kbdev; bool ctx_is_in_runpool; @@ -1669,7 +1801,8 @@ static void kbase_mmu_flush_invalidate(struct kbase_context *kctx, KBASE_DEBUG_ASSERT(kctx->as_nr != KBASEP_AS_NR_INVALID); kbase_mmu_flush_invalidate_as(kbdev, &kbdev->as[kctx->as_nr], - vpfn, nr, sync); + vpfn, nr, sync, kctx->id, + mmu_sync_info); release_ctx(kbdev, kctx); } @@ -1714,17 +1847,58 @@ void kbase_mmu_disable(struct kbase_context *kctx) * The job scheduler code will already be holding the locks and context * so just do the flush. 
*/ - kbase_mmu_flush_invalidate_noretain(kctx, 0, ~0, true); + kbase_mmu_flush_invalidate_noretain(kctx, 0, ~0); kctx->kbdev->mmu_mode->disable_as(kctx->kbdev, kctx->as_nr); } KBASE_EXPORT_TEST_API(kbase_mmu_disable); +static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, + struct kbase_mmu_table *mmut, + phys_addr_t *pgds, u64 vpfn, + int level) +{ + int current_level; + + lockdep_assert_held(&mmut->mmu_lock); + + for (current_level = level - 1; current_level >= MIDGARD_MMU_LEVEL(0); + current_level--) { + u64 *current_page = kmap(phys_to_page(pgds[current_level])); + unsigned int current_valid_entries = + kbdev->mmu_mode->get_num_valid_entries(current_page); + + if (current_valid_entries == 1 && + current_level != MIDGARD_MMU_LEVEL(0)) { + kunmap(phys_to_page(pgds[current_level])); + + kbase_mmu_free_pgd(kbdev, mmut, pgds[current_level], + true); + } else { + int index = (vpfn >> ((3 - current_level) * 9)) & 0x1FF; + + kbdev->mmu_mode->entry_invalidate(¤t_page[index]); + + current_valid_entries--; + + kbdev->mmu_mode->set_num_valid_entries( + current_page, current_valid_entries); + + kbase_mmu_sync_pgd(kbdev, + kbase_dma_addr(phys_to_page( + pgds[current_level])) + + 8 * index, + 8 * 1); + + kunmap(phys_to_page(pgds[current_level])); + break; + } + } +} + /* - * We actually only discard the ATE, and not the page table - * pages. There is a potential DoS here, as we'll leak memory by - * having PTEs that are potentially unused. Will require physical - * page accounting, so MMU pages are part of the process allocation. + * We actually discard the ATE and free the page table pages if no valid entries + * exist in PGD. * * IMPORTANT: This uses kbasep_js_runpool_release_ctx() when the context is * currently scheduled into the runpool, and so potentially uses a lot of locks. @@ -1741,6 +1915,11 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_mode const *mmu_mode; int err = -EFAULT; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + if (nr == 0) { /* early out if nothing to do */ return 0; @@ -1757,6 +1936,8 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, unsigned int pcount; int level; u64 *page; + phys_addr_t pgds[MIDGARD_MMU_BOTTOMLEVEL + 1]; + register unsigned int num_of_valid_entries; if (count > nr) count = nr; @@ -1793,6 +1974,7 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, goto next; } next_pgd = mmu_mode->pte_to_phy_addr(page[index]); + pgds[level] = pgd; kunmap(phys_to_page(pgd)); pgd = next_pgd; } @@ -1829,14 +2011,34 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, continue; } + num_of_valid_entries = mmu_mode->get_num_valid_entries(page); + if (WARN_ON_ONCE(num_of_valid_entries < pcount)) + num_of_valid_entries = 0; + else + num_of_valid_entries -= pcount; + + if (!num_of_valid_entries) { + kunmap(phys_to_page(pgd)); + + kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + + kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, + vpfn, level); + + vpfn += count; + nr -= count; + continue; + } + /* Invalidate the entries we added */ for (i = 0; i < pcount; i++) mmu_mode->entry_invalidate(&page[index + i]); - kbase_mmu_sync_pgd(kbdev, - kbase_dma_addr(phys_to_page(pgd)) + - 8 * index, 8*pcount); + mmu_mode->set_num_valid_entries(page, num_of_valid_entries); + kbase_mmu_sync_pgd( + kbdev, kbase_dma_addr(phys_to_page(pgd)) + 8 * index, + 8 * pcount); next: kunmap(phys_to_page(pgd)); vpfn += count; @@ -1848,10 +2050,11 @@ out: if (mmut->kctx) kbase_mmu_flush_invalidate(mmut->kctx, start_vpfn, requested_nr, - true); + true, mmu_sync_info); else - kbase_mmu_flush_invalidate_no_ctx(kbdev, start_vpfn, requested_nr, - true, as_nr); + kbase_mmu_flush_invalidate_no_ctx(kbdev, start_vpfn, + requested_nr, true, as_nr, + mmu_sync_info); return err; } @@ -1903,6 +2106,7 @@ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, unsigned int index = vpfn & 0x1FF; size_t count = KBASE_MMU_PAGE_ENTRIES - index; struct page *p; + register unsigned int num_of_valid_entries; if (count > nr) count = nr; @@ -1940,10 +2144,22 @@ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, goto fail_unlock; } - for (i = 0; i < count; i++) + num_of_valid_entries = + kbdev->mmu_mode->get_num_valid_entries(pgd_page); + + for (i = 0; i < count; i++) { +#ifdef CONFIG_MALI_DEBUG + WARN_ON_ONCE(!kbdev->mmu_mode->ate_is_valid( + pgd_page[index + i], + MIDGARD_MMU_BOTTOMLEVEL)); +#endif pgd_page[index + i] = kbase_mmu_create_ate(kbdev, phys[i], flags, MIDGARD_MMU_BOTTOMLEVEL, group_id); + } + + kbdev->mmu_mode->set_num_valid_entries(pgd_page, + num_of_valid_entries); phys += count; vpfn += count; @@ -1970,9 +2186,14 @@ int kbase_mmu_update_pages(struct kbase_context *kctx, u64 vpfn, { int err; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + err = kbase_mmu_update_pages_no_flush(kctx, vpfn, phys, nr, flags, group_id); - kbase_mmu_flush_invalidate(kctx, vpfn, nr, true); + kbase_mmu_flush_invalidate(kctx, vpfn, nr, true, mmu_sync_info); return err; } @@ -1981,13 +2202,18 @@ static void mmu_teardown_level(struct kbase_device *kbdev, int level, u64 *pgd_page_buffer) { phys_addr_t target_pgd; - struct page *p; u64 *pgd_page; int i; struct kbase_mmu_mode const *mmu_mode; lockdep_assert_held(&mmut->mmu_lock); + /* Early-out. 
No need to kmap to check entries for L3 PGD. */ + if (level == MIDGARD_MMU_BOTTOMLEVEL) { + kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + return; + } + pgd_page = kmap_atomic(pfn_to_page(PFN_DOWN(pgd))); /* kmap_atomic should NEVER fail. */ if (WARN_ON(pgd_page == NULL)) @@ -2015,25 +2241,7 @@ static void mmu_teardown_level(struct kbase_device *kbdev, } } - p = pfn_to_page(PFN_DOWN(pgd)); -#ifdef CONFIG_MALI_2MB_ALLOC - kbase_mem_pool_free(&kbdev->mem_pools.large[mmut->group_id], -#else - kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], -#endif - p, true); - - atomic_sub(1, &kbdev->memdev.used_pages); - - /* If MMU tables belong to a context then pages will have been accounted - * against it, so we must decrement the usage counts here. - */ - if (mmut->kctx) { - kbase_process_page_usage_dec(mmut->kctx, 1); - atomic_sub(1, &mmut->kctx->used_pages); - } - - kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1); + kbase_mmu_free_pgd(kbdev, mmut, pgd, true); } int kbase_mmu_init(struct kbase_device *const kbdev, @@ -2293,6 +2501,13 @@ void kbase_mmu_bus_fault_worker(struct work_struct *data) } +#if MALI_USE_CSF + /* Before the GPU power off, wait is done for the completion of + * in-flight MMU fault work items. So GPU is expected to remain + * powered up whilst the bus fault handling is being done. + */ + kbase_gpu_report_bus_fault_and_kill(kctx, faulting_as, fault); +#else /* NOTE: If GPU already powered off for suspend, * we don't need to switch to unmapped */ @@ -2301,6 +2516,7 @@ void kbase_mmu_bus_fault_worker(struct work_struct *data) kbase_gpu_report_bus_fault_and_kill(kctx, faulting_as, fault); kbase_pm_context_idle(kbdev); } +#endif release_ctx(kbdev, kctx); diff --git a/mali_kbase/mmu/mali_kbase_mmu.h b/mali_kbase/mmu/mali_kbase_mmu.h index a2d1a8e..45a628c 100644 --- a/mali_kbase/mmu/mali_kbase_mmu.h +++ b/mali_kbase/mmu/mali_kbase_mmu.h @@ -22,6 +22,29 @@ #ifndef _KBASE_MMU_H_ #define _KBASE_MMU_H_ +#include <uapi/gpu/arm/midgard/mali_base_kernel.h> + +#define KBASE_MMU_PAGE_ENTRIES 512 + +struct kbase_context; +struct kbase_mmu_table; + +/** + * MMU-synchronous caller info. A pointer to this type is passed down from the outer-most callers + * in the kbase module - where the information resides as to the synchronous / asynchronous + * nature of the call flow, with respect to MMU operations. i.e. does the call flow relate to + * existing GPU work, or does it come from requests (like ioctl) from user-space, power management, + * etc. + */ +enum kbase_caller_mmu_sync_info { + /* default value must be invalid to avoid accidental choice of a 'valid' value. */ + CALLER_MMU_UNSET_SYNCHRONICITY, + /* Arbitrary value for 'synchronous' that isn't easy to choose by accident. */ + CALLER_MMU_SYNC = 0x02, + /* Also hard to choose by accident */ + CALLER_MMU_ASYNC +}; + /**
 * kbase_mmu_as_init() - Initialising GPU address space object. 
* @@ -111,10 +134,12 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, int kbase_mmu_insert_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, struct tagged_addr *phys, size_t nr, - unsigned long flags, int as_nr, int group_id); + unsigned long flags, int as_nr, int group_id, + enum kbase_caller_mmu_sync_info mmu_sync_info); int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, - struct tagged_addr phys, size_t nr, - unsigned long flags, int group_id); + struct tagged_addr phys, size_t nr, + unsigned long flags, int group_id, + enum kbase_caller_mmu_sync_info mmu_sync_info); int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, @@ -152,4 +177,22 @@ int kbase_mmu_bus_fault_interrupt(struct kbase_device *kbdev, u32 status, void kbase_mmu_gpu_fault_interrupt(struct kbase_device *kbdev, u32 status, u32 as_nr, u64 address, bool as_valid); +/** + * kbase_context_mmu_group_id_get - Decode a memory group ID from + * base_context_create_flags + * + * Memory allocated for GPU page tables will come from the returned group. + * + * @flags: Bitmask of flags to pass to base_context_init. + * + * Return: Physical memory group ID. Valid range is 0..(BASE_MEM_GROUP_COUNT-1). + */ +static inline int +kbase_context_mmu_group_id_get(base_context_create_flags const flags) +{ + KBASE_DEBUG_ASSERT(flags == + (flags & BASEP_CONTEXT_CREATE_ALLOWED_FLAGS)); + return (int)BASE_CONTEXT_MMU_GROUP_ID_GET(flags); +} + #endif /* _KBASE_MMU_H_ */ diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw.h b/mali_kbase/mmu/mali_kbase_mmu_hw.h index d1f1ff2..7c0e95e 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_hw.h +++ b/mali_kbase/mmu/mali_kbase_mmu_hw.h @@ -31,6 +31,8 @@ #ifndef _KBASE_MMU_HW_H_ #define _KBASE_MMU_HW_H_ +#include "mali_kbase_mmu.h" + /* Forward declarations */ struct kbase_device; struct kbase_as; @@ -53,6 +55,42 @@ enum kbase_mmu_fault_type { }; /** + * enum kbase_mmu_cache_flush_type - enum for MMU operations + * @KBASE_MMU_OP_NONE: To help catch uninitialized struct + * @KBASE_MMU_OP_FIRST: The lower boundary of enum + * @KBASE_MMU_OP_LOCK: Lock memory region + * @KBASE_MMU_OP_UNLOCK: Unlock memory region + * @KBASE_MMU_OP_FLUSH_PT: Flush page table (CLN+INV L2 only) + * @KBASE_MMU_OP_FLUSH_MEM: Flush memory (CLN+INV L2+LSC) + * @KBASE_MMU_OP_COUNT: The upper boundary of enum + */ +enum kbase_mmu_op_type { + KBASE_MMU_OP_NONE = 0, /* Must be zero */ + KBASE_MMU_OP_FIRST, /* Must be the first non-zero op */ + KBASE_MMU_OP_LOCK = KBASE_MMU_OP_FIRST, + KBASE_MMU_OP_UNLOCK, + KBASE_MMU_OP_FLUSH_PT, + KBASE_MMU_OP_FLUSH_MEM, + KBASE_MMU_OP_COUNT /* Must be the last in enum */ +}; + +/** + * struct kbase_mmu_hw_op_param - parameters for kbase_mmu_hw_do_operation() + * @vpfn: MMU Virtual Page Frame Number to start the operation on. + * @nr: Number of pages to work on. + * @type: Operation type (written to ASn_COMMAND). + * @kctx_id: Kernel context ID for MMU command tracepoint + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. + */ +struct kbase_mmu_hw_op_param { + u64 vpfn; + u32 nr; + enum kbase_mmu_op_type op; + u32 kctx_id; + enum kbase_caller_mmu_sync_info mmu_sync_info; +}; + +/** * kbase_mmu_hw_configure - Configure an address space for use. * @kbdev: kbase device to configure. * @as: address space to configure. @@ -67,11 +105,7 @@ void kbase_mmu_hw_configure(struct kbase_device *kbdev, * kbase_mmu_hw_do_operation - Issue an operation to the MMU. 
* @kbdev: kbase device to issue the MMU operation on. * @as: address space to issue the MMU operation on. - * @vpfn: MMU Virtual Page Frame Number to start the operation on. - * @nr: Number of pages to work on. - * @type: Operation type (written to ASn_COMMAND). - * @handling_irq: Is this operation being called during the handling - * of an interrupt? + * @op_param: parameters for the operation. * * Issue an operation (MMU invalidate, MMU flush, etc) on the address space that * is associated with the provided kbase_context over the specified range @@ -79,8 +113,7 @@ void kbase_mmu_hw_configure(struct kbase_device *kbdev, * Return: Zero if the operation was successful, non-zero otherwise. */ int kbase_mmu_hw_do_operation(struct kbase_device *kbdev, struct kbase_as *as, - u64 vpfn, u32 nr, u32 type, - unsigned int handling_irq); + struct kbase_mmu_hw_op_param *op_param); /** * kbase_mmu_hw_clear_fault - Clear a fault that has been previously reported by diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c index a99b988..6306946 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c +++ b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c @@ -19,12 +19,13 @@ * */ +#include <device/mali_kbase_device.h> #include <linux/bitops.h> #include <mali_kbase.h> +#include <mali_kbase_ctx_sched.h> #include <mali_kbase_mem.h> #include <mmu/mali_kbase_mmu_hw.h> #include <tl/mali_kbase_tracepoints.h> -#include <device/mali_kbase_device.h> /** * lock_region() - Generate lockaddr to lock memory region in MMU @@ -35,47 +36,87 @@ * The lockaddr value is a combination of the starting address and * the size of the region that encompasses all the memory pages to lock. * - * The size is expressed as a logarithm: it is represented in a way - * that is compatible with the HW specification and it also determines - * how many of the lowest bits of the address are cleared. + * Bits 5:0 are used to represent the size, which must be a power of 2. + * The smallest amount of memory to be locked corresponds to 32 kB, + * i.e. 8 memory pages, because a MMU cache line is made of 64 bytes + * and every page table entry is 8 bytes. Therefore it is not possible + * to lock less than 8 memory pages at a time. + * + * The size is expressed as a logarithm minus one: + * - A value of 14 is thus interpreted as log(32 kB) = 15, where 32 kB + * is the smallest possible size. + * - Likewise, a value of 47 is interpreted as log(256 TB) = 48, where 256 TB + * is the largest possible size (implementation defined value according + * to the HW spec). + * + * Bits 11:6 are reserved. + * + * Bits 63:12 are used to represent the base address of the region to lock. + * Only the upper bits of the address are used; lowest bits are cleared + * to avoid confusion. + * + * The address is aligned to a multiple of the region size. This has profound + * implications on the region size itself: often the MMU will lock a region + * larger than the given number of pages, because the lock region cannot start + * from any arbitrary address. * * Return: 0 if success, or an error code on failure. 
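 *
 * For illustration only (example values chosen here, assuming 4 kB pages,
 * not taken from the HW spec): locking 8 pages starting at PFN 0x1000 gives
 * a base address of 0x1000000 and a region size of 32 kB (log2 = 15), so the
 * encoded size field is 14 and the resulting value is 0x1000000 | 14 = 0x100000E.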
*/ static int lock_region(u64 pfn, u32 num_pages, u64 *lockaddr) { const u64 lockaddr_base = pfn << PAGE_SHIFT; - u64 lockaddr_size_log2, region_frame_number_start, - region_frame_number_end; + const u64 lockaddr_end = ((pfn + num_pages) << PAGE_SHIFT) - 1; + u64 lockaddr_size_log2; if (num_pages == 0) return -EINVAL; - /* The size is expressed as a logarithm and should take into account - * the possibility that some pages might spill into the next region. + /* The MMU lock region is a self-aligned region whose size + * is a power of 2 and that contains both start and end + * of the address range determined by pfn and num_pages. + * The size of the MMU lock region can be defined as the + * largest divisor that yields the same result when both + * start and end addresses are divided by it. + * + * For instance: pfn=0x4F000 num_pages=2 describe the + * address range between 0x4F000 and 0x50FFF. It is only + * 2 memory pages. However there isn't a single lock region + * of 8 kB that encompasses both addresses because 0x4F000 + * would fall into the [0x4E000, 0x4FFFF] region while + * 0x50000 would fall into the [0x50000, 0x51FFF] region. + * The minimum lock region size that includes the entire + * address range is 128 kB, and the region would be + * [0x40000, 0x5FFFF]. + * + * The region size can be found by comparing the desired + * start and end addresses and finding the highest bit + * that differs. The smallest naturally aligned region + * must include this bit change, hence the desired region + * starts with this bit (and subsequent bits) set to 0 + * and ends with the bit (and subsequent bits) set to 1. + * + * In the example above: 0x4F000 ^ 0x50FFF = 0x1FFFF + * therefore the highest bit that differs is bit #16 + * and the region size (as a logarithm) is 16 + 1 = 17, i.e. 128 kB. */ - lockaddr_size_log2 = fls(num_pages) + PAGE_SHIFT - 1; - - /* Round up if the number of pages is not a power of 2. */ - if (num_pages != ((u32)1 << (lockaddr_size_log2 - PAGE_SHIFT))) - lockaddr_size_log2 += 1; - - /* Round up if some memory pages spill into the next region. */ - region_frame_number_start = pfn >> (lockaddr_size_log2 - PAGE_SHIFT); - region_frame_number_end = - (pfn + num_pages - 1) >> (lockaddr_size_log2 - PAGE_SHIFT); - - if (region_frame_number_start < region_frame_number_end) - lockaddr_size_log2 += 1; - - /* Represent the size according to the HW specification. */ - lockaddr_size_log2 = MAX(lockaddr_size_log2, - KBASE_LOCK_REGION_MIN_SIZE_LOG2); + lockaddr_size_log2 = fls(lockaddr_base ^ lockaddr_end); + /* Cap the size against minimum and maximum values allowed. */ if (lockaddr_size_log2 > KBASE_LOCK_REGION_MAX_SIZE_LOG2) return -EINVAL; - /* The lowest bits are cleared and then set to size - 1 to represent - * the size in a way that is compatible with the HW specification. + lockaddr_size_log2 = + MAX(lockaddr_size_log2, KBASE_LOCK_REGION_MIN_SIZE_LOG2); + + /* Represent the result in a way that is compatible with HW spec. + * + * Upper bits are used for the base address, whose lower bits + * are cleared to avoid confusion because they are going to be ignored + * by the MMU anyway, since lock regions shall be aligned with + * a multiple of their size and cannot start from any address. + * + * Lower bits are used for the size, which is represented as + * logarithm minus one of the actual size. 
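 *
 * To complete the illustration from the example above: clearing the lowest
 * 17 bits of 0x4F000 gives a base of 0x40000, and the size is encoded as
 * 17 - 1 = 16, so *lockaddr ends up as 0x40000 | 16 = 0x40010.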
*/ *lockaddr = lockaddr_base & ~((1ull << lockaddr_size_log2) - 1); *lockaddr |= lockaddr_size_log2 - 1; @@ -170,20 +211,30 @@ void kbase_mmu_hw_configure(struct kbase_device *kbdev, struct kbase_as *as) } int kbase_mmu_hw_do_operation(struct kbase_device *kbdev, struct kbase_as *as, - u64 vpfn, u32 nr, u32 op, - unsigned int handling_irq) + struct kbase_mmu_hw_op_param *op_param) { int ret; + u64 lock_addr = 0x0; lockdep_assert_held(&kbdev->mmu_hw_mutex); - if (op == AS_COMMAND_UNLOCK) { + if (op_param->op == KBASE_MMU_OP_UNLOCK) { /* Unlock doesn't require a lock first */ ret = write_cmd(kbdev, as->number, AS_COMMAND_UNLOCK); - } else { - u64 lock_addr; - ret = lock_region(vpfn, nr, &lock_addr); + /* Wait for UNLOCK command to complete */ + ret = wait_ready(kbdev, as->number); + + if (!ret) { + /* read MMU_AS_CONTROL.LOCKADDR register */ + lock_addr |= (u64)kbase_reg_read(kbdev, + MMU_AS_REG(as->number, AS_LOCKADDR_HI)) << 32; + lock_addr |= (u64)kbase_reg_read(kbdev, + MMU_AS_REG(as->number, AS_LOCKADDR_LO)); + } + } else if (op_param->op >= KBASE_MMU_OP_FIRST && + op_param->op < KBASE_MMU_OP_COUNT) { + ret = lock_region(op_param->vpfn, op_param->nr, &lock_addr); if (!ret) { /* Lock the region that needs to be updated */ @@ -195,12 +246,49 @@ int kbase_mmu_hw_do_operation(struct kbase_device *kbdev, struct kbase_as *as, (lock_addr >> 32) & 0xFFFFFFFFUL); write_cmd(kbdev, as->number, AS_COMMAND_LOCK); - /* Run the MMU operation */ - write_cmd(kbdev, as->number, op); - - /* Wait for the flush to complete */ + /* Translate and send operation to HW */ + switch (op_param->op) { + case KBASE_MMU_OP_FLUSH_PT: + write_cmd(kbdev, as->number, + AS_COMMAND_FLUSH_PT); + break; + case KBASE_MMU_OP_FLUSH_MEM: + write_cmd(kbdev, as->number, + AS_COMMAND_FLUSH_MEM); + break; + case KBASE_MMU_OP_LOCK: + /* No further operation. */ + break; + default: + dev_warn(kbdev->dev, + "Unsupported MMU operation (op=%d).\n", + op_param->op); + return -EINVAL; + }; + + /* Wait for the command to complete */ ret = wait_ready(kbdev, as->number); } + } else { + /* Code should not reach here. */ + dev_warn(kbdev->dev, "Invalid mmu operation (op=%d).\n", + op_param->op); + return -EINVAL; + } + + /* MMU command instrumentation */ + if (!ret) { + u64 lock_addr_base = AS_LOCKADDR_LOCKADDR_BASE_GET(lock_addr); + u32 lock_addr_size = AS_LOCKADDR_LOCKADDR_SIZE_GET(lock_addr); + + bool is_mmu_synchronous = false; + + if (op_param->mmu_sync_info == CALLER_MMU_SYNC) + is_mmu_synchronous = true; + + KBASE_TLSTREAM_AUX_MMU_COMMAND(kbdev, op_param->kctx_id, + op_param->op, is_mmu_synchronous, + lock_addr_base, lock_addr_size); } return ret; diff --git a/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c b/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c index 16b928d..6ef4c9d 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c +++ b/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c @@ -42,6 +42,9 @@ #define ENTRY_ACCESS_BIT (1ULL << 10) #define ENTRY_NX_BIT (1ULL << 54) +#define UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR (55) +#define VALID_ENTRY_MASK ((u64)0xF << UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR) + /* Helper Function to perform assignment of page table entries, to * ensure the use of strd, which is required on LPAE systems. 
*/ @@ -85,6 +88,7 @@ static phys_addr_t pte_to_phy_addr(u64 entry) if (!(entry & 1)) return 0; + entry &= ~VALID_ENTRY_MASK; return entry & ~0xFFF; } @@ -151,10 +155,48 @@ static void entry_set_ate(u64 *entry, ENTRY_ACCESS_BIT | ENTRY_IS_ATE_L02); } -static void entry_set_pte(u64 *entry, phys_addr_t phy) +static unsigned int get_num_valid_entries(u64 *pgd) +{ + register unsigned int num_of_valid_entries; + + num_of_valid_entries = + (unsigned int)((pgd[2] & VALID_ENTRY_MASK) >> + (UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR - 8)); + num_of_valid_entries |= + (unsigned int)((pgd[1] & VALID_ENTRY_MASK) >> + (UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR - 4)); + num_of_valid_entries |= + (unsigned int)((pgd[0] & VALID_ENTRY_MASK) >> + (UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR)); + + return num_of_valid_entries; +} + +static void set_num_valid_entries(u64 *pgd, unsigned int num_of_valid_entries) +{ + WARN_ON_ONCE(num_of_valid_entries > KBASE_MMU_PAGE_ENTRIES); + + pgd[0] &= ~VALID_ENTRY_MASK; + pgd[0] |= ((u64)(num_of_valid_entries & 0xF) + << UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR); + + pgd[1] &= ~VALID_ENTRY_MASK; + pgd[1] |= ((u64)((num_of_valid_entries >> 4) & 0xF) + << UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR); + + pgd[2] &= ~VALID_ENTRY_MASK; + pgd[2] |= ((u64)((num_of_valid_entries >> 8) & 0xF) + << UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR); +} + +static void entry_set_pte(u64 *pgd, u64 vpfn, phys_addr_t phy) { - page_table_entry_set(entry, (phy & PAGE_MASK) | - ENTRY_ACCESS_BIT | ENTRY_IS_PTE); + unsigned int nr_entries = get_num_valid_entries(pgd); + + page_table_entry_set(&pgd[vpfn], (phy & PAGE_MASK) | ENTRY_ACCESS_BIT | + ENTRY_IS_PTE); + + set_num_valid_entries(pgd, nr_entries + 1); } static void entry_invalidate(u64 *entry) @@ -172,6 +214,8 @@ static struct kbase_mmu_mode const aarch64_mode = { .entry_set_ate = entry_set_ate, .entry_set_pte = entry_set_pte, .entry_invalidate = entry_invalidate, + .get_num_valid_entries = get_num_valid_entries, + .set_num_valid_entries = set_num_valid_entries, .flags = KBASE_MMU_MODE_HAS_NON_CACHEABLE }; diff --git a/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c b/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c index 3b84d74..9ae2c02 100644 --- a/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c +++ b/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c @@ -77,13 +77,28 @@ static int pm_callback_power_on(struct kbase_device *kbdev) { int ret = 1; /* Assume GPU has been powered off */ int error; + unsigned long flags; - dev_dbg(kbdev->dev, "pm_callback_power_on %p\n", + dev_dbg(kbdev->dev, "%s %p\n", __func__, (void *)kbdev->dev->pm_domain); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(kbdev->pm.backend.gpu_powered); +#if MALI_USE_CSF + if (likely(kbdev->csf.firmware_inited)) { + WARN_ON(!kbdev->pm.active_count); + WARN_ON(kbdev->pm.runtime_active); + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + enable_gpu_power_control(kbdev); + CSTD_UNUSED(error); +#else + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + enable_gpu_power_control(kbdev); error = pm_runtime_get_sync(kbdev->dev); + if (error == 1) { /* * Let core know that the chip has not been @@ -93,22 +108,93 @@ static int pm_callback_power_on(struct kbase_device *kbdev) } dev_dbg(kbdev->dev, "pm_runtime_get_sync returned %d\n", error); +#endif /* MALI_USE_CSF */ return ret; } static void pm_callback_power_off(struct kbase_device *kbdev) { - dev_dbg(kbdev->dev, "pm_callback_power_off\n"); + unsigned long flags; + + dev_dbg(kbdev->dev, "%s\n", __func__); 
+ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(kbdev->pm.backend.gpu_powered); +#if MALI_USE_CSF + if (likely(kbdev->csf.firmware_inited)) { + WARN_ON(kbase_csf_scheduler_get_nr_active_csgs(kbdev)); + WARN_ON(kbdev->pm.backend.mcu_state != KBASE_MCU_OFF); + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + /* Power down the GPU immediately */ + disable_gpu_power_control(kbdev); +#else /* MALI_USE_CSF */ + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +#ifdef KBASE_PM_RUNTIME pm_runtime_mark_last_busy(kbdev->dev); pm_runtime_put_autosuspend(kbdev->dev); - -#ifndef KBASE_PM_RUNTIME +#else + /* Power down the GPU immediately as runtime PM is disabled */ disable_gpu_power_control(kbdev); #endif +#endif /* MALI_USE_CSF */ +} + +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) +static void pm_callback_runtime_gpu_active(struct kbase_device *kbdev) +{ + unsigned long flags; + int error; + + lockdep_assert_held(&kbdev->pm.lock); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(!kbdev->pm.backend.gpu_powered); + WARN_ON(!kbdev->pm.active_count); + WARN_ON(kbdev->pm.runtime_active); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (pm_runtime_status_suspended(kbdev->dev)) { + error = pm_runtime_get_sync(kbdev->dev); + dev_dbg(kbdev->dev, "pm_runtime_get_sync returned %d", error); + } else { + /* Call the async version here, otherwise there could be + * a deadlock if the runtime suspend operation is ongoing. + * Caller would have taken the kbdev->pm.lock and/or the + * scheduler lock, and the runtime suspend callback function + * will also try to acquire the same lock(s). + */ + error = pm_runtime_get(kbdev->dev); + dev_dbg(kbdev->dev, "pm_runtime_get returned %d", error); + } + + kbdev->pm.runtime_active = true; } +static void pm_callback_runtime_gpu_idle(struct kbase_device *kbdev) +{ + unsigned long flags; + + lockdep_assert_held(&kbdev->pm.lock); + + dev_dbg(kbdev->dev, "%s", __func__); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(!kbdev->pm.backend.gpu_powered); + WARN_ON(kbdev->pm.backend.l2_state != KBASE_L2_OFF); + WARN_ON(kbdev->pm.active_count); + WARN_ON(!kbdev->pm.runtime_active); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + pm_runtime_mark_last_busy(kbdev->dev); + pm_runtime_put_autosuspend(kbdev->dev); + kbdev->pm.runtime_active = false; +} +#endif + #ifdef KBASE_PM_RUNTIME static int kbase_device_runtime_init(struct kbase_device *kbdev) { @@ -124,7 +210,12 @@ static int kbase_device_runtime_init(struct kbase_device *kbdev) if (!pm_runtime_enabled(kbdev->dev)) { dev_warn(kbdev->dev, "pm_runtime not enabled"); - ret = -ENOSYS; + ret = -EINVAL; + } else if (atomic_read(&kbdev->dev->power.usage_count)) { + dev_warn(kbdev->dev, + "%s: Device runtime usage count unexpectedly non zero %d", + __func__, atomic_read(&kbdev->dev->power.usage_count)); + ret = -EINVAL; } return ret; @@ -133,9 +224,15 @@ static int kbase_device_runtime_init(struct kbase_device *kbdev) static void kbase_device_runtime_disable(struct kbase_device *kbdev) { dev_dbg(kbdev->dev, "kbase_device_runtime_disable\n"); + + if (atomic_read(&kbdev->dev->power.usage_count)) + dev_warn(kbdev->dev, + "%s: Device runtime usage count unexpectedly non zero %d", + __func__, atomic_read(&kbdev->dev->power.usage_count)); + pm_runtime_disable(kbdev->dev); } -#endif +#endif /* KBASE_PM_RUNTIME */ static int pm_callback_runtime_on(struct kbase_device *kbdev) { @@ -180,6 +277,14 @@ struct kbase_pm_callback_conf pm_callbacks = { 
.power_runtime_on_callback = NULL, .power_runtime_off_callback = NULL, #endif /* KBASE_PM_RUNTIME */ + +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + .power_runtime_gpu_idle_callback = pm_callback_runtime_gpu_idle, + .power_runtime_gpu_active_callback = pm_callback_runtime_gpu_active, +#else + .power_runtime_gpu_idle_callback = NULL, + .power_runtime_gpu_active_callback = NULL, +#endif }; diff --git a/mali_kbase/tests/include/kutf/kutf_helpers.h b/mali_kbase/tests/include/kutf/kutf_helpers.h index c4c713c..79b1eac 100644 --- a/mali_kbase/tests/include/kutf/kutf_helpers.h +++ b/mali_kbase/tests/include/kutf/kutf_helpers.h @@ -81,4 +81,17 @@ int kutf_helper_input_enqueue(struct kutf_context *context, */ void kutf_helper_input_enqueue_end_of_data(struct kutf_context *context); +/* kutf_helper_external_reset_gpu() - Mimic power-on-reset using external reset + * + * Reset GPU using FPGA SYSCTL register. + * + * Note that + * - It must be called on the platform that has FPGA SYSCTL + * register available such as Juno board. + * - It won't reinitialize GPU related settings such as interrupt for kbase. + * + * Return: 0 on success, negative value otherwise. + */ +int kutf_helper_external_reset_gpu(void); + #endif /* _KERNEL_UTF_HELPERS_H_ */ diff --git a/mali_kbase/tests/kutf/kutf_helpers.c b/mali_kbase/tests/kutf/kutf_helpers.c index c075428..d76cebe 100644 --- a/mali_kbase/tests/kutf/kutf_helpers.c +++ b/mali_kbase/tests/kutf/kutf_helpers.c @@ -21,7 +21,6 @@ /* Kernel UTF test helpers */ #include <kutf/kutf_helpers.h> - #include <linux/err.h> #include <linux/jiffies.h> #include <linux/sched.h> @@ -29,6 +28,10 @@ #include <linux/wait.h> #include <linux/uaccess.h> #include <linux/export.h> +#include <linux/io.h> +#include <linux/delay.h> +#include "gpu/mali_kbase_gpu_regmap.h" +#include <device/mali_kbase_device.h> static DEFINE_SPINLOCK(kutf_input_lock); @@ -128,3 +131,44 @@ void kutf_helper_input_enqueue_end_of_data(struct kutf_context *context) { kutf_helper_input_enqueue(context, NULL, 0); } + +/* Values are taken from juno-fpga.dtsi */ +#define FPGA_SYSCTL_START_ADDR ((resource_size_t)0x6f020000) +#define FPGA_SYSCTL_SIZE ((size_t)0xCC) + +/* Offset of FPGA_SYSCTL_GPU_RESET_REG register */ +#define FPGA_SYSCTL_GPU_RESET_REG 0x64 +#define GPU_RESET_HIGH 0x1 +#define GPU_RESET_LOW 0x0 + +int kutf_helper_external_reset_gpu(void) +{ + void __iomem *regs = NULL; + void __iomem *gpu_reset_reg = NULL; + int error = -ENXIO; + int repeat = 100; + + regs = ioremap(FPGA_SYSCTL_START_ADDR, FPGA_SYSCTL_SIZE); + if (!regs) + return -ENOMEM; + + /* Reset GPU via SYSCTL_GPU_RESET by rising & falling the reset signal */ + gpu_reset_reg = regs + FPGA_SYSCTL_GPU_RESET_REG; + while (error && repeat--) { + writel(GPU_RESET_HIGH, gpu_reset_reg); + if (readl(gpu_reset_reg) == GPU_RESET_HIGH) { + mdelay(100); + writel(GPU_RESET_LOW, gpu_reset_reg); + mdelay(100); + + /* Succeed in resetting GPU */ + if (readl(gpu_reset_reg) == GPU_RESET_LOW) + error = 0; + } + } + + iounmap(regs); + + return error; +} +EXPORT_SYMBOL(kutf_helper_external_reset_gpu); diff --git a/mali_kbase/tests/kutf/kutf_suite.c b/mali_kbase/tests/kutf/kutf_suite.c index 6745299..d45d9df 100644 --- a/mali_kbase/tests/kutf/kutf_suite.c +++ b/mali_kbase/tests/kutf/kutf_suite.c @@ -582,7 +582,7 @@ static int create_fixture_variant(struct kutf_test_function *test_func, snprintf(name, sizeof(name), "%d", fixture_index); test_fix->dir = debugfs_create_dir(name, test_func->dir); - if (!test_func->dir) { + if (IS_ERR_OR_NULL(test_func->dir)) { 
pr_err("Failed to create debugfs directory when adding fixture\n"); /* Might not be the right error, we don't get it passed back to us */ err = -EEXIST; @@ -591,7 +591,7 @@ static int create_fixture_variant(struct kutf_test_function *test_func, tmp = debugfs_create_file("type", S_IROTH, test_fix->dir, "fixture\n", &kutf_debugfs_const_string_ops); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"type\" when adding fixture\n"); /* Might not be the right error, we don't get it passed back to us */ err = -EEXIST; @@ -606,7 +606,7 @@ static int create_fixture_variant(struct kutf_test_function *test_func, "run", 0600, test_fix->dir, test_fix, &kutf_debugfs_run_ops); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"run\" when adding fixture\n"); /* Might not be the right error, we don't get it passed back to us */ err = -EEXIST; @@ -666,14 +666,14 @@ void kutf_add_test_with_filters_and_data( INIT_LIST_HEAD(&test_func->variant_list); test_func->dir = debugfs_create_dir(name, suite->dir); - if (!test_func->dir) { + if (IS_ERR_OR_NULL(test_func->dir)) { pr_err("Failed to create debugfs directory when adding test %s\n", name); goto fail_dir; } tmp = debugfs_create_file("type", S_IROTH, test_func->dir, "test\n", &kutf_debugfs_const_string_ops); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"type\" when adding test %s\n", name); goto fail_file; } @@ -686,7 +686,7 @@ void kutf_add_test_with_filters_and_data( tmp = debugfs_create_x32("filters", S_IROTH, test_func->dir, &test_func->filters); #endif - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"filters\" when adding test %s\n", name); goto fail_file; } @@ -698,7 +698,7 @@ void kutf_add_test_with_filters_and_data( #else tmp = debugfs_create_u32("test_id", S_IROTH, test_func->dir, &test_func->test_id); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"test_id\" when adding test %s\n", name); goto fail_file; } @@ -805,14 +805,14 @@ struct kutf_suite *kutf_create_suite_with_filters_and_data( } suite->dir = debugfs_create_dir(name, app->dir); - if (!suite->dir) { + if (IS_ERR_OR_NULL(suite->dir)) { pr_err("Failed to create debugfs directory when adding test %s\n", name); goto fail_debugfs; } tmp = debugfs_create_file("type", S_IROTH, suite->dir, "suite\n", &kutf_debugfs_const_string_ops); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"type\" when adding test %s\n", name); goto fail_file; } @@ -913,14 +913,14 @@ struct kutf_application *kutf_create_application(const char *name) } app->dir = debugfs_create_dir(name, base_dir); - if (!app->dir) { + if (IS_ERR_OR_NULL(app->dir)) { pr_err("Failed to create debugfs direcotry when creating application %s\n", name); goto fail_debugfs; } tmp = debugfs_create_file("type", S_IROTH, app->dir, "application\n", &kutf_debugfs_const_string_ops); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"type\" when creating application %s\n", name); goto fail_file; } @@ -1172,7 +1172,7 @@ static int __init init_kutf_core(void) return -ENOMEM; base_dir = debugfs_create_dir("kutf_tests", NULL); - if (!base_dir) { + if (IS_ERR_OR_NULL(base_dir)) { destroy_workqueue(kutf_workq); kutf_workq = NULL; return -ENOMEM; diff --git a/mali_kbase/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c b/mali_kbase/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c index 
5e9a2e7..87bcb31 100644 --- a/mali_kbase/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c +++ b/mali_kbase/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c @@ -530,7 +530,7 @@ static bool kutf_clk_trace_process_portal_cmd(struct kutf_context *context, errmsg = kutf_clk_trace_do_get_platform(context, cmd); break; case PORTAL_CMD_GET_CLK_RATE_MGR: - /* Fall through */ + fallthrough; case PORTAL_CMD_GET_CLK_RATE_TRACE: errmsg = kutf_clk_trace_do_get_rate(context, cmd); break; @@ -538,7 +538,7 @@ static bool kutf_clk_trace_process_portal_cmd(struct kutf_context *context, errmsg = kutf_clk_trace_do_get_snapshot(context, cmd); break; case PORTAL_CMD_INC_PM_CTX_CNT: - /* Fall through */ + fallthrough; case PORTAL_CMD_DEC_PM_CTX_CNT: errmsg = kutf_clk_trace_do_change_pm_ctx(context, cmd); break; diff --git a/mali_kbase/tl/backend/mali_kbase_timeline_csf.c b/mali_kbase/tl/backend/mali_kbase_timeline_csf.c index a2868da..c101563 100644 --- a/mali_kbase/tl/backend/mali_kbase_timeline_csf.c +++ b/mali_kbase/tl/backend/mali_kbase_timeline_csf.c @@ -25,6 +25,8 @@ #include <mali_kbase.h> +#define GPU_FEATURES_CROSS_STREAM_SYNC_MASK (1ull << 3ull) + void kbase_create_timeline_objects(struct kbase_device *kbdev) { unsigned int as_nr; @@ -33,6 +35,15 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev) struct kbase_timeline *timeline = kbdev->timeline; struct kbase_tlstream *summary = &kbdev->timeline->streams[TL_STREAM_TYPE_OBJ_SUMMARY]; + u32 const kbdev_has_cross_stream_sync = + (kbdev->gpu_props.props.raw_props.gpu_features & + GPU_FEATURES_CROSS_STREAM_SYNC_MASK) ? + 1 : + 0; + u32 const arch_maj = (kbdev->gpu_props.props.raw_props.gpu_id & + GPU_ID2_ARCH_MAJOR) >> + GPU_ID2_ARCH_MAJOR_SHIFT; + u32 const num_sb_entries = arch_maj >= 11 ? 16 : 8; /* Summarize the Address Space objects. */ for (as_nr = 0; as_nr < kbdev->nr_hw_address_spaces; as_nr++) @@ -51,10 +62,11 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev) kbdev); /* Trace the creation of a new kbase device and set its properties. */ - __kbase_tlstream_tl_kbase_new_device(summary, - kbdev->gpu_props.props.raw_props.gpu_id, + __kbase_tlstream_tl_kbase_new_device( + summary, kbdev->gpu_props.props.raw_props.gpu_id, kbdev->gpu_props.num_cores, kbdev->csf.global_iface.group_num, - kbdev->nr_hw_address_spaces); + kbdev->nr_hw_address_spaces, num_sb_entries, + kbdev_has_cross_stream_sync); /* Lock the context list, to ensure no changes to the list are made * while we're summarizing the contexts and their contents. @@ -74,9 +86,10 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev) kbdev->csf.scheduler.csg_slots[slot_i].resident_group; if (group) - __kbase_tlstream_tl_kbase_device_program_csg(summary, + __kbase_tlstream_tl_kbase_device_program_csg( + summary, kbdev->gpu_props.props.raw_props.gpu_id, - group->handle, slot_i); + group->kctx->id, group->handle, slot_i); } /* Reset body stream buffers while holding the kctx lock. diff --git a/mali_kbase/tl/mali_kbase_timeline.c b/mali_kbase/tl/mali_kbase_timeline.c index 09818a5..af10cf5 100644 --- a/mali_kbase/tl/mali_kbase_timeline.c +++ b/mali_kbase/tl/mali_kbase_timeline.c @@ -224,13 +224,6 @@ int kbase_timeline_io_acquire(struct kbase_device *kbdev, u32 flags) timeline->obj_header_btc = obj_desc_header_size; timeline->aux_header_btc = aux_desc_header_size; - /* Start autoflush timer. 
*/ - atomic_set(&timeline->autoflush_timer_active, 1); - rcode = mod_timer( - &timeline->autoflush_timer, - jiffies + msecs_to_jiffies(AUTOFLUSH_INTERVAL)); - CSTD_UNUSED(rcode); - #if !MALI_USE_CSF /* If job dumping is enabled, readjust the software event's * timeout as the default value of 3 seconds is often @@ -258,6 +251,16 @@ int kbase_timeline_io_acquire(struct kbase_device *kbdev, u32 flags) kbase_tlstream_current_devfreq_target(kbdev); #endif /* CONFIG_MALI_DEVFREQ */ + /* Start the autoflush timer. + * We must do this after creating timeline objects to ensure we + * don't auto-flush the streams which will be reset during the + * summarization process. + */ + atomic_set(&timeline->autoflush_timer_active, 1); + rcode = mod_timer(&timeline->autoflush_timer, + jiffies + + msecs_to_jiffies(AUTOFLUSH_INTERVAL)); + CSTD_UNUSED(rcode); } else { ret = -EBUSY; } diff --git a/mali_kbase/tl/mali_kbase_tracepoints.c b/mali_kbase/tl/mali_kbase_tracepoints.c index 2c0de01..54e51f8 100644 --- a/mali_kbase/tl/mali_kbase_tracepoints.c +++ b/mali_kbase/tl/mali_kbase_tracepoints.c @@ -74,6 +74,7 @@ enum tl_msg_id_obj { KBASE_TL_KBASE_NEW_DEVICE, KBASE_TL_KBASE_DEVICE_PROGRAM_CSG, KBASE_TL_KBASE_DEVICE_DEPROGRAM_CSG, + KBASE_TL_KBASE_DEVICE_HALT_CSG, KBASE_TL_KBASE_NEW_CTX, KBASE_TL_KBASE_DEL_CTX, KBASE_TL_KBASE_CTX_ASSIGN_AS, @@ -121,6 +122,17 @@ enum tl_msg_id_obj { KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_END, KBASE_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW, KBASE_TL_KBASE_CSFFW_RESET, + KBASE_TL_JS_SCHED_START, + KBASE_TL_JS_SCHED_END, + KBASE_TL_JD_SUBMIT_ATOM_START, + KBASE_TL_JD_SUBMIT_ATOM_END, + KBASE_TL_JD_DONE_NO_LOCK_START, + KBASE_TL_JD_DONE_NO_LOCK_END, + KBASE_TL_JD_DONE_START, + KBASE_TL_JD_DONE_END, + KBASE_TL_JD_ATOM_COMPLETE, + KBASE_TL_RUN_ATOM_START, + KBASE_TL_RUN_ATOM_END, KBASE_OBJ_MSG_COUNT, }; @@ -137,6 +149,7 @@ enum tl_msg_id_aux { KBASE_AUX_JIT_STATS, KBASE_AUX_TILER_HEAP_STATS, KBASE_AUX_EVENT_JOB_SLOT, + KBASE_AUX_MMU_COMMAND, KBASE_AUX_MSG_COUNT, }; @@ -299,16 +312,20 @@ enum tl_msg_id_aux { "gpu") \ TRACEPOINT_DESC(KBASE_TL_KBASE_NEW_DEVICE, \ "New KBase Device", \ - "@IIII", \ - "kbase_device_id,kbase_device_gpu_core_count,kbase_device_max_num_csgs,kbase_device_as_count") \ + "@IIIIII", \ + "kbase_device_id,kbase_device_gpu_core_count,kbase_device_max_num_csgs,kbase_device_as_count,kbase_device_sb_entry_count,kbase_device_has_cross_stream_sync") \ TRACEPOINT_DESC(KBASE_TL_KBASE_DEVICE_PROGRAM_CSG, \ "CSG is programmed to a slot", \ - "@III", \ - "kbase_device_id,gpu_cmdq_grp_handle,kbase_device_csg_slot_index") \ + "@IIII", \ + "kbase_device_id,kernel_ctx_id,gpu_cmdq_grp_handle,kbase_device_csg_slot_index") \ TRACEPOINT_DESC(KBASE_TL_KBASE_DEVICE_DEPROGRAM_CSG, \ "CSG is deprogrammed from a slot", \ "@II", \ "kbase_device_id,kbase_device_csg_slot_index") \ + TRACEPOINT_DESC(KBASE_TL_KBASE_DEVICE_HALT_CSG, \ + "CSG is halted", \ + "@II", \ + "kbase_device_id,kbase_device_csg_slot_index") \ TRACEPOINT_DESC(KBASE_TL_KBASE_NEW_CTX, \ "New KBase Context", \ "@II", \ @@ -497,6 +514,50 @@ enum tl_msg_id_aux { "A reset has happened with the CSFFW", \ "@L", \ "csffw_cycle") \ + TRACEPOINT_DESC(KBASE_TL_JS_SCHED_START, \ + "Scheduling starts", \ + "@I", \ + "dummy") \ + TRACEPOINT_DESC(KBASE_TL_JS_SCHED_END, \ + "Scheduling ends", \ + "@I", \ + "dummy") \ + TRACEPOINT_DESC(KBASE_TL_JD_SUBMIT_ATOM_START, \ + "Submitting an atom starts", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_JD_SUBMIT_ATOM_END, \ + "Submitting an atom ends", \ + "@p", \ + "atom") \ + 
TRACEPOINT_DESC(KBASE_TL_JD_DONE_NO_LOCK_START, \ + "Within function jd_done_nolock", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_JD_DONE_NO_LOCK_END, \ + "Within function jd_done_nolock - end", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_JD_DONE_START, \ + "Start of kbase_jd_done", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_JD_DONE_END, \ + "End of kbase_jd_done", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_JD_ATOM_COMPLETE, \ + "Atom marked complete", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_RUN_ATOM_START, \ + "Running of atom starts", \ + "@pI", \ + "atom,atom_nr") \ + TRACEPOINT_DESC(KBASE_TL_RUN_ATOM_END, \ + "Running of atom ends", \ + "@pI", \ + "atom,atom_nr") \ #define MIPE_HEADER_BLOB_VAR_NAME __obj_desc_header #define MIPE_HEADER_STREAM_ID TL_STREAM_ID_KERNEL @@ -554,6 +615,10 @@ const size_t obj_desc_header_size = sizeof(__obj_desc_header); "event on a given job slot", \ "@pIII", \ "ctx,slot_nr,atom_nr,event") \ + TRACEPOINT_DESC(KBASE_AUX_MMU_COMMAND, \ + "mmu commands with synchronicity info", \ + "@IIILI", \ + "kernel_ctx_id,mmu_cmd_id,mmu_synchronicity,mmu_lock_addr,mmu_lock_page_num") \ #define MIPE_HEADER_BLOB_VAR_NAME __aux_desc_header #define MIPE_HEADER_STREAM_ID TL_STREAM_ID_KERNEL @@ -1936,12 +2001,52 @@ void __kbase_tlstream_aux_event_job_slot( kbase_tlstream_msgbuf_release(stream, acq_flags); } +void __kbase_tlstream_aux_mmu_command( + struct kbase_tlstream *stream, + u32 kernel_ctx_id, + u32 mmu_cmd_id, + u32 mmu_synchronicity, + u64 mmu_lock_addr, + u32 mmu_lock_page_num) +{ + const u32 msg_id = KBASE_AUX_MMU_COMMAND; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(kernel_ctx_id) + + sizeof(mmu_cmd_id) + + sizeof(mmu_synchronicity) + + sizeof(mmu_lock_addr) + + sizeof(mmu_lock_page_num) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &kernel_ctx_id, sizeof(kernel_ctx_id)); + pos = kbasep_serialize_bytes(buffer, + pos, &mmu_cmd_id, sizeof(mmu_cmd_id)); + pos = kbasep_serialize_bytes(buffer, + pos, &mmu_synchronicity, sizeof(mmu_synchronicity)); + pos = kbasep_serialize_bytes(buffer, + pos, &mmu_lock_addr, sizeof(mmu_lock_addr)); + pos = kbasep_serialize_bytes(buffer, + pos, &mmu_lock_page_num, sizeof(mmu_lock_page_num)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + void __kbase_tlstream_tl_kbase_new_device( struct kbase_tlstream *stream, u32 kbase_device_id, u32 kbase_device_gpu_core_count, u32 kbase_device_max_num_csgs, - u32 kbase_device_as_count) + u32 kbase_device_as_count, + u32 kbase_device_sb_entry_count, + u32 kbase_device_has_cross_stream_sync) { const u32 msg_id = KBASE_TL_KBASE_NEW_DEVICE; const size_t msg_size = sizeof(msg_id) + sizeof(u64) @@ -1949,6 +2054,8 @@ void __kbase_tlstream_tl_kbase_new_device( + sizeof(kbase_device_gpu_core_count) + sizeof(kbase_device_max_num_csgs) + sizeof(kbase_device_as_count) + + sizeof(kbase_device_sb_entry_count) + + sizeof(kbase_device_has_cross_stream_sync) ; char *buffer; unsigned long acq_flags; @@ -1966,6 +2073,10 @@ void __kbase_tlstream_tl_kbase_new_device( pos, &kbase_device_max_num_csgs, sizeof(kbase_device_max_num_csgs)); pos = kbasep_serialize_bytes(buffer, pos, &kbase_device_as_count, sizeof(kbase_device_as_count)); + pos = kbasep_serialize_bytes(buffer, + pos, 
&kbase_device_sb_entry_count, sizeof(kbase_device_sb_entry_count)); + pos = kbasep_serialize_bytes(buffer, + pos, &kbase_device_has_cross_stream_sync, sizeof(kbase_device_has_cross_stream_sync)); kbase_tlstream_msgbuf_release(stream, acq_flags); } @@ -1973,12 +2084,14 @@ void __kbase_tlstream_tl_kbase_new_device( void __kbase_tlstream_tl_kbase_device_program_csg( struct kbase_tlstream *stream, u32 kbase_device_id, + u32 kernel_ctx_id, u32 gpu_cmdq_grp_handle, u32 kbase_device_csg_slot_index) { const u32 msg_id = KBASE_TL_KBASE_DEVICE_PROGRAM_CSG; const size_t msg_size = sizeof(msg_id) + sizeof(u64) + sizeof(kbase_device_id) + + sizeof(kernel_ctx_id) + sizeof(gpu_cmdq_grp_handle) + sizeof(kbase_device_csg_slot_index) ; @@ -1993,6 +2106,8 @@ void __kbase_tlstream_tl_kbase_device_program_csg( pos = kbasep_serialize_bytes(buffer, pos, &kbase_device_id, sizeof(kbase_device_id)); pos = kbasep_serialize_bytes(buffer, + pos, &kernel_ctx_id, sizeof(kernel_ctx_id)); + pos = kbasep_serialize_bytes(buffer, pos, &gpu_cmdq_grp_handle, sizeof(gpu_cmdq_grp_handle)); pos = kbasep_serialize_bytes(buffer, pos, &kbase_device_csg_slot_index, sizeof(kbase_device_csg_slot_index)); @@ -2026,6 +2141,32 @@ void __kbase_tlstream_tl_kbase_device_deprogram_csg( kbase_tlstream_msgbuf_release(stream, acq_flags); } +void __kbase_tlstream_tl_kbase_device_halt_csg( + struct kbase_tlstream *stream, + u32 kbase_device_id, + u32 kbase_device_csg_slot_index) +{ + const u32 msg_id = KBASE_TL_KBASE_DEVICE_HALT_CSG; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(kbase_device_id) + + sizeof(kbase_device_csg_slot_index) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &kbase_device_id, sizeof(kbase_device_id)); + pos = kbasep_serialize_bytes(buffer, + pos, &kbase_device_csg_slot_index, sizeof(kbase_device_csg_slot_index)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + void __kbase_tlstream_tl_kbase_new_ctx( struct kbase_tlstream *stream, u32 kernel_ctx_id, @@ -3216,4 +3357,254 @@ void __kbase_tlstream_tl_kbase_csffw_reset( kbase_tlstream_msgbuf_release(stream, acq_flags); } +void __kbase_tlstream_tl_js_sched_start( + struct kbase_tlstream *stream, + u32 dummy) +{ + const u32 msg_id = KBASE_TL_JS_SCHED_START; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(dummy) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &dummy, sizeof(dummy)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_js_sched_end( + struct kbase_tlstream *stream, + u32 dummy) +{ + const u32 msg_id = KBASE_TL_JS_SCHED_END; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(dummy) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &dummy, sizeof(dummy)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + 
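+/* Each writer below follows the same pattern: acquire a message buffer
+ * from the stream, serialize the 32-bit message ID, a 64-bit timestamp
+ * and then each argument in parameter order, and finally release the
+ * buffer so the packet becomes visible to timeline readers.
+ */
+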
+void __kbase_tlstream_tl_jd_submit_atom_start( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_SUBMIT_ATOM_START; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_submit_atom_end( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_SUBMIT_ATOM_END; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_done_no_lock_start( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_DONE_NO_LOCK_START; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_done_no_lock_end( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_DONE_NO_LOCK_END; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_done_start( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_DONE_START; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_done_end( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_DONE_END; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = 
kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_atom_complete( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_ATOM_COMPLETE; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_run_atom_start( + struct kbase_tlstream *stream, + const void *atom, + u32 atom_nr) +{ + const u32 msg_id = KBASE_TL_RUN_ATOM_START; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + + sizeof(atom_nr) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + pos = kbasep_serialize_bytes(buffer, + pos, &atom_nr, sizeof(atom_nr)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_run_atom_end( + struct kbase_tlstream *stream, + const void *atom, + u32 atom_nr) +{ + const u32 msg_id = KBASE_TL_RUN_ATOM_END; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + + sizeof(atom_nr) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + pos = kbasep_serialize_bytes(buffer, + pos, &atom_nr, sizeof(atom_nr)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + /* clang-format on */ diff --git a/mali_kbase/tl/mali_kbase_tracepoints.h b/mali_kbase/tl/mali_kbase_tracepoints.h index 887a1aa..3fc871c 100644 --- a/mali_kbase/tl/mali_kbase_tracepoints.h +++ b/mali_kbase/tl/mali_kbase_tracepoints.h @@ -296,21 +296,35 @@ void __kbase_tlstream_aux_event_job_slot( u32 slot_nr, u32 atom_nr, u32 event); +void __kbase_tlstream_aux_mmu_command( + struct kbase_tlstream *stream, + u32 kernel_ctx_id, + u32 mmu_cmd_id, + u32 mmu_synchronicity, + u64 mmu_lock_addr, + u32 mmu_lock_page_num); void __kbase_tlstream_tl_kbase_new_device( struct kbase_tlstream *stream, u32 kbase_device_id, u32 kbase_device_gpu_core_count, u32 kbase_device_max_num_csgs, - u32 kbase_device_as_count); + u32 kbase_device_as_count, + u32 kbase_device_sb_entry_count, + u32 kbase_device_has_cross_stream_sync); void __kbase_tlstream_tl_kbase_device_program_csg( struct kbase_tlstream *stream, u32 kbase_device_id, + u32 kernel_ctx_id, u32 gpu_cmdq_grp_handle, u32 kbase_device_csg_slot_index); void __kbase_tlstream_tl_kbase_device_deprogram_csg( struct kbase_tlstream *stream, u32 kbase_device_id, u32 kbase_device_csg_slot_index); +void __kbase_tlstream_tl_kbase_device_halt_csg( + struct kbase_tlstream *stream, + u32 kbase_device_id, + u32 kbase_device_csg_slot_index); void __kbase_tlstream_tl_kbase_new_ctx( struct kbase_tlstream *stream, u32 kernel_ctx_id, @@ -491,6 +505,41 @@ void 
__kbase_tlstream_tl_kbase_csffw_tlstream_overflow( void __kbase_tlstream_tl_kbase_csffw_reset( struct kbase_tlstream *stream, u64 csffw_cycle); +void __kbase_tlstream_tl_js_sched_start( + struct kbase_tlstream *stream, + u32 dummy); +void __kbase_tlstream_tl_js_sched_end( + struct kbase_tlstream *stream, + u32 dummy); +void __kbase_tlstream_tl_jd_submit_atom_start( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_submit_atom_end( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_done_no_lock_start( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_done_no_lock_end( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_done_start( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_done_end( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_atom_complete( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_run_atom_start( + struct kbase_tlstream *stream, + const void *atom, + u32 atom_nr); +void __kbase_tlstream_tl_run_atom_end( + struct kbase_tlstream *stream, + const void *atom, + u32 atom_nr); struct kbase_tlstream; @@ -1593,14 +1642,48 @@ struct kbase_tlstream; } while (0) /** + * KBASE_TLSTREAM_AUX_MMU_COMMAND - + * mmu commands with synchronicity info + * + * @kbdev: Kbase device + * @kernel_ctx_id: Unique ID for the KBase Context + * @mmu_cmd_id: MMU Command ID (e.g AS_COMMAND_UPDATE) + * @mmu_synchronicity: Indicates whether the command is related to current running job + * that needs to be resolved to make it progress (synchronous, e.g. + * grow on page fault, JIT) or not (asynchronous, e.g. IOCTL calls + * from user-space). This param will be 0 if it is an asynchronous + * operation. 
+ * @mmu_lock_addr: start address of regions to be locked/unlocked/invalidated + * @mmu_lock_page_num: number of pages to be locked/unlocked/invalidated + */ +#define KBASE_TLSTREAM_AUX_MMU_COMMAND( \ + kbdev, \ + kernel_ctx_id, \ + mmu_cmd_id, \ + mmu_synchronicity, \ + mmu_lock_addr, \ + mmu_lock_page_num \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_aux_mmu_command( \ + __TL_DISPATCH_STREAM(kbdev, aux), \ + kernel_ctx_id, mmu_cmd_id, mmu_synchronicity, mmu_lock_addr, mmu_lock_page_num); \ + } while (0) + +/** * KBASE_TLSTREAM_TL_KBASE_NEW_DEVICE - * New KBase Device * * @kbdev: Kbase device - * @kbase_device_id: The id of the physical hardware + * @kbase_device_id: The ID of the physical hardware * @kbase_device_gpu_core_count: The number of gpu cores in the physical hardware * @kbase_device_max_num_csgs: The max number of CSGs the physical hardware supports * @kbase_device_as_count: The number of address spaces the physical hardware has available + * @kbase_device_sb_entry_count: The number of entries each scoreboard set in the + * physical hardware has available + * @kbase_device_has_cross_stream_sync: Whether cross-stream synchronization is supported */ #if MALI_USE_CSF #define KBASE_TLSTREAM_TL_KBASE_NEW_DEVICE( \ @@ -1608,14 +1691,16 @@ struct kbase_tlstream; kbase_device_id, \ kbase_device_gpu_core_count, \ kbase_device_max_num_csgs, \ - kbase_device_as_count \ + kbase_device_as_count, \ + kbase_device_sb_entry_count, \ + kbase_device_has_cross_stream_sync \ ) \ do { \ int enabled = atomic_read(&kbdev->timeline_flags); \ if (enabled & BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS) \ __kbase_tlstream_tl_kbase_new_device( \ __TL_DISPATCH_STREAM(kbdev, obj), \ - kbase_device_id, kbase_device_gpu_core_count, kbase_device_max_num_csgs, kbase_device_as_count); \ + kbase_device_id, kbase_device_gpu_core_count, kbase_device_max_num_csgs, kbase_device_as_count, kbase_device_sb_entry_count, kbase_device_has_cross_stream_sync); \ } while (0) #else #define KBASE_TLSTREAM_TL_KBASE_NEW_DEVICE( \ @@ -1623,7 +1708,9 @@ struct kbase_tlstream; kbase_device_id, \ kbase_device_gpu_core_count, \ kbase_device_max_num_csgs, \ - kbase_device_as_count \ + kbase_device_as_count, \ + kbase_device_sb_entry_count, \ + kbase_device_has_cross_stream_sync \ ) \ do { } while (0) #endif /* MALI_USE_CSF */ @@ -1633,7 +1720,8 @@ struct kbase_tlstream; * CSG is programmed to a slot * * @kbdev: Kbase device - * @kbase_device_id: The id of the physical hardware + * @kbase_device_id: The ID of the physical hardware + * @kernel_ctx_id: Unique ID for the KBase Context * @gpu_cmdq_grp_handle: GPU Command Queue Group handle which will match userspace * @kbase_device_csg_slot_index: The index of the slot in the scheduler being programmed */ @@ -1641,6 +1729,7 @@ struct kbase_tlstream; #define KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG( \ kbdev, \ kbase_device_id, \ + kernel_ctx_id, \ gpu_cmdq_grp_handle, \ kbase_device_csg_slot_index \ ) \ @@ -1649,12 +1738,13 @@ struct kbase_tlstream; if (enabled & BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS) \ __kbase_tlstream_tl_kbase_device_program_csg( \ __TL_DISPATCH_STREAM(kbdev, obj), \ - kbase_device_id, gpu_cmdq_grp_handle, kbase_device_csg_slot_index); \ + kbase_device_id, kernel_ctx_id, gpu_cmdq_grp_handle, kbase_device_csg_slot_index); \ } while (0) #else #define KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG( \ kbdev, \ kbase_device_id, \ + kernel_ctx_id, \ gpu_cmdq_grp_handle, \ kbase_device_csg_slot_index \ ) \ @@ 
-1666,7 +1756,7 @@ struct kbase_tlstream; * CSG is deprogrammed from a slot * * @kbdev: Kbase device - * @kbase_device_id: The id of the physical hardware + * @kbase_device_id: The ID of the physical hardware * @kbase_device_csg_slot_index: The index of the slot in the scheduler being programmed */ #if MALI_USE_CSF @@ -1692,12 +1782,33 @@ struct kbase_tlstream; #endif /* MALI_USE_CSF */ /** + * KBASE_TLSTREAM_TL_KBASE_DEVICE_HALT_CSG - + * CSG is halted + * + * @kbdev: Kbase device + * @kbase_device_id: The ID of the physical hardware + * @kbase_device_csg_slot_index: The index of the slot in the scheduler being programmed + */ +#define KBASE_TLSTREAM_TL_KBASE_DEVICE_HALT_CSG( \ + kbdev, \ + kbase_device_id, \ + kbase_device_csg_slot_index \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_kbase_device_halt_csg( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + kbase_device_id, kbase_device_csg_slot_index); \ + } while (0) + +/** * KBASE_TLSTREAM_TL_KBASE_NEW_CTX - * New KBase Context * * @kbdev: Kbase device * @kernel_ctx_id: Unique ID for the KBase Context - * @kbase_device_id: The id of the physical hardware + * @kbase_device_id: The ID of the physical hardware */ #if MALI_USE_CSF #define KBASE_TLSTREAM_TL_KBASE_NEW_CTX( \ @@ -1935,7 +2046,7 @@ struct kbase_tlstream; * @cqs_obj_gpu_addr: CQS Object GPU pointer * @cqs_obj_compare_value: Semaphore value that should be exceeded * for the WAIT to pass - * @cqs_obj_inherit_error: Indicates the error state should be inherited into the queue or not + * @cqs_obj_inherit_error: Flag which indicates if the CQS object error state should be inherited by the queue */ #if MALI_USE_CSF #define KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_CQS_WAIT( \ @@ -3091,6 +3202,219 @@ struct kbase_tlstream; do { } while (0) #endif /* MALI_USE_CSF */ +/** + * KBASE_TLSTREAM_TL_JS_SCHED_START - + * Scheduling starts + * + * @kbdev: Kbase device + * @dummy: dummy argument + */ +#define KBASE_TLSTREAM_TL_JS_SCHED_START( \ + kbdev, \ + dummy \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_js_sched_start( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + dummy); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JS_SCHED_END - + * Scheduling ends + * + * @kbdev: Kbase device + * @dummy: dummy argument + */ +#define KBASE_TLSTREAM_TL_JS_SCHED_END( \ + kbdev, \ + dummy \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_js_sched_end( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + dummy); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_START - + * Submitting an atom starts + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_START( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_submit_atom_start( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_END - + * Submitting an atom ends + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_END( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_submit_atom_end( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * 
KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_START - + * Within function jd_done_nolock + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_START( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_done_no_lock_start( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_END - + * Within function jd_done_nolock - end + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_END( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_done_no_lock_end( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_DONE_START - + * Start of kbase_jd_done + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_DONE_START( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_done_start( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_DONE_END - + * End of kbase_jd_done + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_DONE_END( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_done_end( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_ATOM_COMPLETE - + * Atom marked complete + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_ATOM_COMPLETE( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_atom_complete( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_RUN_ATOM_START - + * Running of atom starts + * + * @kbdev: Kbase device + * @atom: Atom identifier + * @atom_nr: Sequential number of an atom + */ +#define KBASE_TLSTREAM_TL_RUN_ATOM_START( \ + kbdev, \ + atom, \ + atom_nr \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_run_atom_start( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom, atom_nr); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_RUN_ATOM_END - + * Running of atom ends + * + * @kbdev: Kbase device + * @atom: Atom identifier + * @atom_nr: Sequential number of an atom + */ +#define KBASE_TLSTREAM_TL_RUN_ATOM_END( \ + kbdev, \ + atom, \ + atom_nr \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_run_atom_end( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom, atom_nr); \ + } while (0) + /* Gator tracepoints are hooked into TLSTREAM interface. * When the following tracepoints are called, corresponding |
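For reference, a minimal sketch (not part of the patch above) of how the new run-atom tracepoints could be emitted around the execution of an atom. The helper name run_atom_traced, the atom_nr value and the assumption that mali_kbase.h makes the KBASE_TLSTREAM_* macros visible are illustrative; the actual call sites inside kbase may differ.

    #include <mali_kbase.h>  /* assumed to pull in the KBASE_TLSTREAM_* macros */

    /* Illustrative only: bracket the run of one atom with the start/end
     * tracepoints added by this patch.
     */
    static void run_atom_traced(struct kbase_device *kbdev,
                                struct kbase_jd_atom *katom, u32 atom_nr)
    {
            KBASE_TLSTREAM_TL_RUN_ATOM_START(kbdev, katom, atom_nr);

            /* ... submit the atom to hardware and wait for completion ... */

            KBASE_TLSTREAM_TL_RUN_ATOM_END(kbdev, katom, atom_nr);
    }

At runtime the event is only written when timeline capture is enabled (TLSTREAM_ENABLED set in kbdev->timeline_flags), so the call sites remain cheap when no timeline client is attached.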