author    Jörg Wagner <jorwag@google.com>  2023-12-14 09:44:26 +0000
committer Jörg Wagner <jorwag@google.com>  2023-12-14 09:44:26 +0000
commit    049a542207ed694271316782397b78b2e202086a
tree      105e9378d4d5062dc72109fdd4a77c915bd9425d  /mali_kbase/hwcnt
parent    e61eb93296e9f940b32d4ad4b0c3a5557cbeaf17

Update KMD to r47p0
Provenance: ipdelivery@ad01e50d640910a99224382bb227e6d4de627657
Change-Id: I19ac9bce34a5c5a319c1b4a388e8b037b3dfe6e7
Diffstat (limited to 'mali_kbase/hwcnt')
-rw-r--r--  mali_kbase/hwcnt/Kbuild | 1
-rw-r--r--  mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend.h | 16
-rw-r--r--  mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c | 397
-rw-r--r--  mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h | 26
-rw-r--r--  mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h | 102
-rw-r--r--  mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c | 60
-rw-r--r--  mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c | 221
-rw-r--r--  mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.c | 19
-rw-r--r--  mali_kbase/hwcnt/mali_kbase_hwcnt.c | 36
-rw-r--r--  mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.c | 782
-rw-r--r--  mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.h | 137
-rw-r--r--  mali_kbase/hwcnt/mali_kbase_hwcnt_gpu_narrow.c | 298
-rw-r--r--  mali_kbase/hwcnt/mali_kbase_hwcnt_gpu_narrow.h | 330
-rw-r--r--  mali_kbase/hwcnt/mali_kbase_hwcnt_types.c | 362
-rw-r--r--  mali_kbase/hwcnt/mali_kbase_hwcnt_types.h | 631
-rw-r--r--  mali_kbase/hwcnt/mali_kbase_hwcnt_virtualizer.c | 18
16 files changed, 1824 insertions, 1612 deletions
diff --git a/mali_kbase/hwcnt/Kbuild b/mali_kbase/hwcnt/Kbuild
index 8c8775f..d24d8ef 100644
--- a/mali_kbase/hwcnt/Kbuild
+++ b/mali_kbase/hwcnt/Kbuild
@@ -21,7 +21,6 @@
mali_kbase-y += \
hwcnt/mali_kbase_hwcnt.o \
hwcnt/mali_kbase_hwcnt_gpu.o \
- hwcnt/mali_kbase_hwcnt_gpu_narrow.o \
hwcnt/mali_kbase_hwcnt_types.o \
hwcnt/mali_kbase_hwcnt_virtualizer.o \
hwcnt/mali_kbase_hwcnt_watchdog_if_timer.o
diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend.h b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend.h
index 6cfa6f5..cc3ba98 100644
--- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend.h
+++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
*
- * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -123,11 +123,21 @@ kbase_hwcnt_backend_dump_enable_nolock_fn(struct kbase_hwcnt_backend *backend,
* typedef kbase_hwcnt_backend_dump_disable_fn - Disable counter dumping with
* the backend.
* @backend: Non-NULL pointer to backend.
+ * @dump_buffer: Pointer to an accumulated dump buffer to update or NULL.
+ * @enable_map: Pointer to enable map specifying enabled counters. Must be NULL if @dump_buffer is NULL.
*
* If the backend is already disabled, does nothing.
- * Any undumped counter values since the last dump get will be lost.
+ *
+ * Any undumped counter values since the last dump get will be lost. However, undumped block state
+ * can be retained by the backend.
+ *
+ * @dump_buffer and @enable_map give the backend an opportunity to update an existing accumulated
+ * buffer with state information, and for the caller to take ownership of it. In particular, the
+ * caller can use this when they require such information whilst the counter dumps are disabled.
*/
-typedef void kbase_hwcnt_backend_dump_disable_fn(struct kbase_hwcnt_backend *backend);
+typedef void kbase_hwcnt_backend_dump_disable_fn(struct kbase_hwcnt_backend *backend,
+ struct kbase_hwcnt_dump_buffer *dump_buffer,
+ const struct kbase_hwcnt_enable_map *enable_map);
/**
* typedef kbase_hwcnt_backend_dump_clear_fn - Reset all the current undumped
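To make the new contract above concrete: disabling with a dump buffer flushes the backend's
accumulated state out to the caller on the way down, while passing NULL keeps the old
"just disable" behaviour. The self-contained sketch below models only that pattern; the toy_*
names and the fixed-size accumulator are invented for illustration and are not kbase symbols.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy model of "disable counter dumping, optionally handing the
 * accumulated (undumped) state over to the caller while doing so".
 */
struct toy_backend {
    uint64_t accum[4];
    int enabled;
};

static void toy_dump_disable(struct toy_backend *b, uint64_t *dump_buffer, size_t n)
{
    if (!b->enabled)
        return;                           /* already disabled: do nothing */

    if (dump_buffer) {
        /* Caller wants the undumped state now, while disabling. */
        memcpy(dump_buffer, b->accum, n * sizeof(*dump_buffer));
        /* Ownership transferred: clear our copy. */
        memset(b->accum, 0, sizeof(b->accum));
    }

    /* Without a buffer, undumped counter values are simply lost. */
    b->enabled = 0;
}

int main(void)
{
    struct toy_backend b = { .accum = { 1, 2, 3, 4 }, .enabled = 1 };
    uint64_t out[4] = { 0 };

    toy_dump_disable(&b, out, 4);         /* disable and take the state */
    printf("%llu\n", (unsigned long long)out[2]);   /* prints 3 */
    return 0;
}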
diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c
index 27acfc6..d7911ae 100644
--- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c
+++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c
@@ -44,6 +44,9 @@
#define HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS ((u32)1000)
#endif /* IS_FPGA && !NO_MALI */
+/* Used to check for a sample in which all counters in the block are disabled */
+#define HWCNT_BLOCK_EMPTY_SAMPLE (2)
+
/**
* enum kbase_hwcnt_backend_csf_dump_state - HWC CSF backend dumping states.
*
@@ -172,15 +175,16 @@ struct kbase_hwcnt_backend_csf_info {
/**
* struct kbase_hwcnt_csf_physical_layout - HWC sample memory physical layout
- * information.
- * @hw_block_cnt: Total number of hardware counters blocks. The hw counters blocks are
- * sub-categorized into 4 classes: front-end, tiler, memory system, and shader.
- * hw_block_cnt = fe_cnt + tiler_cnt + mmu_l2_cnt + shader_cnt.
+ * information, as defined by the spec.
* @fe_cnt: Front end block count.
* @tiler_cnt: Tiler block count.
* @mmu_l2_cnt: Memory system (MMU and L2 cache) block count.
* @shader_cnt: Shader Core block count.
- * @fw_block_cnt: Total number of firmware counters blocks.
+ * @fw_block_cnt: Total number of firmware counter blocks, with a single
+ * global FW block and a block per CSG.
+ * @hw_block_cnt: Total number of hardware counter blocks. The hw counters blocks are
+ * sub-categorized into 4 classes: front-end, tiler, memory system, and shader.
+ * hw_block_cnt = fe_cnt + tiler_cnt + mmu_l2_cnt + shader_cnt.
* @block_cnt: Total block count (sum of all counter blocks: hw_block_cnt + fw_block_cnt).
* @shader_avail_mask: Bitmap of all shader cores in the system.
* @enable_mask_offset: Offset in array elements of enable mask in each block
@@ -190,12 +194,12 @@ struct kbase_hwcnt_backend_csf_info {
* @values_per_block: For any block, the number of counters in total (header + payload).
*/
struct kbase_hwcnt_csf_physical_layout {
- u8 hw_block_cnt;
u8 fe_cnt;
u8 tiler_cnt;
u8 mmu_l2_cnt;
u8 shader_cnt;
u8 fw_block_cnt;
+ u8 hw_block_cnt;
u8 block_cnt;
u64 shader_avail_mask;
size_t enable_mask_offset;
@@ -220,6 +224,13 @@ struct kbase_hwcnt_csf_physical_layout {
* @old_sample_buf: HWC sample buffer to save the previous values
* for delta calculation, size
* prfcnt_info.dump_bytes.
+ * @block_states: Pointer to array of block_state values for all
+ * blocks.
+ * @to_user_block_states: Block state buffer for client user.
+ * @accum_all_blk_stt: Block state to accumulate for all known blocks
+ * on next sample.
+ * @sampled_all_blk_stt: Block state to accumulate for all known blocks
+ * into the current sample.
* @watchdog_last_seen_insert_idx: The insert index which watchdog has last
* seen, to check any new firmware automatic
* samples generated during the watchdog
@@ -243,6 +254,8 @@ struct kbase_hwcnt_csf_physical_layout {
* @hwc_dump_work: Worker to accumulate samples.
* @hwc_threshold_work: Worker for consuming available samples when
* threshold interrupt raised.
+ * @num_l2_slices: Current number of L2 slices allocated to the GPU.
+ * @shader_present_bitmap: Current shader-present bitmap that is allocated to the GPU.
*/
struct kbase_hwcnt_backend_csf {
struct kbase_hwcnt_backend_csf_info *info;
@@ -253,6 +266,10 @@ struct kbase_hwcnt_backend_csf {
u64 *to_user_buf;
u64 *accum_buf;
u32 *old_sample_buf;
+ blk_stt_t *block_states;
+ blk_stt_t *to_user_block_states;
+ blk_stt_t accum_all_blk_stt;
+ blk_stt_t sampled_all_blk_stt;
u32 watchdog_last_seen_insert_idx;
struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf;
void *ring_buf_cpu_base;
@@ -265,15 +282,45 @@ struct kbase_hwcnt_backend_csf {
struct workqueue_struct *hwc_dump_workq;
struct work_struct hwc_dump_work;
struct work_struct hwc_threshold_work;
+ size_t num_l2_slices;
+ u64 shader_present_bitmap;
};
static bool kbasep_hwcnt_backend_csf_backend_exists(struct kbase_hwcnt_backend_csf_info *csf_info)
{
- WARN_ON(!csf_info);
+ if (WARN_ON(!csf_info))
+ return false;
+
csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx);
return (csf_info->backend != NULL);
}
+void kbase_hwcnt_backend_csf_set_hw_availability(struct kbase_hwcnt_backend_interface *iface,
+ size_t num_l2_slices, u64 shader_present_bitmap)
+{
+ struct kbase_hwcnt_backend_csf_info *csf_info;
+
+ if (!iface)
+ return;
+
+ csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info;
+
+ /* Early out if the backend does not exist. */
+ if (!csf_info || !csf_info->backend)
+ return;
+
+ if (WARN_ON(csf_info->backend->enable_state != KBASE_HWCNT_BACKEND_CSF_DISABLED))
+ return;
+
+ if (WARN_ON(num_l2_slices > csf_info->backend->phys_layout.mmu_l2_cnt) ||
+ WARN_ON((shader_present_bitmap & csf_info->backend->phys_layout.shader_avail_mask) !=
+ shader_present_bitmap))
+ return;
+
+ csf_info->backend->num_l2_slices = num_l2_slices;
+ csf_info->backend->shader_present_bitmap = shader_present_bitmap;
+}
+
/**
* kbasep_hwcnt_backend_csf_cc_initial_sample() - Initialize cycle count
* tracking.
@@ -295,8 +342,7 @@ kbasep_hwcnt_backend_csf_cc_initial_sample(struct kbase_hwcnt_backend_csf *backe
backend_csf->info->csf_if->get_gpu_cycle_count(backend_csf->info->csf_if->ctx, cycle_counts,
clk_enable_map);
- kbase_hwcnt_metadata_for_each_clock(enable_map->metadata, clk)
- {
+ kbase_hwcnt_metadata_for_each_clock(enable_map->metadata, clk) {
if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, clk))
backend_csf->prev_cycle_count[clk] = cycle_counts[clk];
}
@@ -317,8 +363,7 @@ static void kbasep_hwcnt_backend_csf_cc_update(struct kbase_hwcnt_backend_csf *b
backend_csf->info->csf_if->get_gpu_cycle_count(backend_csf->info->csf_if->ctx, cycle_counts,
backend_csf->clk_enable_map);
- kbase_hwcnt_metadata_for_each_clock(backend_csf->info->metadata, clk)
- {
+ kbase_hwcnt_metadata_for_each_clock(backend_csf->info->metadata, clk) {
if (kbase_hwcnt_clk_enable_map_enabled(backend_csf->clk_enable_map, clk)) {
backend_csf->cycle_count_elapsed[clk] =
cycle_counts[clk] - backend_csf->prev_cycle_count[clk];
@@ -340,29 +385,29 @@ static u64 kbasep_hwcnt_backend_csf_timestamp_ns(struct kbase_hwcnt_backend *bac
/** kbasep_hwcnt_backend_csf_process_enable_map() - Process the enable_map to
* guarantee headers are
- * enabled if any counter is
- * required.
+ * enabled.
*@phys_enable_map: HWC physical enable map to be processed.
*/
-static void
-kbasep_hwcnt_backend_csf_process_enable_map(struct kbase_hwcnt_physical_enable_map *phys_enable_map)
+void kbasep_hwcnt_backend_csf_process_enable_map(
+ struct kbase_hwcnt_physical_enable_map *phys_enable_map)
{
WARN_ON(!phys_enable_map);
- /* Enable header if any counter is required from user, the header is
- * controlled by bit 0 of the enable mask.
+ /* Unconditionally enable each block header and first counter;
+ * the header is controlled by bit 0 of the enable mask.
*/
- if (phys_enable_map->fe_bm)
- phys_enable_map->fe_bm |= 1;
+ phys_enable_map->fe_bm |= 3;
- if (phys_enable_map->tiler_bm)
- phys_enable_map->tiler_bm |= 1;
+ phys_enable_map->tiler_bm |= 3;
- if (phys_enable_map->mmu_l2_bm)
- phys_enable_map->mmu_l2_bm |= 1;
+ phys_enable_map->mmu_l2_bm |= 3;
+
+ phys_enable_map->shader_bm |= 3;
+
+ phys_enable_map->fw_bm |= 3;
+
+ phys_enable_map->csg_bm |= 3;
- if (phys_enable_map->shader_bm)
- phys_enable_map->shader_bm |= 1;
}
static void kbasep_hwcnt_backend_csf_init_layout(
@@ -371,32 +416,35 @@ static void kbasep_hwcnt_backend_csf_init_layout(
{
size_t shader_core_cnt;
size_t values_per_block;
- size_t fw_blocks_count;
- size_t hw_blocks_count;
+ size_t fw_block_cnt;
+ size_t hw_block_cnt;
+ size_t core_cnt;
+
WARN_ON(!prfcnt_info);
WARN_ON(!phys_layout);
- shader_core_cnt = fls64(prfcnt_info->core_mask);
+ shader_core_cnt = (size_t)fls64(prfcnt_info->core_mask);
values_per_block = prfcnt_info->prfcnt_block_size / KBASE_HWCNT_VALUE_HW_BYTES;
- fw_blocks_count = div_u64(prfcnt_info->prfcnt_fw_size, prfcnt_info->prfcnt_block_size);
- hw_blocks_count = div_u64(prfcnt_info->prfcnt_hw_size, prfcnt_info->prfcnt_block_size);
+ fw_block_cnt = div_u64(prfcnt_info->prfcnt_fw_size, prfcnt_info->prfcnt_block_size);
+ hw_block_cnt = div_u64(prfcnt_info->prfcnt_hw_size, prfcnt_info->prfcnt_block_size);
+
+ core_cnt = shader_core_cnt;
/* The number of hardware counters reported by the GPU matches the legacy guess-work we
* have done in the past
*/
- WARN_ON(hw_blocks_count != KBASE_HWCNT_V5_FE_BLOCK_COUNT +
- KBASE_HWCNT_V5_TILER_BLOCK_COUNT +
- prfcnt_info->l2_count + shader_core_cnt);
+ WARN_ON(hw_block_cnt != KBASE_HWCNT_V5_FE_BLOCK_COUNT + KBASE_HWCNT_V5_TILER_BLOCK_COUNT +
+ prfcnt_info->l2_count + core_cnt);
*phys_layout = (struct kbase_hwcnt_csf_physical_layout){
.fe_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT,
.tiler_cnt = KBASE_HWCNT_V5_TILER_BLOCK_COUNT,
.mmu_l2_cnt = prfcnt_info->l2_count,
.shader_cnt = shader_core_cnt,
- .fw_block_cnt = fw_blocks_count,
- .hw_block_cnt = hw_blocks_count,
- .block_cnt = fw_blocks_count + hw_blocks_count,
+ .fw_block_cnt = fw_block_cnt,
+ .hw_block_cnt = hw_block_cnt,
+ .block_cnt = fw_block_cnt + hw_block_cnt,
.shader_avail_mask = prfcnt_info->core_mask,
.headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
.values_per_block = values_per_block,
@@ -409,10 +457,14 @@ static void
kbasep_hwcnt_backend_csf_reset_internal_buffers(struct kbase_hwcnt_backend_csf *backend_csf)
{
size_t user_buf_bytes = backend_csf->info->metadata->dump_buf_bytes;
+ size_t block_state_bytes = backend_csf->phys_layout.block_cnt *
+ KBASE_HWCNT_BLOCK_STATE_BYTES * KBASE_HWCNT_BLOCK_STATE_STRIDE;
memset(backend_csf->to_user_buf, 0, user_buf_bytes);
memset(backend_csf->accum_buf, 0, user_buf_bytes);
memset(backend_csf->old_sample_buf, 0, backend_csf->info->prfcnt_info.dump_bytes);
+ memset(backend_csf->block_states, 0, block_state_bytes);
+ memset(backend_csf->to_user_block_states, 0, block_state_bytes);
}
static void
@@ -450,40 +502,130 @@ kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header(struct kbase_hwcnt_backend_cs
static void kbasep_hwcnt_backend_csf_update_user_sample(struct kbase_hwcnt_backend_csf *backend_csf)
{
size_t user_buf_bytes = backend_csf->info->metadata->dump_buf_bytes;
+ size_t block_state_bytes = backend_csf->phys_layout.block_cnt *
+ KBASE_HWCNT_BLOCK_STATE_BYTES * KBASE_HWCNT_BLOCK_STATE_STRIDE;
/* Copy the data into the sample and wait for the user to get it. */
memcpy(backend_csf->to_user_buf, backend_csf->accum_buf, user_buf_bytes);
+ memcpy(backend_csf->to_user_block_states, backend_csf->block_states, block_state_bytes);
/* After copied data into user sample, clear the accumulator values to
* prepare for the next accumulator, such as the next request or
* threshold.
*/
memset(backend_csf->accum_buf, 0, user_buf_bytes);
+ memset(backend_csf->block_states, 0, block_state_bytes);
+}
+
+/**
+ * kbasep_hwcnt_backend_csf_update_block_state - Update block state of a block instance with
+ * information from a sample.
+ * @phys_layout: Physical memory layout information of HWC
+ * sample buffer.
+ * @enable_mask: Counter enable mask for the block whose state is being updated.
+ * @enable_state: The CSF backend internal enabled state.
+ * @exiting_protm: Whether or not the sample is taken when the GPU is exiting
+ * protected mode.
+ * @block_idx: Index of block within the ringbuffer.
+ * @block_state: Pointer to existing block state of the block whose state is being
+ * updated.
+ * @fw_in_protected_mode: Whether or not GPU is in protected mode during sampling.
+ */
+static void kbasep_hwcnt_backend_csf_update_block_state(
+ const struct kbase_hwcnt_csf_physical_layout *phys_layout, const u32 enable_mask,
+ enum kbase_hwcnt_backend_csf_enable_state enable_state, bool exiting_protm,
+ size_t block_idx, blk_stt_t *const block_state, bool fw_in_protected_mode)
+{
+ /* Offset of shader core blocks from the start of the HW blocks in the sample */
+ size_t shader_core_block_offset =
+ (size_t)(phys_layout->hw_block_cnt - phys_layout->shader_cnt);
+ bool is_shader_core_block;
+
+ is_shader_core_block = block_idx >= shader_core_block_offset;
+
+ /* Set power bits for the block state for the block, for the sample */
+ switch (enable_state) {
+ /* Disabled states */
+ case KBASE_HWCNT_BACKEND_CSF_DISABLED:
+ case KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED:
+ case KBASE_HWCNT_BACKEND_CSF_DISABLED_WAIT_FOR_WORKER:
+ kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_OFF);
+ break;
+ /* Enabled states */
+ case KBASE_HWCNT_BACKEND_CSF_ENABLED:
+ case KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED:
+ if (!is_shader_core_block)
+ kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_ON);
+ else if (!exiting_protm) {
+ /* When not exiting protected mode, a zero enable mask on a shader core
+ * counter block indicates the block was powered off for the sample, and
+ * a non-zero counter enable mask indicates the block was powered on for
+ * the sample.
+ */
+ kbase_hwcnt_block_state_append(block_state,
+ (enable_mask ? KBASE_HWCNT_STATE_ON :
+ KBASE_HWCNT_STATE_OFF));
+ }
+ break;
+ /* Error states */
+ case KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR_WAIT_FOR_WORKER:
+ case KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR:
+ default:
+ /* Do nothing */
+ break;
+ }
+
+ /* The following four cases apply to a block state in either normal mode or protected mode:
+ * 1. GPU executing in normal mode: Only set normal mode bit.
+ * 2. First sample request after GPU enters protected mode: Set both normal mode and
+ * protected mode bit. In this case, there will at least be one sample to accumulate
+ * in the ring buffer which was automatically triggered before GPU entered protected
+ * mode.
+ * 3. Subsequent sample requests while GPU remains in protected mode: Only set protected
+ * mode bit. In this case, the ring buffer should be empty and dump should return 0s but
+ * block state should be updated accordingly. This case is not handled here.
+ * 4. Samples requested after GPU exits protected mode: Set both protected mode and normal
+ * mode bits.
+ */
+ if (exiting_protm || fw_in_protected_mode)
+ kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_PROTECTED |
+ KBASE_HWCNT_STATE_NORMAL);
+ else
+ kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_NORMAL);
}
static void kbasep_hwcnt_backend_csf_accumulate_sample(
const struct kbase_hwcnt_csf_physical_layout *phys_layout, size_t dump_bytes,
- u64 *accum_buf, const u32 *old_sample_buf, const u32 *new_sample_buf, bool clearing_samples)
+ u64 *accum_buf, const u32 *old_sample_buf, const u32 *new_sample_buf,
+ blk_stt_t *const block_states, bool clearing_samples,
+ enum kbase_hwcnt_backend_csf_enable_state enable_state, bool fw_in_protected_mode)
{
size_t block_idx;
const u32 *old_block = old_sample_buf;
const u32 *new_block = new_sample_buf;
u64 *acc_block = accum_buf;
+ /* Flag to indicate whether current sample is exiting protected mode. */
+ bool exiting_protm = false;
const size_t values_per_block = phys_layout->values_per_block;
- /* Performance counter blocks for firmware are stored before blocks for hardware.
- * We skip over the firmware's performance counter blocks (counters dumping is not
- * supported for firmware blocks, only hardware ones).
+ /* The block pointers now point to the first HW block, which is always a CSHW/front-end
+ * block. The counter enable mask for this block can be checked to determine whether this
+ * sample is taken after leaving protected mode - this is the only scenario where the CSHW
+ * block counter enable mask has only the first bit set, and no others. In this case,
+ * the values in this sample would not be meaningful, so they don't need to be accumulated.
*/
- old_block += values_per_block * phys_layout->fw_block_cnt;
- new_block += values_per_block * phys_layout->fw_block_cnt;
+ exiting_protm = (new_block[phys_layout->enable_mask_offset] == 1);
- for (block_idx = phys_layout->fw_block_cnt; block_idx < phys_layout->block_cnt;
- block_idx++) {
+ for (block_idx = 0; block_idx < phys_layout->block_cnt; block_idx++) {
const u32 old_enable_mask = old_block[phys_layout->enable_mask_offset];
const u32 new_enable_mask = new_block[phys_layout->enable_mask_offset];
+ /* Update block state with information of the current sample */
+ kbasep_hwcnt_backend_csf_update_block_state(phys_layout, new_enable_mask,
+ enable_state, exiting_protm, block_idx,
+ &block_states[block_idx],
+ fw_in_protected_mode);
- if (new_enable_mask == 0) {
+ if (!(new_enable_mask & HWCNT_BLOCK_EMPTY_SAMPLE)) {
/* Hardware block was unavailable or we didn't turn on
* any counters. Do nothing.
*/
@@ -492,7 +634,6 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample(
* enabled. We need to update the accumulation buffer.
*/
size_t ctr_idx;
-
/* Unconditionally copy the headers. */
for (ctr_idx = 0; ctr_idx < phys_layout->headers_per_block; ctr_idx++) {
acc_block[ctr_idx] = new_block[ctr_idx];
@@ -517,8 +658,8 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample(
* saturating at their maximum value.
*/
if (!clearing_samples) {
- if (old_enable_mask == 0) {
- /* Hardware block was previously
+ if (!(old_enable_mask & HWCNT_BLOCK_EMPTY_SAMPLE)) {
+ /* Block was previously
* unavailable. Accumulate the new
* counters only, as we know previous
* values are zeroes.
@@ -545,15 +686,14 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample(
}
}
}
+
old_block += values_per_block;
new_block += values_per_block;
acc_block += values_per_block;
}
-
WARN_ON(old_block != old_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES));
WARN_ON(new_block != new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES));
- WARN_ON(acc_block != accum_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES) -
- (values_per_block * phys_layout->fw_block_cnt));
+ WARN_ON(acc_block != accum_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES));
(void)dump_bytes;
}
@@ -569,10 +709,23 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples(struct kbase_hwcnt_backe
bool clearing_samples = backend_csf->info->prfcnt_info.clearing_samples;
u32 *old_sample_buf = backend_csf->old_sample_buf;
u32 *new_sample_buf = old_sample_buf;
+ const struct kbase_hwcnt_csf_physical_layout *phys_layout = &backend_csf->phys_layout;
+
+ if (extract_index_to_start == insert_index_to_stop) {
+ /* No samples to accumulate but block states need to be updated for dump. */
+ size_t block_idx;
- if (extract_index_to_start == insert_index_to_stop)
- /* No samples to accumulate. Early out. */
+ for (block_idx = 0; block_idx < phys_layout->block_cnt; block_idx++) {
+ /* Set protected mode bit for block state if GPU is in protected mode,
+ * otherwise set the normal mode bit.
+ */
+ kbase_hwcnt_block_state_append(&backend_csf->block_states[block_idx],
+ backend_csf->info->fw_in_protected_mode ?
+ KBASE_HWCNT_STATE_PROTECTED :
+ KBASE_HWCNT_STATE_NORMAL);
+ }
return;
+ }
/* Sync all the buffers to CPU side before read the data. */
backend_csf->info->csf_if->ring_buf_sync(backend_csf->info->csf_if->ctx,
@@ -587,11 +740,10 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples(struct kbase_hwcnt_backe
const u32 buf_idx = raw_idx & (ring_buf_cnt - 1);
new_sample_buf = (u32 *)&cpu_dump_base[buf_idx * buf_dump_bytes];
-
- kbasep_hwcnt_backend_csf_accumulate_sample(&backend_csf->phys_layout,
- buf_dump_bytes, backend_csf->accum_buf,
- old_sample_buf, new_sample_buf,
- clearing_samples);
+ kbasep_hwcnt_backend_csf_accumulate_sample(
+ phys_layout, buf_dump_bytes, backend_csf->accum_buf, old_sample_buf,
+ new_sample_buf, backend_csf->block_states, clearing_samples,
+ backend_csf->enable_state, backend_csf->info->fw_in_protected_mode);
old_sample_buf = new_sample_buf;
}
@@ -875,6 +1027,8 @@ kbasep_hwcnt_backend_csf_get_physical_enable(struct kbase_hwcnt_backend_csf *bac
enable->shader_bm = phys_enable_map.shader_bm;
enable->tiler_bm = phys_enable_map.tiler_bm;
enable->mmu_l2_bm = phys_enable_map.mmu_l2_bm;
+ enable->fw_bm = phys_enable_map.fw_bm;
+ enable->csg_bm = phys_enable_map.csg_bm;
enable->counter_set = phys_counter_set;
enable->clk_enable_map = enable_map->clk_enable_map;
}
@@ -893,6 +1047,17 @@ kbasep_hwcnt_backend_csf_dump_enable_nolock(struct kbase_hwcnt_backend *backend,
backend_csf->info->csf_if->assert_lock_held(backend_csf->info->csf_if->ctx);
+ /* Enabling counters is an indication that the power may have previously been off for all
+ * blocks.
+ *
+ * In any case, the counters would not have been counting recently, so an 'off' block state
+ * is an approximation for this.
+ *
+ * This will be transferred to the dump only after a dump_wait(), or dump_disable() in
+ * cases where the caller requested such information. This is to handle when a
+ * dump_enable() happens in between dump_wait() and dump_get().
+ */
+ kbase_hwcnt_block_state_append(&backend_csf->accum_all_blk_stt, KBASE_HWCNT_STATE_OFF);
kbasep_hwcnt_backend_csf_get_physical_enable(backend_csf, enable_map, &enable);
/* enable_state should be DISABLED before we transfer it to enabled */
@@ -956,13 +1121,19 @@ static void kbasep_hwcnt_backend_csf_wait_enable_transition_complete(
}
/* CSF backend implementation of kbase_hwcnt_backend_dump_disable_fn */
-static void kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend)
+static void kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend,
+ struct kbase_hwcnt_dump_buffer *dump_buffer,
+ const struct kbase_hwcnt_enable_map *enable_map)
{
unsigned long flags = 0UL;
struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend;
bool do_disable = false;
- WARN_ON(!backend_csf);
+ if (WARN_ON(!backend_csf ||
+ (dump_buffer && (backend_csf->info->metadata != dump_buffer->metadata)) ||
+ (enable_map && (backend_csf->info->metadata != enable_map->metadata)) ||
+ (dump_buffer && !enable_map)))
+ return;
backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags);
@@ -1048,6 +1219,42 @@ static void kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *ba
* for next enable.
*/
kbasep_hwcnt_backend_csf_reset_internal_buffers(backend_csf);
+
+ /* Disabling HWCNT is an indication that blocks have been powered off. This is important to
+ * know for L2, CSHW, and Tiler blocks, as this is currently the only way a backend can
+ * know if they are being powered off.
+ *
+ * In any case, even if they weren't really powered off, we won't be counting whilst
+ * disabled.
+ *
+ * Update the block state information in the block state accumulator to show this, so that
+ * in the next dump blocks will have been seen as powered off for some of the time.
+ */
+ kbase_hwcnt_block_state_append(&backend_csf->accum_all_blk_stt, KBASE_HWCNT_STATE_OFF);
+
+ if (dump_buffer) {
+ /* In some use-cases, the caller will need the information whilst the counters are
+ * disabled, but will not be able to call into the backend to dump them. Instead,
+ * they have an opportunity here to request them to be accumulated into their
+ * buffer immediately.
+ *
+ * This consists of taking a sample of the accumulated block state (as though a
+ * real dump_get() had happened), then transfer ownership of that to the caller
+ * (i.e. erasing our copy of it).
+ */
+ kbase_hwcnt_block_state_accumulate(&backend_csf->sampled_all_blk_stt,
+ &backend_csf->accum_all_blk_stt);
+ kbase_hwcnt_dump_buffer_block_state_update(dump_buffer, enable_map,
+ backend_csf->sampled_all_blk_stt);
+ /* Now the block state has been passed out into the caller's own accumulation
+ * buffer, clear our own accumulated and sampled block state - ownership has been
+ * transferred.
+ */
+ kbase_hwcnt_block_state_set(&backend_csf->sampled_all_blk_stt,
+ KBASE_HWCNT_STATE_UNKNOWN);
+ kbase_hwcnt_block_state_set(&backend_csf->accum_all_blk_stt,
+ KBASE_HWCNT_STATE_UNKNOWN);
+ }
}
/* CSF backend implementation of kbase_hwcnt_backend_dump_request_fn */
@@ -1183,6 +1390,16 @@ static int kbasep_hwcnt_backend_csf_dump_wait(struct kbase_hwcnt_backend *backen
backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags);
+ /* Now that we've completed a sample, also sample+clear the accumulated block state.
+ *
+ * This is to ensure that a dump_enable() that happens in between dump_wait() and
+ * dump_get() is reported on the _next_ dump, not the _current_ dump. That is, the block
+ * state is reported at the actual time that counters are being sampled.
+ */
+ kbase_hwcnt_block_state_accumulate(&backend_csf->sampled_all_blk_stt,
+ &backend_csf->accum_all_blk_stt);
+ kbase_hwcnt_block_state_set(&backend_csf->accum_all_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
+
return errcode;
}
@@ -1223,8 +1440,7 @@ static int kbasep_hwcnt_backend_csf_dump_get(struct kbase_hwcnt_backend *backend
return -EINVAL;
/* Extract elapsed cycle count for each clock domain if enabled. */
- kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk)
- {
+ kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) {
if (!kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk))
continue;
@@ -1238,7 +1454,20 @@ static int kbasep_hwcnt_backend_csf_dump_get(struct kbase_hwcnt_backend *backend
* as it is undefined to call this function without a prior succeeding
* one to dump_wait().
*/
- ret = kbase_hwcnt_csf_dump_get(dst, backend_csf->to_user_buf, dst_enable_map, accumulate);
+ ret = kbase_hwcnt_csf_dump_get(dst, backend_csf->to_user_buf,
+ backend_csf->to_user_block_states, dst_enable_map,
+ backend_csf->num_l2_slices,
+ backend_csf->shader_present_bitmap, accumulate);
+
+ /* If no error occurred (zero ret value), then update block state for all blocks in the
+ * accumulation with the current sample's block state.
+ */
+ if (!ret) {
+ kbase_hwcnt_dump_buffer_block_state_update(dst, dst_enable_map,
+ backend_csf->sampled_all_blk_stt);
+ kbase_hwcnt_block_state_set(&backend_csf->sampled_all_blk_stt,
+ KBASE_HWCNT_STATE_UNKNOWN);
+ }
return ret;
}
@@ -1269,6 +1498,12 @@ static void kbasep_hwcnt_backend_csf_destroy(struct kbase_hwcnt_backend_csf *bac
kfree(backend_csf->to_user_buf);
backend_csf->to_user_buf = NULL;
+ kfree(backend_csf->block_states);
+ backend_csf->block_states = NULL;
+
+ kfree(backend_csf->to_user_block_states);
+ backend_csf->to_user_block_states = NULL;
+
kfree(backend_csf);
}
@@ -1285,6 +1520,7 @@ static int kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *
{
struct kbase_hwcnt_backend_csf *backend_csf = NULL;
int errcode = -ENOMEM;
+ size_t block_state_bytes;
WARN_ON(!csf_info);
WARN_ON(!out_backend);
@@ -1308,6 +1544,17 @@ static int kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *
if (!backend_csf->to_user_buf)
goto err_alloc_user_sample_buf;
+ /* Allocate space to store block state values for each block */
+ block_state_bytes = backend_csf->phys_layout.block_cnt * KBASE_HWCNT_BLOCK_STATE_BYTES *
+ KBASE_HWCNT_BLOCK_STATE_STRIDE;
+ backend_csf->block_states = kzalloc(block_state_bytes, GFP_KERNEL);
+ if (!backend_csf->block_states)
+ goto err_alloc_block_states_buf;
+
+ backend_csf->to_user_block_states = kzalloc(block_state_bytes, GFP_KERNEL);
+ if (!backend_csf->to_user_block_states)
+ goto err_alloc_user_block_state_buf;
+
errcode = csf_info->csf_if->ring_buf_alloc(csf_info->csf_if->ctx, csf_info->ring_buf_cnt,
&backend_csf->ring_buf_cpu_base,
&backend_csf->ring_buf);
@@ -1343,6 +1590,8 @@ static int kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *
complete_all(&backend_csf->dump_completed);
backend_csf->user_requested = false;
backend_csf->watchdog_last_seen_insert_idx = 0;
+ kbase_hwcnt_block_state_set(&backend_csf->accum_all_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
+ kbase_hwcnt_block_state_set(&backend_csf->sampled_all_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
*out_backend = backend_csf;
return 0;
@@ -1351,6 +1600,12 @@ err_alloc_workqueue:
backend_csf->info->csf_if->ring_buf_free(backend_csf->info->csf_if->ctx,
backend_csf->ring_buf);
err_ring_buf_alloc:
+ kfree(backend_csf->to_user_block_states);
+ backend_csf->to_user_block_states = NULL;
+err_alloc_user_block_state_buf:
+ kfree(backend_csf->block_states);
+ backend_csf->block_states = NULL;
+err_alloc_block_states_buf:
kfree(backend_csf->to_user_buf);
backend_csf->to_user_buf = NULL;
err_alloc_user_sample_buf:
@@ -1417,7 +1672,7 @@ static void kbasep_hwcnt_backend_csf_term(struct kbase_hwcnt_backend *backend)
if (!backend)
return;
- kbasep_hwcnt_backend_csf_dump_disable(backend);
+ kbasep_hwcnt_backend_csf_dump_disable(backend, NULL, NULL);
/* Set the backend in csf_info to NULL so we won't handle any external
* notification anymore since we are terminating.
@@ -1828,7 +2083,21 @@ int kbase_hwcnt_backend_csf_metadata_init(struct kbase_hwcnt_backend_interface *
if (csf_info->prfcnt_info.clk_cnt > BASE_MAX_NR_CLOCKS_REGULATORS)
return -EIO;
+ /* We should reject initializing the metadata for any malformed
+ * firmware size. The legitimate firmware sizes are as follows:
+ * 1. fw_size == 0 on older GPUs
+ * 2. fw_size == block_size on GPUs that support FW counters but not CSG counters
+ * 3. fw_size == (1 + #CSG) * block size on GPUs that support CSG counters
+ */
+ if ((csf_info->prfcnt_info.prfcnt_fw_size != 0) &&
+ (csf_info->prfcnt_info.prfcnt_fw_size != csf_info->prfcnt_info.prfcnt_block_size) &&
+ (csf_info->prfcnt_info.prfcnt_fw_size !=
+ ((csf_info->prfcnt_info.csg_count + 1) * csf_info->prfcnt_info.prfcnt_block_size)))
+ return -EINVAL;
+
+ gpu_info.has_fw_counters = csf_info->prfcnt_info.prfcnt_fw_size > 0;
gpu_info.l2_count = csf_info->prfcnt_info.l2_count;
+ gpu_info.csg_cnt = csf_info->prfcnt_info.csg_count;
gpu_info.core_mask = csf_info->prfcnt_info.core_mask;
gpu_info.clk_cnt = csf_info->prfcnt_info.clk_cnt;
gpu_info.prfcnt_values_per_block =
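One behavioural detail in the accumulation path above is easy to miss in diff form: a block's
sample is now treated as empty when bit 1 of its enable mask is clear (HWCNT_BLOCK_EMPTY_SAMPLE
== 2) rather than when the whole mask is zero, because the reworked enable-map processing forces
bits 0 and 1 on for every block that is actually counting. A standalone restatement of just that
test follows; block_sample_has_data is an invented name, not a driver symbol.

#include <stdbool.h>
#include <stdint.h>

/* The "|= 3" in the enable-map processing unconditionally sets bit 0
 * (block header) and bit 1 (first counter) for every enabled block, so
 * a sample whose enable mask has bit 1 clear must come from a block
 * that was unavailable or not counting, and carries no counter data.
 */
#define HWCNT_BLOCK_EMPTY_SAMPLE (2u)

static bool block_sample_has_data(uint32_t enable_mask)
{
    /* Old code tested enable_mask != 0; new code requires bit 1. */
    return (enable_mask & HWCNT_BLOCK_EMPTY_SAMPLE) != 0;
}

For example, block_sample_has_data(0x0) and block_sample_has_data(0x1) are false, while
block_sample_has_data(0x3) is true.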
diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h
index 9c5a5c9..2487db2 100644
--- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h
+++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
*
- * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2021-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -31,6 +31,8 @@
#include "hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h"
#include "hwcnt/mali_kbase_hwcnt_watchdog_if.h"
+struct kbase_hwcnt_physical_enable_map;
+
/**
* kbase_hwcnt_backend_csf_create() - Create a CSF hardware counter backend
* interface.
@@ -115,6 +117,28 @@ void kbase_hwcnt_backend_csf_on_unrecoverable_error(struct kbase_hwcnt_backend_i
void kbase_hwcnt_backend_csf_on_before_reset(struct kbase_hwcnt_backend_interface *iface);
/**
+ * kbase_hwcnt_backend_csf_set_hw_availability() - CSF HWC backend function to
+ * set current HW configuration.
+ * HWC must be disabled before
+ * this function is called.
+ * @iface: Non-NULL pointer to HWC backend interface.
+ * @num_l2_slices: Current number of L2 slices allocated to the GPU.
+ * @shader_present_bitmap: Current shader-present bitmap that is allocated to the GPU.
+ */
+void kbase_hwcnt_backend_csf_set_hw_availability(struct kbase_hwcnt_backend_interface *iface,
+ size_t num_l2_slices,
+ uint64_t shader_present_bitmap);
+
+/** kbasep_hwcnt_backend_csf_process_enable_map() - Process the enable_map to
+ * guarantee headers are
+ * enabled.
+ * @phys_enable_map: HWC physical enable map to be processed.
+ */
+void kbasep_hwcnt_backend_csf_process_enable_map(
+ struct kbase_hwcnt_physical_enable_map *phys_enable_map);
+
+/**
* kbase_hwcnt_backend_csf_on_prfcnt_sample() - CSF performance counter sample
* complete interrupt handler.
* @iface: Non-NULL pointer to HWC backend interface.
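Since kbasep_hwcnt_backend_csf_process_enable_map() is now exported from this header, its effect
is worth stating outside the diff: every per-block selection bitmask gets bits 0 and 1 forced on,
which the implementation's comment attributes to the block header and the first counter. Below is
a toy reimplementation against an invented struct (not struct kbase_hwcnt_physical_enable_map),
just to show the shape of the operation.

#include <stdint.h>

/* Invented stand-in for the physical enable map: one selection bitmask
 * per block class, mirroring the fe/tiler/mmu_l2/shader/fw/csg fields
 * touched by the real function.
 */
struct toy_enable_map {
    uint32_t fe_bm, tiler_bm, mmu_l2_bm, shader_bm, fw_bm, csg_bm;
};

static void toy_process_enable_map(struct toy_enable_map *m)
{
    /* 3 == bit 0 (header) | bit 1 (first counter); set unconditionally,
     * matching the "|= 3" lines in the patched .c file.
     */
    m->fe_bm     |= 3;
    m->tiler_bm  |= 3;
    m->mmu_l2_bm |= 3;
    m->shader_bm |= 3;
    m->fw_bm     |= 3;
    m->csg_bm    |= 3;
}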
diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h
index 382a3ad..65bb965 100644
--- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h
+++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
*
- * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2021-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -39,6 +39,8 @@ struct kbase_hwcnt_backend_csf_if_ring_buf;
* @shader_bm: Shader counters selection bitmask.
* @tiler_bm: Tiler counters selection bitmask.
* @mmu_l2_bm: MMU_L2 counters selection bitmask.
+ * @fw_bm: FW counters selection bitmask.
+ * @csg_bm: FW CSG counters selection bitmask.
* @counter_set: The performance counter set to enable.
* @clk_enable_map: An array of u64 bitfields, each bit of which enables cycle
* counter for a given clock domain.
@@ -48,6 +50,8 @@ struct kbase_hwcnt_backend_csf_if_enable {
u32 shader_bm;
u32 tiler_bm;
u32 mmu_l2_bm;
+ u32 fw_bm;
+ u32 csg_bm;
u8 counter_set;
u64 clk_enable_map;
};
@@ -63,6 +67,7 @@ struct kbase_hwcnt_backend_csf_if_enable {
* counter dump. dump_bytes = prfcnt_hw_size + prfcnt_fw_size.
* @prfcnt_block_size: Bytes of each performance counter block.
* @l2_count: The MMU L2 cache count.
+ * @csg_count: The total number of CSGs in the system.
* @core_mask: Shader core mask.
* @clk_cnt: Clock domain count in the system.
* @clearing_samples: Indicates whether counters are cleared after each sample
@@ -74,6 +79,7 @@ struct kbase_hwcnt_backend_csf_if_prfcnt_info {
size_t dump_bytes;
size_t prfcnt_block_size;
size_t l2_count;
+ u32 csg_count;
u64 core_mask;
u8 clk_cnt;
bool clearing_samples;
@@ -85,8 +91,8 @@ struct kbase_hwcnt_backend_csf_if_prfcnt_info {
* held.
* @ctx: Non-NULL pointer to a CSF context.
*/
-typedef void
-kbase_hwcnt_backend_csf_if_assert_lock_held_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx);
+typedef void (*kbase_hwcnt_backend_csf_if_assert_lock_held_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx);
/**
* typedef kbase_hwcnt_backend_csf_if_lock_fn - Acquire backend spinlock.
@@ -95,8 +101,8 @@ kbase_hwcnt_backend_csf_if_assert_lock_held_fn(struct kbase_hwcnt_backend_csf_if
* @flags: Pointer to the memory location that would store the previous
* interrupt state.
*/
-typedef void kbase_hwcnt_backend_csf_if_lock_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
- unsigned long *flags);
+typedef void (*kbase_hwcnt_backend_csf_if_lock_fn)(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+ unsigned long *flags);
/**
* typedef kbase_hwcnt_backend_csf_if_unlock_fn - Release backend spinlock.
@@ -105,8 +111,8 @@ typedef void kbase_hwcnt_backend_csf_if_lock_fn(struct kbase_hwcnt_backend_csf_i
* @flags: Previously stored interrupt state when Scheduler interrupt
* spinlock was acquired.
*/
-typedef void kbase_hwcnt_backend_csf_if_unlock_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
- unsigned long flags);
+typedef void (*kbase_hwcnt_backend_csf_if_unlock_fn)(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+ unsigned long flags);
/**
* typedef kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn - Get performance
@@ -115,7 +121,7 @@ typedef void kbase_hwcnt_backend_csf_if_unlock_fn(struct kbase_hwcnt_backend_csf
* @prfcnt_info: Non-NULL pointer to struct where performance counter
* information should be stored.
*/
-typedef void kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn(
+typedef void (*kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn)(
struct kbase_hwcnt_backend_csf_if_ctx *ctx,
struct kbase_hwcnt_backend_csf_if_prfcnt_info *prfcnt_info);
@@ -135,10 +141,9 @@ typedef void kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn(
*
* Return: 0 on success, else error code.
*/
-typedef int
-kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
- u32 buf_count, void **cpu_dump_base,
- struct kbase_hwcnt_backend_csf_if_ring_buf **ring_buf);
+typedef int (*kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 buf_count, void **cpu_dump_base,
+ struct kbase_hwcnt_backend_csf_if_ring_buf **ring_buf);
/**
* typedef kbase_hwcnt_backend_csf_if_ring_buf_sync_fn - Sync HWC dump buffers
@@ -157,10 +162,10 @@ kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn(struct kbase_hwcnt_backend_csf_if_c
* Flush cached HWC dump buffer data to ensure that all writes from GPU and CPU
* are correctly observed.
*/
-typedef void
-kbase_hwcnt_backend_csf_if_ring_buf_sync_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
- struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
- u32 buf_index_first, u32 buf_index_last, bool for_cpu);
+typedef void (*kbase_hwcnt_backend_csf_if_ring_buf_sync_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+ struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, u32 buf_index_first,
+ u32 buf_index_last, bool for_cpu);
/**
* typedef kbase_hwcnt_backend_csf_if_ring_buf_free_fn - Free a ring buffer for
@@ -169,9 +174,9 @@ kbase_hwcnt_backend_csf_if_ring_buf_sync_fn(struct kbase_hwcnt_backend_csf_if_ct
* @ctx: Non-NULL pointer to a CSF interface context.
* @ring_buf: Non-NULL pointer to the ring buffer which to be freed.
*/
-typedef void
-kbase_hwcnt_backend_csf_if_ring_buf_free_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
- struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf);
+typedef void (*kbase_hwcnt_backend_csf_if_ring_buf_free_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+ struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf);
/**
* typedef kbase_hwcnt_backend_csf_if_timestamp_ns_fn - Get the current
@@ -181,7 +186,8 @@ kbase_hwcnt_backend_csf_if_ring_buf_free_fn(struct kbase_hwcnt_backend_csf_if_ct
*
* Return: CSF interface timestamp in nanoseconds.
*/
-typedef u64 kbase_hwcnt_backend_csf_if_timestamp_ns_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx);
+typedef u64 (*kbase_hwcnt_backend_csf_if_timestamp_ns_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx);
/**
* typedef kbase_hwcnt_backend_csf_if_dump_enable_fn - Setup and enable hardware
@@ -192,10 +198,10 @@ typedef u64 kbase_hwcnt_backend_csf_if_timestamp_ns_fn(struct kbase_hwcnt_backen
*
* Requires lock to be taken before calling.
*/
-typedef void
-kbase_hwcnt_backend_csf_if_dump_enable_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
- struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
- struct kbase_hwcnt_backend_csf_if_enable *enable);
+typedef void (*kbase_hwcnt_backend_csf_if_dump_enable_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx,
+ struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf,
+ struct kbase_hwcnt_backend_csf_if_enable *enable);
/**
* typedef kbase_hwcnt_backend_csf_if_dump_disable_fn - Disable hardware counter
@@ -204,7 +210,8 @@ kbase_hwcnt_backend_csf_if_dump_enable_fn(struct kbase_hwcnt_backend_csf_if_ctx
*
* Requires lock to be taken before calling.
*/
-typedef void kbase_hwcnt_backend_csf_if_dump_disable_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx);
+typedef void (*kbase_hwcnt_backend_csf_if_dump_disable_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx);
/**
* typedef kbase_hwcnt_backend_csf_if_dump_request_fn - Request a HWC dump.
@@ -213,7 +220,8 @@ typedef void kbase_hwcnt_backend_csf_if_dump_disable_fn(struct kbase_hwcnt_backe
*
* Requires lock to be taken before calling.
*/
-typedef void kbase_hwcnt_backend_csf_if_dump_request_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx);
+typedef void (*kbase_hwcnt_backend_csf_if_dump_request_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx);
/**
* typedef kbase_hwcnt_backend_csf_if_get_indexes_fn - Get current extract and
@@ -226,8 +234,8 @@ typedef void kbase_hwcnt_backend_csf_if_dump_request_fn(struct kbase_hwcnt_backe
*
* Requires lock to be taken before calling.
*/
-typedef void kbase_hwcnt_backend_csf_if_get_indexes_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
- u32 *extract_index, u32 *insert_index);
+typedef void (*kbase_hwcnt_backend_csf_if_get_indexes_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 *extract_index, u32 *insert_index);
/**
* typedef kbase_hwcnt_backend_csf_if_set_extract_index_fn - Update the extract
@@ -239,9 +247,8 @@ typedef void kbase_hwcnt_backend_csf_if_get_indexes_fn(struct kbase_hwcnt_backen
*
* Requires lock to be taken before calling.
*/
-typedef void
-kbase_hwcnt_backend_csf_if_set_extract_index_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
- u32 extract_index);
+typedef void (*kbase_hwcnt_backend_csf_if_set_extract_index_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 extract_index);
/**
* typedef kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn - Get the current
@@ -255,9 +262,8 @@ kbase_hwcnt_backend_csf_if_set_extract_index_fn(struct kbase_hwcnt_backend_csf_i
*
* Requires lock to be taken before calling.
*/
-typedef void
-kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx,
- u64 *cycle_counts, u64 clk_enable_map);
+typedef void (*kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn)(
+ struct kbase_hwcnt_backend_csf_if_ctx *ctx, u64 *cycle_counts, u64 clk_enable_map);
/**
* struct kbase_hwcnt_backend_csf_if - Hardware counter backend CSF virtual
@@ -283,20 +289,20 @@ kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn(struct kbase_hwcnt_backend_csf
*/
struct kbase_hwcnt_backend_csf_if {
struct kbase_hwcnt_backend_csf_if_ctx *ctx;
- kbase_hwcnt_backend_csf_if_assert_lock_held_fn *assert_lock_held;
- kbase_hwcnt_backend_csf_if_lock_fn *lock;
- kbase_hwcnt_backend_csf_if_unlock_fn *unlock;
- kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn *get_prfcnt_info;
- kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn *ring_buf_alloc;
- kbase_hwcnt_backend_csf_if_ring_buf_sync_fn *ring_buf_sync;
- kbase_hwcnt_backend_csf_if_ring_buf_free_fn *ring_buf_free;
- kbase_hwcnt_backend_csf_if_timestamp_ns_fn *timestamp_ns;
- kbase_hwcnt_backend_csf_if_dump_enable_fn *dump_enable;
- kbase_hwcnt_backend_csf_if_dump_disable_fn *dump_disable;
- kbase_hwcnt_backend_csf_if_dump_request_fn *dump_request;
- kbase_hwcnt_backend_csf_if_get_indexes_fn *get_indexes;
- kbase_hwcnt_backend_csf_if_set_extract_index_fn *set_extract_index;
- kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn *get_gpu_cycle_count;
+ kbase_hwcnt_backend_csf_if_assert_lock_held_fn assert_lock_held;
+ kbase_hwcnt_backend_csf_if_lock_fn lock;
+ kbase_hwcnt_backend_csf_if_unlock_fn unlock;
+ kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn get_prfcnt_info;
+ kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn ring_buf_alloc;
+ kbase_hwcnt_backend_csf_if_ring_buf_sync_fn ring_buf_sync;
+ kbase_hwcnt_backend_csf_if_ring_buf_free_fn ring_buf_free;
+ kbase_hwcnt_backend_csf_if_timestamp_ns_fn timestamp_ns;
+ kbase_hwcnt_backend_csf_if_dump_enable_fn dump_enable;
+ kbase_hwcnt_backend_csf_if_dump_disable_fn dump_disable;
+ kbase_hwcnt_backend_csf_if_dump_request_fn dump_request;
+ kbase_hwcnt_backend_csf_if_get_indexes_fn get_indexes;
+ kbase_hwcnt_backend_csf_if_set_extract_index_fn set_extract_index;
+ kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn get_gpu_cycle_count;
};
#endif /* #define _KBASE_HWCNT_BACKEND_CSF_IF_H_ */
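Most of this header's churn is one mechanical conversion: each typedef that used to name a
function type now names a pointer-to-function type, so the struct kbase_hwcnt_backend_csf_if
members lose their explicit '*'. The standalone fragment below shows the two styles side by side
with generic names (old_style_fn/new_style_fn are not kbase typedefs); both vtables are
initialised and called identically.

/* Old style: the typedef names a function type, so the vtable member
 * must add the '*' itself.
 */
typedef void old_style_fn(int arg);
struct old_vtable {
    old_style_fn *handler;
};

/* New style: the typedef names a pointer-to-function type, so the
 * vtable member is written without '*'.
 */
typedef void (*new_style_fn)(int arg);
struct new_vtable {
    new_style_fn handler;
};

static void log_arg(int arg)
{
    (void)arg;
}

/* Callers fill and invoke both forms the same way. */
static struct old_vtable ov = { .handler = log_arg };
static struct new_vtable nv = { .handler = log_arg };

int main(void)
{
    ov.handler(1);
    nv.handler(2);
    return 0;
}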
diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c
index c8cf934..1b7a116 100644
--- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c
+++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c
@@ -24,7 +24,7 @@
*/
#include <mali_kbase.h>
-#include <gpu/mali_kbase_gpu_regmap.h>
+#include <hw_access/mali_kbase_hw_access_regmap.h>
#include <device/mali_kbase_device.h>
#include "hwcnt/mali_kbase_hwcnt_gpu.h"
#include "hwcnt/mali_kbase_hwcnt_types.h"
@@ -39,7 +39,6 @@
#include <linux/log2.h>
#include "mali_kbase_ccswe.h"
-
/* Ring buffer virtual address start at 4GB */
#define KBASE_HWC_CSF_RING_BUFFER_VA_START (1ull << 32)
@@ -206,6 +205,20 @@ kbasep_hwcnt_backend_csf_if_fw_cc_disable(struct kbase_hwcnt_backend_csf_if_fw_c
kbase_clk_rate_trace_manager_unsubscribe(rtm, &fw_ctx->rate_listener);
}
+#if !IS_ENABLED(CONFIG_MALI_NO_MALI)
+/**
+ * kbasep_hwcnt_backend_csf_core_mask() - Obtain the shader core mask
+ *
+ * @gpu_props: Non-NULL pointer to the GPU properties structure.
+ *
+ * Return: shader core mask from the coherency information.
+ */
+static u64 kbasep_hwcnt_backend_csf_core_mask(struct kbase_gpu_props *gpu_props)
+{
+ return gpu_props->coherency_info.group.core_mask;
+}
+#endif
+
static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info(
struct kbase_hwcnt_backend_csf_if_ctx *ctx,
struct kbase_hwcnt_backend_csf_if_prfcnt_info *prfcnt_info)
@@ -234,6 +247,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info(
u32 prfcnt_size;
u32 prfcnt_hw_size;
u32 prfcnt_fw_size;
+ u32 csg_count;
+ u32 fw_block_count = 0;
u32 prfcnt_block_size =
KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK * KBASE_HWCNT_VALUE_HW_BYTES;
@@ -242,28 +257,41 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info(
fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx;
kbdev = fw_ctx->kbdev;
+ csg_count = kbdev->csf.global_iface.group_num;
prfcnt_size = kbdev->csf.global_iface.prfcnt_size;
prfcnt_hw_size = GLB_PRFCNT_SIZE_HARDWARE_SIZE_GET(prfcnt_size);
prfcnt_fw_size = GLB_PRFCNT_SIZE_FIRMWARE_SIZE_GET(prfcnt_size);
- fw_ctx->buf_bytes = prfcnt_hw_size + prfcnt_fw_size;
/* Read the block size if the GPU has the register PRFCNT_FEATURES
* which was introduced in architecture version 11.x.7.
*/
- if ((kbdev->gpu_props.props.raw_props.gpu_id & GPU_ID2_PRODUCT_MODEL) >=
- GPU_ID2_PRODUCT_TTUX) {
- prfcnt_block_size = PRFCNT_FEATURES_COUNTER_BLOCK_SIZE_GET(
- kbase_reg_read(kbdev, GPU_CONTROL_REG(PRFCNT_FEATURES)))
+ if (kbase_reg_is_valid(kbdev, GPU_CONTROL_ENUM(PRFCNT_FEATURES))) {
+ prfcnt_block_size = PRFCNT_FEATURES_COUNTER_BLOCK_SIZE_GET(KBASE_REG_READ(
+ kbdev, GPU_CONTROL_ENUM(PRFCNT_FEATURES)))
<< 8;
}
+ /* Extra sanity check to ensure the firmware size matches one of the supported
+ * configurations: no FW blocks at all, a single global FW block, or a global
+ * FW block plus one block per CSG.
+ */
+ if (!prfcnt_fw_size)
+ fw_block_count = 0;
+ else if (prfcnt_fw_size == prfcnt_block_size)
+ fw_block_count = 1;
+ else if (prfcnt_fw_size == ((1 + csg_count) * prfcnt_block_size))
+ fw_block_count = 1 + csg_count;
+ else
+ WARN_ON_ONCE(true);
+
+ fw_ctx->buf_bytes = prfcnt_hw_size + prfcnt_fw_size;
*prfcnt_info = (struct kbase_hwcnt_backend_csf_if_prfcnt_info){
.prfcnt_hw_size = prfcnt_hw_size,
.prfcnt_fw_size = prfcnt_fw_size,
.dump_bytes = fw_ctx->buf_bytes,
.prfcnt_block_size = prfcnt_block_size,
- .l2_count = kbdev->gpu_props.props.l2_props.num_l2_slices,
- .core_mask = kbdev->gpu_props.props.coherency_info.group[0].core_mask,
+ .l2_count = kbdev->gpu_props.num_l2_slices,
+ .core_mask = kbasep_hwcnt_backend_csf_core_mask(&kbdev->gpu_props),
+ .csg_count = fw_block_count > 1 ? csg_count : 0,
.clk_cnt = fw_ctx->clk_cnt,
.clearing_samples = true,
};
@@ -284,7 +312,7 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc(
struct page **page_list;
void *cpu_addr;
int ret;
- int i;
+ size_t i;
size_t num_pages;
u64 flags;
struct kbase_hwcnt_backend_csf_if_fw_ring_buf *fw_ring_buf;
@@ -330,7 +358,7 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc(
/* Get physical page for the buffer */
ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages,
phys, false, NULL);
- if (ret != num_pages)
+ if ((size_t)ret != num_pages)
goto phys_mem_pool_alloc_error;
/* Get the CPU virtual address */
@@ -342,7 +370,7 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc(
goto vmap_error;
flags = KBASE_REG_GPU_WR | KBASE_REG_GPU_NX |
- KBASE_REG_MEMATTR_INDEX(AS_MEMATTR_INDEX_NON_CACHEABLE);
+ KBASE_REG_MEMATTR_INDEX(KBASE_MEMATTR_INDEX_NON_CACHEABLE);
/* Update MMU table */
ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, gpu_va_base >> PAGE_SHIFT, phys,
@@ -508,6 +536,7 @@ kbasep_hwcnt_backend_csf_if_fw_dump_enable(struct kbase_hwcnt_backend_csf_if_ctx
(struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx;
struct kbase_hwcnt_backend_csf_if_fw_ring_buf *fw_ring_buf =
(struct kbase_hwcnt_backend_csf_if_fw_ring_buf *)ring_buf;
+ u32 csg_mask;
WARN_ON(!ctx);
WARN_ON(!ring_buf);
@@ -516,6 +545,7 @@ kbasep_hwcnt_backend_csf_if_fw_dump_enable(struct kbase_hwcnt_backend_csf_if_ctx
kbdev = fw_ctx->kbdev;
global_iface = &kbdev->csf.global_iface;
+ csg_mask = (1 << kbdev->csf.global_iface.group_num) - 1;
/* Configure */
prfcnt_config = GLB_PRFCNT_CONFIG_SIZE_SET(0, fw_ring_buf->buf_count);
@@ -536,6 +566,12 @@ kbasep_hwcnt_backend_csf_if_fw_dump_enable(struct kbase_hwcnt_backend_csf_if_ctx
kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_SHADER_EN, enable->shader_bm);
kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_MMU_L2_EN, enable->mmu_l2_bm);
kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_TILER_EN, enable->tiler_bm);
+ kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_FW_EN, enable->fw_bm);
+ kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CSG_EN, enable->csg_bm);
+
+ /* Enable all of the CSGs by default. */
+ kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CSG_SELECT, csg_mask);
+
/* Configure the HWC set and buffer size */
kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CONFIG, prfcnt_config);
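Two small computations in the firmware interface change above deserve spelling out: how many
firmware counter blocks the reported prfcnt_fw_size implies, and the CSG select mask that enables
every group by default. Restated as standalone helpers with invented names; the real code takes
group_num and the PRFCNT sizes from the firmware global interface and uses WARN_ON_ONCE for the
malformed case.

#include <stddef.h>
#include <stdint.h>

/* Supported firmware configurations: no FW blocks at all, a single
 * global FW block, or a global FW block plus one block per CSG.
 * Anything else is treated as malformed (0 blocks after a warning).
 */
static size_t fw_block_count(size_t prfcnt_fw_size, size_t prfcnt_block_size,
                             uint32_t csg_count)
{
    if (prfcnt_fw_size == 0)
        return 0;
    if (prfcnt_fw_size == prfcnt_block_size)
        return 1;
    if (prfcnt_fw_size == (size_t)(1 + csg_count) * prfcnt_block_size)
        return 1 + csg_count;
    return 0;
}

/* One select bit per CSG, for groups 0..group_num-1; assumes group_num < 32,
 * as the shift in the patch does. E.g. 8 groups -> 0xff.
 */
static uint32_t csg_select_mask(uint32_t group_num)
{
    return (1u << group_num) - 1u;
}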
diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c
index 8b3caac..4df7dd4 100644
--- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c
+++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
*
- * (C) COPYRIGHT 2018-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -80,30 +80,40 @@ struct kbase_hwcnt_jm_physical_layout {
/**
* struct kbase_hwcnt_backend_jm - Instance of a JM hardware counter backend.
- * @info: Info used to create the backend.
- * @kctx: KBase context used for GPU memory allocation and
- * counter dumping.
- * @gpu_dump_va: GPU hardware counter dump buffer virtual address.
- * @cpu_dump_va: CPU mapping of gpu_dump_va.
- * @vmap: Dump buffer vmap.
- * @to_user_buf: HWC sample buffer for client user, size
- * metadata.dump_buf_bytes.
- * @enabled: True if dumping has been enabled, else false.
- * @pm_core_mask: PM state sync-ed shaders core mask for the enabled
- * dumping.
- * @curr_config: Current allocated hardware resources to correctly map the
- * source raw dump buffer to the destination dump buffer.
- * @clk_enable_map: The enable map specifying enabled clock domains.
- * @cycle_count_elapsed:
- * Cycle count elapsed for a given sample period.
- * The top clock cycle, index 0, is read directly from
- * hardware, but the other clock domains need to be
- * calculated with software estimation.
- * @prev_cycle_count: Previous cycle count to calculate the cycle count for
- * sample period.
- * @rate_listener: Clock rate listener callback state.
- * @ccswe_shader_cores: Shader cores cycle count software estimator.
- * @phys_layout: Physical memory layout information of HWC sample buffer.
+ * @info: Info used to create the backend.
+ * @kctx: KBase context used for GPU memory allocation and
+ * counter dumping.
+ * @gpu_dump_va: GPU hardware counter dump buffer virtual address.
+ * @cpu_dump_va: CPU mapping of gpu_dump_va.
+ * @vmap: Dump buffer vmap.
+ * @to_user_buf: HWC sample buffer for client user, size
+ * metadata.dump_buf_bytes.
+ * @enabled: True if dumping has been enabled, else false.
+ * @accum_all_blk_stt: Block state to accumulate on next sample, for all types
+ * of block.
+ * @sampled_all_blk_stt: Block state to accumulate into the current sample, for
+ * all types of block.
+ * @debug_core_mask: User-set mask of shader cores that can be used.
+ * @pm_core_mask: PM state sync-ed shaders core mask for the enabled
+ * dumping.
+ * @curr_config: Current allocated hardware resources to correctly map the
+ * source raw dump buffer to the destination dump buffer.
+ * @max_core_mask: Core mask of all cores allocated to the GPU (non
+ * virtualized platforms) or resource group (virtualized
+ * platforms).
+ * @max_l2_slices: Maximum number of L2 slices allocated to the GPU (non
+ * virtualized platforms) or resource group (virtualized
+ * platforms).
+ * @clk_enable_map: The enable map specifying enabled clock domains.
+ * @cycle_count_elapsed: Cycle count elapsed for a given sample period.
+ * The top clock cycle, index 0, is read directly from
+ * hardware, but the other clock domains need to be
+ * calculated with software estimation.
+ * @prev_cycle_count: Previous cycle count to calculate the cycle count for
+ * sample period.
+ * @rate_listener: Clock rate listener callback state.
+ * @ccswe_shader_cores: Shader cores cycle count software estimator.
+ * @phys_layout: Physical memory layout information of HWC sample buffer.
*/
struct kbase_hwcnt_backend_jm {
const struct kbase_hwcnt_backend_jm_info *info;
@@ -113,8 +123,13 @@ struct kbase_hwcnt_backend_jm {
struct kbase_vmap_struct *vmap;
u64 *to_user_buf;
bool enabled;
+ blk_stt_t accum_all_blk_stt;
+ blk_stt_t sampled_all_blk_stt;
+ u64 debug_core_mask;
u64 pm_core_mask;
struct kbase_hwcnt_curr_config curr_config;
+ u64 max_core_mask;
+ size_t max_l2_slices;
u64 clk_enable_map;
u64 cycle_count_elapsed[BASE_MAX_NR_CLOCKS_REGULATORS];
u64 prev_cycle_count[BASE_MAX_NR_CLOCKS_REGULATORS];
@@ -136,26 +151,22 @@ struct kbase_hwcnt_backend_jm {
static int kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev,
struct kbase_hwcnt_gpu_info *info)
{
- size_t clk;
+ size_t clk, l2_count, core_mask;
if (!kbdev || !info)
return -EINVAL;
#if IS_ENABLED(CONFIG_MALI_NO_MALI)
- info->l2_count = KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS;
- info->core_mask = (1ull << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1;
- info->prfcnt_values_per_block = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK;
-#else /* CONFIG_MALI_NO_MALI */
- {
- const struct base_gpu_props *props = &kbdev->gpu_props.props;
- const size_t l2_count = props->l2_props.num_l2_slices;
- const size_t core_mask = props->coherency_info.group[0].core_mask;
+ l2_count = KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS;
+ core_mask = (1ull << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1;
+#else
+ l2_count = kbdev->gpu_props.num_l2_slices;
+ core_mask = kbdev->gpu_props.coherency_info.group.core_mask;
+#endif
- info->l2_count = l2_count;
- info->core_mask = core_mask;
- info->prfcnt_values_per_block = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK;
- }
-#endif /* CONFIG_MALI_NO_MALI */
+ info->l2_count = l2_count;
+ info->core_mask = core_mask;
+ info->prfcnt_values_per_block = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK;
/* Determine the number of available clock domains. */
for (clk = 0; clk < BASE_MAX_NR_CLOCKS_REGULATORS; clk++) {
@@ -353,9 +364,9 @@ kbasep_hwcnt_backend_jm_dump_enable_nolock(struct kbase_hwcnt_backend *backend,
struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend;
struct kbase_context *kctx;
struct kbase_device *kbdev;
- struct kbase_hwcnt_physical_enable_map phys_enable_map;
+ struct kbase_hwcnt_physical_enable_map phys_enable_map = { 0 };
enum kbase_hwcnt_physical_set phys_counter_set;
- struct kbase_instr_hwcnt_enable enable;
+ struct kbase_instr_hwcnt_enable enable = { 0 };
u64 timestamp_ns;
if (!backend_jm || !enable_map || backend_jm->enabled ||
@@ -371,18 +382,21 @@ kbasep_hwcnt_backend_jm_dump_enable_nolock(struct kbase_hwcnt_backend *backend,
kbase_hwcnt_gpu_set_to_physical(&phys_counter_set, backend_jm->info->counter_set);
- enable.fe_bm = phys_enable_map.fe_bm;
- enable.shader_bm = phys_enable_map.shader_bm;
- enable.tiler_bm = phys_enable_map.tiler_bm;
- enable.mmu_l2_bm = phys_enable_map.mmu_l2_bm;
- enable.counter_set = phys_counter_set;
+ enable = (struct kbase_instr_hwcnt_enable)
+ {
+ .fe_bm = phys_enable_map.fe_bm,
+ .shader_bm = phys_enable_map.shader_bm,
+ .tiler_bm = phys_enable_map.tiler_bm,
+ .mmu_l2_bm = phys_enable_map.mmu_l2_bm,
+ .counter_set = phys_counter_set,
#if IS_ENABLED(CONFIG_MALI_NO_MALI)
- /* The dummy model needs the CPU mapping. */
- enable.dump_buffer = (uintptr_t)backend_jm->cpu_dump_va;
+ /* The dummy model needs the CPU mapping. */
+ .dump_buffer = (uintptr_t)backend_jm->cpu_dump_va,
#else
- enable.dump_buffer = backend_jm->gpu_dump_va;
+ .dump_buffer = backend_jm->gpu_dump_va,
#endif /* CONFIG_MALI_NO_MALI */
- enable.dump_buffer_bytes = backend_jm->info->dump_bytes;
+ .dump_buffer_bytes = backend_jm->info->dump_bytes,
+ };
timestamp_ns = kbasep_hwcnt_backend_jm_timestamp_ns(backend);
@@ -395,9 +409,24 @@ kbasep_hwcnt_backend_jm_dump_enable_nolock(struct kbase_hwcnt_backend *backend,
if (errcode)
goto error;
+ backend_jm->debug_core_mask = kbase_pm_ca_get_debug_core_mask(kbdev);
+ backend_jm->max_l2_slices = backend_jm->info->hwcnt_gpu_info.l2_count;
+ backend_jm->max_core_mask = backend_jm->info->hwcnt_gpu_info.core_mask;
+
backend_jm->pm_core_mask = kbase_pm_ca_get_instr_core_mask(kbdev);
backend_jm->enabled = true;
+ /* Enabling counters is an indication that the power may have previously been off for all
+ * blocks.
+ *
+ * In any case, the counters would not have been counting recently, so an 'off' block state
+ * is an approximation for this.
+ *
+ * This will be transferred to the dump only after a dump_wait(), or dump_disable() in
+ * cases where the caller requested such information. This is to handle when a
+ * dump_enable() happens in between dump_wait() and dump_get().
+ */
+ kbase_hwcnt_block_state_append(&backend_jm->accum_all_blk_stt, KBASE_HWCNT_STATE_OFF);
kbasep_hwcnt_backend_jm_cc_enable(backend_jm, enable_map, timestamp_ns);
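
To make the block-state bookkeeping above concrete, here is a minimal standalone sketch. It assumes blk_stt_t behaves as a small bitmask and that kbase_hwcnt_block_state_append()/_set() reduce to an OR and an assignment; the HWCNT_STATE_* names and helpers below are stand-ins for illustration, not the driver's definitions.

#include <stdint.h>
#include <stdio.h>

typedef uint8_t blk_stt_t;                 /* stand-in for the driver's blk_stt_t */

#define HWCNT_STATE_UNKNOWN ((blk_stt_t)0) /* stand-in flags, assumed semantics */
#define HWCNT_STATE_ON      ((blk_stt_t)(1 << 0))
#define HWCNT_STATE_OFF     ((blk_stt_t)(1 << 1))

/* append: OR another observed state into the running accumulation */
static void block_state_append(blk_stt_t *stt, blk_stt_t new_stt) { *stt |= new_stt; }
/* set: overwrite the accumulation, e.g. once ownership has been handed over */
static void block_state_set(blk_stt_t *stt, blk_stt_t new_stt) { *stt = new_stt; }

int main(void)
{
	blk_stt_t accum = HWCNT_STATE_UNKNOWN;

	block_state_append(&accum, HWCNT_STATE_OFF); /* dump_enable(): blocks had been off */
	block_state_append(&accum, HWCNT_STATE_ON);  /* counters ran before the sample */

	printf("sample reports state 0x%x (OFF and ON for parts of the period)\n", accum);

	block_state_set(&accum, HWCNT_STATE_UNKNOWN); /* state handed to the caller, reset */
	return 0;
}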
@@ -430,12 +459,20 @@ static int kbasep_hwcnt_backend_jm_dump_enable(struct kbase_hwcnt_backend *backe
}
/* JM backend implementation of kbase_hwcnt_backend_dump_disable_fn */
-static void kbasep_hwcnt_backend_jm_dump_disable(struct kbase_hwcnt_backend *backend)
+static void kbasep_hwcnt_backend_jm_dump_disable(struct kbase_hwcnt_backend *backend,
+ struct kbase_hwcnt_dump_buffer *dump_buffer,
+ const struct kbase_hwcnt_enable_map *enable_map)
{
int errcode;
struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend;
- if (WARN_ON(!backend_jm) || !backend_jm->enabled)
+ if (WARN_ON(!backend_jm ||
+ (dump_buffer && (backend_jm->info->metadata != dump_buffer->metadata)) ||
+ (enable_map && (backend_jm->info->metadata != enable_map->metadata)) ||
+ (dump_buffer && !enable_map)))
+ return;
+ /* No WARN needed here, but still return early if backend is already disabled */
+ if (!backend_jm->enabled)
return;
kbasep_hwcnt_backend_jm_cc_disable(backend_jm);
@@ -443,6 +480,42 @@ static void kbasep_hwcnt_backend_jm_dump_disable(struct kbase_hwcnt_backend *bac
errcode = kbase_instr_hwcnt_disable_internal(backend_jm->kctx);
WARN_ON(errcode);
+ /* Disabling HWCNT is an indication that blocks have been powered off. This is important to
+ * know for L2 and Tiler blocks, as this is currently the only way a backend can know if
+ * they are being powered off.
+ *
+ * In any case, even if they weren't really powered off, we won't be counting whilst
+ * disabled.
+ *
+ * Update the block state information in the block state accumulator to show this, so that
+ * in the next dump blocks will have been seen as powered off for some of the time.
+ */
+ kbase_hwcnt_block_state_append(&backend_jm->accum_all_blk_stt, KBASE_HWCNT_STATE_OFF);
+
+ if (dump_buffer) {
+ /* In some use-cases, the caller will need the information whilst the counters are
+ * disabled, but will not be able to call into the backend to dump them. Instead,
+ * they have an opportunity here to request them to be accumulated into their
+ * buffer immediately.
+ *
+ * This consists of taking a sample of the accumulated block state (as though a
+		 * real dump_get() had happened), then transferring ownership of that to the caller
+ * (i.e. erasing our copy of it).
+ */
+ kbase_hwcnt_block_state_accumulate(&backend_jm->sampled_all_blk_stt,
+ &backend_jm->accum_all_blk_stt);
+ kbase_hwcnt_dump_buffer_block_state_update(dump_buffer, enable_map,
+ backend_jm->sampled_all_blk_stt);
+ /* Now the block state has been passed out into the caller's own accumulation
+ * buffer, clear our own accumulated and sampled block state - ownership has been
+ * transferred.
+ */
+ kbase_hwcnt_block_state_set(&backend_jm->sampled_all_blk_stt,
+ KBASE_HWCNT_STATE_UNKNOWN);
+ kbase_hwcnt_block_state_set(&backend_jm->accum_all_blk_stt,
+ KBASE_HWCNT_STATE_UNKNOWN);
+ }
+
backend_jm->enabled = false;
}
@@ -480,8 +553,7 @@ static int kbasep_hwcnt_backend_jm_dump_request(struct kbase_hwcnt_backend *back
*dump_time_ns = kbasep_hwcnt_backend_jm_timestamp_ns(backend);
ret = kbase_instr_hwcnt_request_dump(backend_jm->kctx);
- kbase_hwcnt_metadata_for_each_clock(metadata, clk)
- {
+ kbase_hwcnt_metadata_for_each_clock(metadata, clk) {
if (!kbase_hwcnt_clk_enable_map_enabled(backend_jm->clk_enable_map, clk))
continue;
@@ -514,12 +586,27 @@ static int kbasep_hwcnt_backend_jm_dump_request(struct kbase_hwcnt_backend *back
/* JM backend implementation of kbase_hwcnt_backend_dump_wait_fn */
static int kbasep_hwcnt_backend_jm_dump_wait(struct kbase_hwcnt_backend *backend)
{
+ int errcode;
struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend;
if (!backend_jm || !backend_jm->enabled)
return -EINVAL;
- return kbase_instr_hwcnt_wait_for_dump(backend_jm->kctx);
+ errcode = kbase_instr_hwcnt_wait_for_dump(backend_jm->kctx);
+ if (errcode)
+ return errcode;
+
+ /* Now that we've completed a sample, also sample+clear the accumulated block state.
+ *
+ * This is to ensure that a dump_enable() that happens in between dump_wait() and
+ * dump_get() is reported on the _next_ dump, not the _current_ dump. That is, the block
+ * state is reported at the actual time that counters are being sampled.
+ */
+ kbase_hwcnt_block_state_accumulate(&backend_jm->sampled_all_blk_stt,
+ &backend_jm->accum_all_blk_stt);
+ kbase_hwcnt_block_state_set(&backend_jm->accum_all_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
+
+ return errcode;
}
/* JM backend implementation of kbase_hwcnt_backend_dump_get_fn */
@@ -533,8 +620,8 @@ static int kbasep_hwcnt_backend_jm_dump_get(struct kbase_hwcnt_backend *backend,
#if IS_ENABLED(CONFIG_MALI_NO_MALI)
struct kbase_device *kbdev;
unsigned long flags;
- int errcode;
#endif /* CONFIG_MALI_NO_MALI */
+ int errcode;
if (!backend_jm || !dst || !dst_enable_map ||
(backend_jm->info->metadata != dst->metadata) ||
@@ -548,8 +635,7 @@ static int kbasep_hwcnt_backend_jm_dump_get(struct kbase_hwcnt_backend *backend,
kbasep_hwcnt_backend_jm_dump_sample(backend_jm);
/* Extract elapsed cycle count for each clock domain if enabled. */
- kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk)
- {
+ kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) {
if (!kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk))
continue;
@@ -572,9 +658,18 @@ static int kbasep_hwcnt_backend_jm_dump_get(struct kbase_hwcnt_backend *backend,
if (errcode)
return errcode;
#endif /* CONFIG_MALI_NO_MALI */
- return kbase_hwcnt_jm_dump_get(dst, backend_jm->to_user_buf, dst_enable_map,
- backend_jm->pm_core_mask, &backend_jm->curr_config,
- accumulate);
+ errcode = kbase_hwcnt_jm_dump_get(dst, backend_jm->to_user_buf, dst_enable_map,
+ backend_jm->pm_core_mask, backend_jm->debug_core_mask,
+ backend_jm->max_core_mask, backend_jm->max_l2_slices,
+ &backend_jm->curr_config, accumulate);
+
+ if (errcode)
+ return errcode;
+
+ kbase_hwcnt_dump_buffer_block_state_update(dst, dst_enable_map,
+ backend_jm->sampled_all_blk_stt);
+ kbase_hwcnt_block_state_set(&backend_jm->sampled_all_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
+ return errcode;
}
/**
@@ -705,6 +800,8 @@ static int kbasep_hwcnt_backend_jm_create(const struct kbase_hwcnt_backend_jm_in
kbase_ccswe_init(&backend->ccswe_shader_cores);
backend->rate_listener.notify = kbasep_hwcnt_backend_jm_on_freq_change;
+ kbase_hwcnt_block_state_set(&backend->accum_all_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
+ kbase_hwcnt_block_state_set(&backend->sampled_all_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
*out_backend = backend;
return 0;
@@ -752,7 +849,7 @@ static void kbasep_hwcnt_backend_jm_term(struct kbase_hwcnt_backend *backend)
if (!backend)
return;
- kbasep_hwcnt_backend_jm_dump_disable(backend);
+ kbasep_hwcnt_backend_jm_dump_disable(backend, NULL, NULL);
kbasep_hwcnt_backend_jm_destroy((struct kbase_hwcnt_backend_jm *)backend);
}
diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.c b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.c
index a8654ea..1b54151 100644
--- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.c
+++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
*
- * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2021-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -268,9 +268,9 @@ kbasep_hwcnt_backend_jm_watchdog_info_create(struct kbase_hwcnt_backend_interfac
if (!info)
return NULL;
- *info = (struct kbase_hwcnt_backend_jm_watchdog_info){ .jm_backend_iface = backend_iface,
- .dump_watchdog_iface =
- watchdog_iface };
+ *info = (struct kbase_hwcnt_backend_jm_watchdog_info){
+ .jm_backend_iface = backend_iface, .dump_watchdog_iface = watchdog_iface
+ };
return info;
}
@@ -443,7 +443,8 @@ static int kbasep_hwcnt_backend_jm_watchdog_dump_enable_common(
spin_unlock_irqrestore(&wd_backend->locked.watchdog_lock, flags);
} else
/*Reverting the job manager backend back to disabled*/
- wd_backend->info->jm_backend_iface->dump_disable(wd_backend->jm_backend);
+ wd_backend->info->jm_backend_iface->dump_disable(wd_backend->jm_backend,
+ NULL, NULL);
}
return errcode;
@@ -472,7 +473,10 @@ kbasep_hwcnt_backend_jm_watchdog_dump_enable_nolock(struct kbase_hwcnt_backend *
}
/* Job manager watchdog backend, implementation of dump_disable */
-static void kbasep_hwcnt_backend_jm_watchdog_dump_disable(struct kbase_hwcnt_backend *backend)
+static void
+kbasep_hwcnt_backend_jm_watchdog_dump_disable(struct kbase_hwcnt_backend *backend,
+ struct kbase_hwcnt_dump_buffer *dump_buffer,
+ const struct kbase_hwcnt_enable_map *buf_enable_map)
{
struct kbase_hwcnt_backend_jm_watchdog *const wd_backend = (void *)backend;
unsigned long flags;
@@ -497,7 +501,8 @@ static void kbasep_hwcnt_backend_jm_watchdog_dump_disable(struct kbase_hwcnt_bac
wd_backend->info->dump_watchdog_iface->disable(
wd_backend->info->dump_watchdog_iface->timer);
- wd_backend->info->jm_backend_iface->dump_disable(wd_backend->jm_backend);
+ wd_backend->info->jm_backend_iface->dump_disable(wd_backend->jm_backend, dump_buffer,
+ buf_enable_map);
}
/* Job manager watchdog backend, implementation of dump_clear */
diff --git a/mali_kbase/hwcnt/mali_kbase_hwcnt.c b/mali_kbase/hwcnt/mali_kbase_hwcnt.c
index 34deb5d..8b1de2e 100644
--- a/mali_kbase/hwcnt/mali_kbase_hwcnt.c
+++ b/mali_kbase/hwcnt/mali_kbase_hwcnt.c
@@ -292,7 +292,8 @@ static void kbasep_hwcnt_accumulator_disable(struct kbase_hwcnt_context *hctx, b
accum->accumulated = true;
disable:
- hctx->iface->dump_disable(accum->backend);
+ hctx->iface->dump_disable(accum->backend, (accum->accumulated) ? &accum->accum_buf : NULL,
+ &accum->enable_map);
/* Regardless of any errors during the accumulate, put the accumulator
* in the disabled state.
@@ -453,8 +454,20 @@ static int kbasep_hwcnt_accumulator_dump(struct kbase_hwcnt_context *hctx, u64 *
*/
if ((state == ACCUM_STATE_ENABLED) && new_map) {
/* Backend is only enabled if there were any enabled counters */
- if (cur_map_any_enabled)
- hctx->iface->dump_disable(accum->backend);
+ if (cur_map_any_enabled) {
+ /* In this case we do *not* want to have the buffer updated with extra
+			 * block state; it should instead remain in the backend until the next dump
+ * happens, hence supplying NULL as the dump_buffer parameter here.
+ *
+ * Attempting to take ownership of backend-accumulated block state at this
+			 * point will instead give inaccurate information. For example, the dump
+			 * buffer for a 'set_counters' operation might be dumping a period that
+ * should've been entirely in the 'ON' state, but would report it as
+ * partially in the 'OFF' state. Instead, that 'OFF' state should be
+ * reported in the _next_ dump.
+ */
+ hctx->iface->dump_disable(accum->backend, NULL, NULL);
+ }
/* (Re-)enable the backend if the new map has enabled counters.
* No need to acquire the spinlock, as concurrent enable while
@@ -481,9 +494,15 @@ static int kbasep_hwcnt_accumulator_dump(struct kbase_hwcnt_context *hctx, u64 *
/* If we've not written anything into the dump buffer so far, it
* means there was nothing to write. Zero any enabled counters.
+ *
+ * In this state, the blocks are likely to be off (and at the very least, not
+ * counting), so write in the 'off' block state
*/
- if (!dump_written)
+ if (!dump_written) {
kbase_hwcnt_dump_buffer_zero(dump_buf, cur_map);
+ kbase_hwcnt_dump_buffer_block_state_update(dump_buf, cur_map,
+ KBASE_HWCNT_STATE_OFF);
+ }
}
/* Write out timestamps */
@@ -498,8 +517,13 @@ error:
/* An error was only physically possible if the backend was enabled */
WARN_ON(state != ACCUM_STATE_ENABLED);
- /* Disable the backend, and transition to the error state */
- hctx->iface->dump_disable(accum->backend);
+ /* Disable the backend, and transition to the error state. In this case, we can try to save
+ * the block state into the accumulated buffer, but there's no guarantee we'll have one, so
+	 * this is more of a 'best effort' for error cases. There would be a suitable block
+ * state recorded on the next dump_enable() anyway.
+ */
+ hctx->iface->dump_disable(accum->backend, (accum->accumulated) ? &accum->accum_buf : NULL,
+ cur_map);
spin_lock_irqsave(&hctx->state_lock, flags);
accum->state = ACCUM_STATE_ERROR;
diff --git a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.c b/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.c
index 74916da..5da5645 100644
--- a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.c
+++ b/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
*
- * (C) COPYRIGHT 2018-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -19,10 +19,11 @@
*
*/
+#include <mali_kbase.h>
#include "hwcnt/mali_kbase_hwcnt_gpu.h"
-#include "hwcnt/mali_kbase_hwcnt_types.h"
#include <linux/err.h>
+#include <linux/log2.h>
/** enum enable_map_idx - index into a block enable map that spans multiple u64 array elements
*/
@@ -32,78 +33,107 @@ enum enable_map_idx {
EM_COUNT,
};
-static void kbasep_get_fe_block_type(u64 *dst, enum kbase_hwcnt_set counter_set, bool is_csf)
+static enum kbase_hwcnt_gpu_v5_block_type kbasep_get_fe_block_type(enum kbase_hwcnt_set counter_set,
+ bool is_csf)
{
switch (counter_set) {
case KBASE_HWCNT_SET_PRIMARY:
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE;
case KBASE_HWCNT_SET_SECONDARY:
if (is_csf)
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2;
else
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED;
case KBASE_HWCNT_SET_TERTIARY:
if (is_csf)
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3;
else
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED;
default:
- WARN_ON(true);
+ WARN(true, "Invalid counter set for FE block type: %d", counter_set);
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED;
}
}
-static void kbasep_get_tiler_block_type(u64 *dst, enum kbase_hwcnt_set counter_set)
+static enum kbase_hwcnt_gpu_v5_block_type
+kbasep_get_tiler_block_type(enum kbase_hwcnt_set counter_set)
{
switch (counter_set) {
case KBASE_HWCNT_SET_PRIMARY:
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER;
case KBASE_HWCNT_SET_SECONDARY:
case KBASE_HWCNT_SET_TERTIARY:
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED;
default:
- WARN_ON(true);
+ WARN(true, "Invalid counter set for tiler block type: %d", counter_set);
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED;
}
}
-static void kbasep_get_sc_block_type(u64 *dst, enum kbase_hwcnt_set counter_set, bool is_csf)
+static enum kbase_hwcnt_gpu_v5_block_type kbasep_get_sc_block_type(enum kbase_hwcnt_set counter_set,
+ bool is_csf)
{
switch (counter_set) {
case KBASE_HWCNT_SET_PRIMARY:
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC;
case KBASE_HWCNT_SET_SECONDARY:
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2;
case KBASE_HWCNT_SET_TERTIARY:
if (is_csf)
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3;
else
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED;
default:
- WARN_ON(true);
+ WARN(true, "Invalid counter set for shader core block type: %d", counter_set);
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED;
}
}
-static void kbasep_get_memsys_block_type(u64 *dst, enum kbase_hwcnt_set counter_set)
+
+static enum kbase_hwcnt_gpu_v5_block_type
+kbasep_get_memsys_block_type(enum kbase_hwcnt_set counter_set)
{
switch (counter_set) {
case KBASE_HWCNT_SET_PRIMARY:
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS;
case KBASE_HWCNT_SET_SECONDARY:
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2;
case KBASE_HWCNT_SET_TERTIARY:
- *dst = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED;
- break;
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED;
default:
- WARN_ON(true);
+ WARN(true, "Invalid counter set for Memsys block type: %d", counter_set);
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED;
+ }
+}
+
+static enum kbase_hwcnt_gpu_v5_block_type kbasep_get_fw_block_type(enum kbase_hwcnt_set counter_set)
+{
+ switch (counter_set) {
+ case KBASE_HWCNT_SET_PRIMARY:
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW;
+ case KBASE_HWCNT_SET_SECONDARY:
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW2;
+ case KBASE_HWCNT_SET_TERTIARY:
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW3;
+ default:
+ WARN(true, "Invalid counter set for FW type: %d", counter_set);
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW_UNDEFINED;
+ }
+}
+
+static enum kbase_hwcnt_gpu_v5_block_type
+kbasep_get_csg_block_type(enum kbase_hwcnt_set counter_set)
+{
+ switch (counter_set) {
+ case KBASE_HWCNT_SET_PRIMARY:
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG;
+ case KBASE_HWCNT_SET_SECONDARY:
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG2;
+ case KBASE_HWCNT_SET_TERTIARY:
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG3;
+ default:
+ WARN(true, "Invalid counter set for CSG type: %d", counter_set);
+ return KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG_UNDEFINED;
}
}
@@ -124,49 +154,89 @@ static int kbasep_hwcnt_backend_gpu_metadata_create(const struct kbase_hwcnt_gpu
const struct kbase_hwcnt_metadata **metadata)
{
struct kbase_hwcnt_description desc;
- struct kbase_hwcnt_group_description group;
- struct kbase_hwcnt_block_description blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT];
- size_t non_sc_block_count;
+ struct kbase_hwcnt_block_description blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT] = {};
+ size_t non_core_block_count;
+ size_t core_block_count;
size_t sc_block_count;
+ size_t blk_idx = 0;
- WARN_ON(!gpu_info);
- WARN_ON(!metadata);
+ if (WARN_ON(!gpu_info))
+ return -EINVAL;
- /* Calculate number of block instances that aren't shader cores */
- non_sc_block_count = 2 + gpu_info->l2_count;
+ if (WARN_ON(!metadata))
+ return -EINVAL;
+
+ /* Calculate number of block instances that aren't cores */
+ non_core_block_count = 2 + gpu_info->l2_count;
/* Calculate number of block instances that are shader cores */
- sc_block_count = fls64(gpu_info->core_mask);
+ sc_block_count = (size_t)fls64(gpu_info->core_mask);
+ /* Determine the total number of cores */
+ core_block_count = sc_block_count;
+
+
+ if (gpu_info->has_fw_counters)
+ non_core_block_count += 1 + gpu_info->csg_cnt;
/*
- * A system can have up to 64 shader cores, but the 64-bit
- * availability mask can't physically represent that many cores as well
- * as the other hardware blocks.
- * Error out if there are more blocks than our implementation can
+ * Check we have enough bits to represent the number of cores that
+	 * exist in the system. Error out if there are more blocks than our implementation can
* support.
*/
- if ((sc_block_count + non_sc_block_count) > KBASE_HWCNT_AVAIL_MASK_BITS)
+ if ((core_block_count + non_core_block_count) > KBASE_HWCNT_AVAIL_MASK_BITS)
return -EINVAL;
+	/* On systems that support them, the dump starts with the FW blocks, so
+	 * they should be taken into account first.
+ */
+ if (gpu_info->has_fw_counters) {
+ blks[blk_idx++] = (struct kbase_hwcnt_block_description){
+ .type = kbasep_get_fw_block_type(counter_set),
+ .inst_cnt = 1,
+ .hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ .ctr_cnt = gpu_info->prfcnt_values_per_block -
+ KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ };
+ }
+
+ /* Some systems may support FW counters but not CSG counters, so the
+ * two are handled differently.
+ */
+ if (gpu_info->csg_cnt > 0) {
+ blks[blk_idx++] = (struct kbase_hwcnt_block_description){
+ .type = kbasep_get_csg_block_type(counter_set),
+ .inst_cnt = gpu_info->csg_cnt,
+ .hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ .ctr_cnt = gpu_info->prfcnt_values_per_block -
+ KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ };
+ }
+
/* One Front End block */
- kbasep_get_fe_block_type(&blks[0].type, counter_set, is_csf);
- blks[0].inst_cnt = 1;
- blks[0].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
- blks[0].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
+ blks[blk_idx++] = (struct kbase_hwcnt_block_description){
+ .type = kbasep_get_fe_block_type(counter_set, is_csf),
+ .inst_cnt = 1,
+ .hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ .ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ };
/* One Tiler block */
- kbasep_get_tiler_block_type(&blks[1].type, counter_set);
- blks[1].inst_cnt = 1;
- blks[1].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
- blks[1].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
+ blks[blk_idx++] = (struct kbase_hwcnt_block_description){
+ .type = kbasep_get_tiler_block_type(counter_set),
+ .inst_cnt = 1,
+ .hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ .ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ };
/* l2_count memsys blks */
- kbasep_get_memsys_block_type(&blks[2].type, counter_set);
- blks[2].inst_cnt = gpu_info->l2_count;
- blks[2].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
- blks[2].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
+ blks[blk_idx++] = (struct kbase_hwcnt_block_description){
+ .type = kbasep_get_memsys_block_type(counter_set),
+ .inst_cnt = gpu_info->l2_count,
+ .hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ .ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ };
/*
- * There are as many shader cores in the system as there are bits set in
+ * There are as many cores in the system as there are bits set in
* the core mask. However, the dump buffer memory requirements need to
* take into account the fact that the core mask may be non-contiguous.
*
@@ -179,27 +249,36 @@ static int kbasep_hwcnt_backend_gpu_metadata_create(const struct kbase_hwcnt_gpu
*
* We find the core mask's last set bit to determine the memory
* requirements, and embed the core mask into the availability mask so
- * we can determine later which shader cores physically exist.
+ * we can determine later which cores physically exist.
*/
- kbasep_get_sc_block_type(&blks[3].type, counter_set, is_csf);
- blks[3].inst_cnt = sc_block_count;
- blks[3].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
- blks[3].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK;
+ blks[blk_idx++] = (struct kbase_hwcnt_block_description){
+ .type = kbasep_get_sc_block_type(counter_set, is_csf),
+ .inst_cnt = sc_block_count,
+ .hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ .ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
+ };
+
- WARN_ON(KBASE_HWCNT_V5_BLOCK_TYPE_COUNT != 4);
+ /* Currently, we're only handling a maximum of seven blocks, and this needs
+	 * to be changed whenever the number of blocks increases.
+ */
+ BUILD_BUG_ON(KBASE_HWCNT_V5_BLOCK_TYPE_COUNT != 7);
- group.type = KBASE_HWCNT_GPU_GROUP_TYPE_V5;
- group.blk_cnt = KBASE_HWCNT_V5_BLOCK_TYPE_COUNT;
- group.blks = blks;
+ /* After assembling the block list in the code above, we should not end up with more
+ * elements than KBASE_HWCNT_V5_BLOCK_TYPE_COUNT.
+ */
+ WARN_ON(blk_idx > KBASE_HWCNT_V5_BLOCK_TYPE_COUNT);
- desc.grp_cnt = 1;
- desc.grps = &group;
+ desc.blk_cnt = blk_idx;
+ desc.blks = blks;
desc.clk_cnt = gpu_info->clk_cnt;
/* The JM, Tiler, and L2s are always available, and are before cores */
- desc.avail_mask = (1ull << non_sc_block_count) - 1;
- /* Embed the core mask directly in the availability mask */
- desc.avail_mask |= (gpu_info->core_mask << non_sc_block_count);
+ kbase_hwcnt_set_avail_mask(&desc.avail_mask, 0, 0);
+ kbase_hwcnt_set_avail_mask_bits(&desc.avail_mask, 0, non_core_block_count, U64_MAX);
+ kbase_hwcnt_set_avail_mask_bits(&desc.avail_mask, non_core_block_count, sc_block_count,
+ gpu_info->core_mask);
+
return kbase_hwcnt_metadata_create(&desc, metadata);
}
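
The availability-mask layout described in the comment above can be seen with a small standalone calculation. This sketch assumes the same layout as the replaced direct computation (non-core blocks in the low bits, the possibly sparse core mask embedded directly above them) and uses example values rather than a real GPU configuration; __builtin_clzll stands in for fls64().

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const size_t l2_count = 2;        /* example values only */
	const uint64_t core_mask = 0xb;   /* cores 0, 1 and 3 are present */

	const size_t non_core_block_count = 2 + l2_count;                      /* FE + Tiler + L2s */
	const size_t sc_block_count = 64 - (size_t)__builtin_clzll(core_mask); /* like fls64() */

	uint64_t avail = (1ull << non_core_block_count) - 1; /* non-core blocks always available */
	avail |= core_mask << non_core_block_count;          /* embed the core mask above them */

	printf("%zu non-core blocks + %zu core slots -> avail mask 0x%llx\n",
	       non_core_block_count, sc_block_count, (unsigned long long)avail);
	return 0;
}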
@@ -215,7 +294,7 @@ static size_t kbasep_hwcnt_backend_jm_dump_bytes(const struct kbase_hwcnt_gpu_in
{
WARN_ON(!gpu_info);
- return (2 + gpu_info->l2_count + fls64(gpu_info->core_mask)) *
+ return (2 + gpu_info->l2_count + (size_t)fls64(gpu_info->core_mask)) *
gpu_info->prfcnt_values_per_block * KBASE_HWCNT_VALUE_HW_BYTES;
}
@@ -248,7 +327,10 @@ int kbase_hwcnt_jm_metadata_create(const struct kbase_hwcnt_gpu_info *gpu_info,
* metadata since physical HW uses 32-bit per value but metadata
* specifies 64-bit per value.
*/
- WARN_ON(dump_bytes * 2 != metadata->dump_buf_bytes);
+ if (WARN(dump_bytes * 2 != metadata->dump_buf_bytes,
+ "Dump buffer size expected to be %zu, instead is %zu", dump_bytes * 2,
+ metadata->dump_buf_bytes))
+ return -EINVAL;
*out_metadata = metadata;
*out_dump_bytes = dump_bytes;
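
As a worked example of the size check above (illustrative numbers only): the hardware writes each value as a u32 while the metadata models each value as a u64, so the metadata dump buffer is exactly twice the raw hardware dump size.

#include <stdio.h>

int main(void)
{
	/* Example configuration: FE + Tiler + 2 L2 slices + 4 shader-core slots. */
	const unsigned int blocks = 2 + 2 + 4;
	const unsigned int values_per_block = 64;   /* 4 headers + 60 counters */

	const unsigned int hw_bytes = blocks * values_per_block * 4; /* u32 per value */
	const unsigned int md_bytes = blocks * values_per_block * 8; /* u64 per value */

	printf("hw dump = %u bytes, metadata dump = %u bytes (x2)\n", hw_bytes, md_bytes);
	return 0;
}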
@@ -291,72 +373,76 @@ void kbase_hwcnt_csf_metadata_destroy(const struct kbase_hwcnt_metadata *metadat
kbase_hwcnt_metadata_destroy(metadata);
}
-static bool is_block_type_shader(const u64 grp_type, const u64 blk_type, const size_t blk)
+bool kbase_hwcnt_is_block_type_shader(const enum kbase_hwcnt_gpu_v5_block_type blk_type)
{
- bool is_shader = false;
-
- /* Warn on unknown group type */
- if (WARN_ON(grp_type != KBASE_HWCNT_GPU_GROUP_TYPE_V5))
- return false;
-
if (blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC ||
blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2 ||
blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3 ||
blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED)
- is_shader = true;
+ return true;
- return is_shader;
+ return false;
}
-static bool is_block_type_l2_cache(const u64 grp_type, const u64 blk_type)
+bool kbase_hwcnt_is_block_type_memsys(const enum kbase_hwcnt_gpu_v5_block_type blk_type)
{
- bool is_l2_cache = false;
-
- switch (grp_type) {
- case KBASE_HWCNT_GPU_GROUP_TYPE_V5:
- if (blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS ||
- blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2 ||
- blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED)
- is_l2_cache = true;
- break;
- default:
- /* Warn on unknown group type */
- WARN_ON(true);
- }
+ if (blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS ||
+ blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2 ||
+ blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED)
+ return true;
+
+ return false;
+}
+
+bool kbase_hwcnt_is_block_type_tiler(const enum kbase_hwcnt_gpu_v5_block_type blk_type)
+{
+ if (blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER ||
+ blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED)
+ return true;
+
+ return false;
+}
- return is_l2_cache;
+bool kbase_hwcnt_is_block_type_fe(const enum kbase_hwcnt_gpu_v5_block_type blk_type)
+{
+ if (blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE ||
+ blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2 ||
+ blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3 ||
+ blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED)
+ return true;
+
+ return false;
}
int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
const struct kbase_hwcnt_enable_map *dst_enable_map, u64 pm_core_mask,
+ u64 debug_core_mask, u64 max_core_mask, size_t max_l2_slices,
const struct kbase_hwcnt_curr_config *curr_config, bool accumulate)
{
const struct kbase_hwcnt_metadata *metadata;
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
const u64 *dump_src = src;
size_t src_offset = 0;
u64 core_mask = pm_core_mask;
+ u64 shader_present = curr_config->shader_present;
/* Variables to deal with the current configuration */
- int l2_count = 0;
+ size_t l2_count = 0;
if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata))
return -EINVAL;
metadata = dst->metadata;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
- const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk);
- const size_t ctr_cnt =
- kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk);
- const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk);
- const bool is_shader_core = is_block_type_shader(
- kbase_hwcnt_metadata_group_type(metadata, grp), blk_type, blk);
- const bool is_l2_cache = is_block_type_l2_cache(
- kbase_hwcnt_metadata_group_type(metadata, grp), blk_type);
- const bool is_undefined = kbase_hwcnt_is_block_type_undefined(
- kbase_hwcnt_metadata_group_type(metadata, grp), blk_type);
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
+ const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, blk);
+ const size_t ctr_cnt = kbase_hwcnt_metadata_block_counters_count(metadata, blk);
+ const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, blk);
+ const bool is_shader_core = kbase_hwcnt_is_block_type_shader(blk_type);
+ const bool is_l2_cache = kbase_hwcnt_is_block_type_memsys(blk_type);
+ const bool is_undefined = kbase_hwcnt_is_block_type_undefined(blk_type);
+ blk_stt_t *dst_blk_stt =
+ kbase_hwcnt_dump_buffer_block_state_instance(dst, blk, blk_inst);
bool hw_res_available = true;
/*
@@ -383,45 +469,107 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
/*
* Skip block if no values in the destination block are enabled.
*/
- if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) {
- u64 *dst_blk =
- kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
+ if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst)) {
+ u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, blk, blk_inst);
const u64 *src_blk = dump_src + src_offset;
- bool blk_powered;
+ bool blk_valid = (!is_undefined && hw_res_available);
+
+ if (blk_valid) {
+ bool blk_powered;
+ blk_stt_t current_block_state = 0;
+
+ if (!is_shader_core) {
+ /* The L2 block must be available at this point, or handled
+ * differently below.
+ * Every partition must have a FE and a tiler, so they
+ * must be implicitly available as part of the current
+ * configuration.
+ */
+ blk_powered = true;
+ current_block_state |= KBASE_HWCNT_STATE_AVAILABLE;
+ } else {
+ /* Check the PM core mask to see if the shader core is
+ * powered up.
+ */
+ blk_powered = core_mask & 1;
+
+ /* Set availability bits based on whether the core is
+ * present in both the shader_present AND the core
+ * mask in sysFS. The core masks are shifted to the
+ * right at the end of the loop so always check the
+ * rightmost bit.
+ */
+ if ((shader_present & debug_core_mask) & 0x1)
+ current_block_state |= KBASE_HWCNT_STATE_AVAILABLE;
+ else {
+ /* If this branch is taken, the shader core may
+ * be:
+ * * in the max configuration, but not enabled
+ * through the sysFS core mask
+ * * in the max configuration, but not in the
+ * current configuration
+ * * physically not present
+ */
+ current_block_state |=
+ KBASE_HWCNT_STATE_UNAVAILABLE;
+ }
+ }
- if (!is_shader_core) {
- /* Under the current PM system, counters will
- * only be enabled after all non shader core
- * blocks are powered up.
- */
- blk_powered = true;
- } else {
- /* Check the PM core mask to see if the shader
- * core is powered up.
+ /* Note: KBASE_HWCNT_STATE_OFF for non-shader cores (L2, Tiler, JM)
+ * is handled on this backend's dump_disable function (since
+ * they are considered to always be powered here).
*/
- blk_powered = core_mask & 1;
- }
+ current_block_state |= (blk_powered) ? KBASE_HWCNT_STATE_ON :
+ KBASE_HWCNT_STATE_OFF;
- if (blk_powered && !is_undefined && hw_res_available) {
- /* Only powered and defined blocks have valid data. */
if (accumulate) {
- kbase_hwcnt_dump_buffer_block_accumulate(dst_blk, src_blk,
- hdr_cnt, ctr_cnt);
+ /* Only update existing counter values if block was powered
+ * and valid
+ */
+ if (blk_powered)
+ kbase_hwcnt_dump_buffer_block_accumulate(
+ dst_blk, src_blk, hdr_cnt, ctr_cnt);
+
+ kbase_hwcnt_block_state_append(dst_blk_stt,
+ current_block_state);
} else {
- kbase_hwcnt_dump_buffer_block_copy(dst_blk, src_blk,
- (hdr_cnt + ctr_cnt));
+ if (blk_powered) {
+ kbase_hwcnt_dump_buffer_block_copy(
+ dst_blk, src_blk, (hdr_cnt + ctr_cnt));
+ } else {
+ /* src is garbage, so zero the dst */
+ kbase_hwcnt_dump_buffer_block_zero(
+ dst_blk, (hdr_cnt + ctr_cnt));
+ }
+
+ kbase_hwcnt_block_state_set(dst_blk_stt,
+ current_block_state);
+ }
+ } else if (is_l2_cache && !is_undefined) {
+				/* A defined L2 block can only reach here when the partition does not
+ * own it. Check that the L2 count is within the resource
+ * group or whole GPU's max L2 count, and if so,
+ * mark it as unavailable.
+ */
+ if (l2_count <= max_l2_slices) {
+ kbase_hwcnt_block_state_set(
+ dst_blk_stt, KBASE_HWCNT_STATE_OFF |
+ KBASE_HWCNT_STATE_UNAVAILABLE);
}
+ kbase_hwcnt_dump_buffer_block_zero(dst_blk, (hdr_cnt + ctr_cnt));
} else {
- /* Even though the block might be undefined, the
- * user has enabled counter collection for it.
- * We should not propagate garbage data.
+ /* Even though the block is undefined, the user has
+ * enabled counter collection for it. We should not propagate
+ * garbage data, or copy/accumulate the block states.
*/
if (accumulate) {
/* No-op to preserve existing values */
} else {
- /* src is garbage, so zero the dst */
+ /* src is garbage, so zero the dst and reset block state */
kbase_hwcnt_dump_buffer_block_zero(dst_blk,
(hdr_cnt + ctr_cnt));
+ kbase_hwcnt_block_state_set(dst_blk_stt,
+ KBASE_HWCNT_STATE_UNKNOWN);
}
}
}
@@ -429,66 +577,79 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
/* Just increase the src_offset if the HW is available */
if (hw_res_available)
src_offset += (hdr_cnt + ctr_cnt);
- if (is_shader_core)
- core_mask = core_mask >> 1;
+ if (is_shader_core) {
+ /* Shift each core mask right by 1 */
+ core_mask >>= 1;
+ debug_core_mask >>= 1;
+ max_core_mask >>= 1;
+ shader_present >>= 1;
+ }
}
return 0;
}
int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
- const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate)
+ blk_stt_t *src_block_stt,
+ const struct kbase_hwcnt_enable_map *dst_enable_map,
+ size_t num_l2_slices, u64 shader_present_bitmap, bool accumulate)
{
const struct kbase_hwcnt_metadata *metadata;
const u64 *dump_src = src;
size_t src_offset = 0;
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
+ size_t blk_inst_count = 0;
- if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata))
+ if (!dst || !src || !src_block_stt || !dst_enable_map ||
+ (dst_enable_map->metadata != dst->metadata))
return -EINVAL;
metadata = dst->metadata;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
- const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk);
- const size_t ctr_cnt =
- kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk);
- const uint64_t blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk);
- const bool is_undefined = kbase_hwcnt_is_block_type_undefined(
- kbase_hwcnt_metadata_group_type(metadata, grp), blk_type);
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
+ const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, blk);
+ const size_t ctr_cnt = kbase_hwcnt_metadata_block_counters_count(metadata, blk);
+ const uint64_t blk_type = kbase_hwcnt_metadata_block_type(metadata, blk);
+ const bool is_undefined = kbase_hwcnt_is_block_type_undefined(blk_type);
+ blk_stt_t *dst_blk_stt =
+ kbase_hwcnt_dump_buffer_block_state_instance(dst, blk, blk_inst);
/*
* Skip block if no values in the destination block are enabled.
*/
- if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) {
- u64 *dst_blk =
- kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
+ if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst)) {
+ u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, blk, blk_inst);
const u64 *src_blk = dump_src + src_offset;
if (!is_undefined) {
if (accumulate) {
kbase_hwcnt_dump_buffer_block_accumulate(dst_blk, src_blk,
hdr_cnt, ctr_cnt);
+ kbase_hwcnt_block_state_append(
+ dst_blk_stt, src_block_stt[blk_inst_count]);
} else {
kbase_hwcnt_dump_buffer_block_copy(dst_blk, src_blk,
(hdr_cnt + ctr_cnt));
+ kbase_hwcnt_block_state_set(dst_blk_stt,
+ src_block_stt[blk_inst_count]);
}
} else {
- /* Even though the block might be undefined, the
- * user has enabled counter collection for it.
- * We should not propagate garbage data.
+ /* Even though the block might be undefined, the user has enabled
+ * counter collection for it. We should not propagate garbage
+ * data, or copy/accumulate the block states.
*/
if (accumulate) {
/* No-op to preserve existing values */
} else {
- /* src is garbage, so zero the dst */
+ /* src is garbage, so zero the dst and reset block state */
kbase_hwcnt_dump_buffer_block_zero(dst_blk,
(hdr_cnt + ctr_cnt));
+ kbase_hwcnt_block_state_set(dst_blk_stt,
+ KBASE_HWCNT_STATE_UNKNOWN);
}
}
}
-
+ blk_inst_count++;
src_offset += (hdr_cnt + ctr_cnt);
}
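
The per-core mask handling in kbase_hwcnt_jm_dump_get() above (power taken from the PM core mask, availability from shader_present ANDed with the user/debug core mask, every mask shifted right once per shader core block) can be sketched in isolation. The values below are arbitrary examples and the printed state names are stand-ins, not the driver's KBASE_HWCNT_STATE_* flags.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t pm_core_mask = 0x5;    /* cores currently powered for instrumentation */
	uint64_t shader_present = 0x7;  /* cores in the current configuration */
	uint64_t debug_core_mask = 0x3; /* user-selected cores (sysfs) */
	unsigned int core;

	for (core = 0; core < 3; core++) {
		const int powered = pm_core_mask & 1;
		const int available = (shader_present & debug_core_mask) & 1;

		printf("core %u: %s, %s\n", core, powered ? "ON" : "OFF",
		       available ? "AVAILABLE" : "UNAVAILABLE");

		/* Shift every mask right by one, as at the end of the loop above. */
		pm_core_mask >>= 1;
		shader_present >>= 1;
		debug_core_mask >>= 1;
	}
	return 0;
}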
@@ -541,58 +702,79 @@ void kbase_hwcnt_gpu_enable_map_to_physical(struct kbase_hwcnt_physical_enable_m
u64 shader_bm[EM_COUNT] = { 0 };
u64 tiler_bm[EM_COUNT] = { 0 };
u64 mmu_l2_bm[EM_COUNT] = { 0 };
- size_t grp, blk, blk_inst;
+ u64 fw_bm[EM_COUNT] = { 0 };
+ u64 csg_bm[EM_COUNT] = { 0 };
+ size_t blk, blk_inst;
if (WARN_ON(!src) || WARN_ON(!dst))
return;
metadata = src->metadata;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
- const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp);
- const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk);
- const u64 *blk_map = kbase_hwcnt_enable_map_block_instance(src, grp, blk, blk_inst);
-
- if ((enum kbase_hwcnt_gpu_group_type)grp_type == KBASE_HWCNT_GPU_GROUP_TYPE_V5) {
- const size_t map_stride =
- kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk);
- size_t map_idx;
-
- for (map_idx = 0; map_idx < map_stride; ++map_idx) {
- if (WARN_ON(map_idx >= EM_COUNT))
- break;
-
- switch ((enum kbase_hwcnt_gpu_v5_block_type)blk_type) {
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED:
- /* Nothing to do in this case. */
- break;
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3:
- fe_bm[map_idx] |= blk_map[map_idx];
- break;
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER:
- tiler_bm[map_idx] |= blk_map[map_idx];
- break;
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3:
- shader_bm[map_idx] |= blk_map[map_idx];
- break;
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2:
- mmu_l2_bm[map_idx] |= blk_map[map_idx];
- break;
- default:
- WARN_ON(true);
- }
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
+ const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, blk);
+ const u64 *blk_map = kbase_hwcnt_enable_map_block_instance(src, blk, blk_inst);
+ const size_t map_stride =
+ kbase_hwcnt_metadata_block_enable_map_stride(metadata, blk);
+ size_t map_idx;
+
+ for (map_idx = 0; map_idx < map_stride; ++map_idx) {
+ if (WARN_ON(map_idx >= EM_COUNT))
+ break;
+
+ switch ((enum kbase_hwcnt_gpu_v5_block_type)blk_type) {
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW_UNDEFINED:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG_UNDEFINED:
+ /* Nothing to do in this case. */
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3:
+ fe_bm[map_idx] |= blk_map[map_idx];
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER:
+ tiler_bm[map_idx] |= blk_map[map_idx];
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3:
+ shader_bm[map_idx] |= blk_map[map_idx];
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2:
+ mmu_l2_bm[map_idx] |= blk_map[map_idx];
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW2:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW3:
+ fw_bm[map_idx] |= blk_map[map_idx];
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG2:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG3:
+ csg_bm[map_idx] |= blk_map[map_idx];
+ break;
+ default:
+ WARN(true, "Unknown block type %llu", blk_type);
}
- } else {
- WARN_ON(true);
}
}
@@ -603,6 +785,8 @@ void kbase_hwcnt_gpu_enable_map_to_physical(struct kbase_hwcnt_physical_enable_m
kbase_hwcnt_backend_gpu_block_map_to_physical(tiler_bm[EM_LO], tiler_bm[EM_HI]);
dst->mmu_l2_bm =
kbase_hwcnt_backend_gpu_block_map_to_physical(mmu_l2_bm[EM_LO], mmu_l2_bm[EM_HI]);
+ dst->fw_bm = kbase_hwcnt_backend_gpu_block_map_to_physical(fw_bm[EM_LO], fw_bm[EM_HI]);
+ dst->csg_bm = kbase_hwcnt_backend_gpu_block_map_to_physical(csg_bm[EM_LO], csg_bm[EM_HI]);
}
void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, enum kbase_hwcnt_set src)
@@ -625,72 +809,102 @@ void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, enum kb
void kbase_hwcnt_gpu_enable_map_from_physical(struct kbase_hwcnt_enable_map *dst,
const struct kbase_hwcnt_physical_enable_map *src)
{
- const struct kbase_hwcnt_metadata *metadata;
+ struct kbase_hwcnt_enable_cm cm = {};
- u64 fe_bm[EM_COUNT] = { 0 };
- u64 shader_bm[EM_COUNT] = { 0 };
- u64 tiler_bm[EM_COUNT] = { 0 };
- u64 mmu_l2_bm[EM_COUNT] = { 0 };
- size_t grp, blk, blk_inst;
+ if (WARN_ON(!src) || WARN_ON(!dst))
+ return;
+
+ kbasep_hwcnt_backend_gpu_block_map_from_physical(src->fe_bm, &cm.fe_bm[EM_LO],
+ &cm.fe_bm[EM_HI]);
+ kbasep_hwcnt_backend_gpu_block_map_from_physical(src->shader_bm, &cm.shader_bm[EM_LO],
+ &cm.shader_bm[EM_HI]);
+ kbasep_hwcnt_backend_gpu_block_map_from_physical(src->tiler_bm, &cm.tiler_bm[EM_LO],
+ &cm.tiler_bm[EM_HI]);
+ kbasep_hwcnt_backend_gpu_block_map_from_physical(src->mmu_l2_bm, &cm.mmu_l2_bm[EM_LO],
+ &cm.mmu_l2_bm[EM_HI]);
+ kbasep_hwcnt_backend_gpu_block_map_from_physical(src->fw_bm, &cm.fw_bm[EM_LO],
+ &cm.fw_bm[EM_HI]);
+ kbasep_hwcnt_backend_gpu_block_map_from_physical(src->csg_bm, &cm.csg_bm[EM_LO],
+ &cm.csg_bm[EM_HI]);
+
+ kbase_hwcnt_gpu_enable_map_from_cm(dst, &cm);
+}
+
+void kbase_hwcnt_gpu_enable_map_from_cm(struct kbase_hwcnt_enable_map *dst,
+ const struct kbase_hwcnt_enable_cm *src)
+{
+ const struct kbase_hwcnt_metadata *metadata;
+ size_t blk, blk_inst;
if (WARN_ON(!src) || WARN_ON(!dst))
return;
metadata = dst->metadata;
- kbasep_hwcnt_backend_gpu_block_map_from_physical(src->fe_bm, &fe_bm[EM_LO], &fe_bm[EM_HI]);
- kbasep_hwcnt_backend_gpu_block_map_from_physical(src->shader_bm, &shader_bm[EM_LO],
- &shader_bm[EM_HI]);
- kbasep_hwcnt_backend_gpu_block_map_from_physical(src->tiler_bm, &tiler_bm[EM_LO],
- &tiler_bm[EM_HI]);
- kbasep_hwcnt_backend_gpu_block_map_from_physical(src->mmu_l2_bm, &mmu_l2_bm[EM_LO],
- &mmu_l2_bm[EM_HI]);
-
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
- const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp);
- const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk);
- u64 *blk_map = kbase_hwcnt_enable_map_block_instance(dst, grp, blk, blk_inst);
-
- if ((enum kbase_hwcnt_gpu_group_type)grp_type == KBASE_HWCNT_GPU_GROUP_TYPE_V5) {
- const size_t map_stride =
- kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk);
- size_t map_idx;
-
- for (map_idx = 0; map_idx < map_stride; ++map_idx) {
- if (WARN_ON(map_idx >= EM_COUNT))
- break;
-
- switch ((enum kbase_hwcnt_gpu_v5_block_type)blk_type) {
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED:
- /* Nothing to do in this case. */
- break;
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3:
- blk_map[map_idx] = fe_bm[map_idx];
- break;
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER:
- blk_map[map_idx] = tiler_bm[map_idx];
- break;
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3:
- blk_map[map_idx] = shader_bm[map_idx];
- break;
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS:
- case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2:
- blk_map[map_idx] = mmu_l2_bm[map_idx];
- break;
- default:
- WARN_ON(true);
- }
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
+ const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, blk);
+ u64 *blk_map = kbase_hwcnt_enable_map_block_instance(dst, blk, blk_inst);
+ const size_t map_stride =
+ kbase_hwcnt_metadata_block_enable_map_stride(metadata, blk);
+ size_t map_idx;
+
+ for (map_idx = 0; map_idx < map_stride; ++map_idx) {
+ if (WARN_ON(map_idx >= EM_COUNT))
+ break;
+
+ switch ((enum kbase_hwcnt_gpu_v5_block_type)blk_type) {
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW_UNDEFINED:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG_UNDEFINED:
+ /* Nothing to do in this case. */
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3:
+ blk_map[map_idx] = src->fe_bm[map_idx];
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER:
+ blk_map[map_idx] = src->tiler_bm[map_idx];
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3:
+ blk_map[map_idx] = src->shader_bm[map_idx];
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2:
+ blk_map[map_idx] = src->mmu_l2_bm[map_idx];
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW2:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW3:
+ blk_map[map_idx] = src->fw_bm[map_idx];
+ break;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG2:
+ fallthrough;
+ case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG3:
+ blk_map[map_idx] = src->csg_bm[map_idx];
+ break;
+ default:
+ WARN(true, "Invalid block type %llu", blk_type);
}
- } else {
- WARN_ON(true);
}
}
}
@@ -699,40 +913,34 @@ void kbase_hwcnt_gpu_patch_dump_headers(struct kbase_hwcnt_dump_buffer *buf,
const struct kbase_hwcnt_enable_map *enable_map)
{
const struct kbase_hwcnt_metadata *metadata;
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
if (WARN_ON(!buf) || WARN_ON(!enable_map) || WARN_ON(buf->metadata != enable_map->metadata))
return;
metadata = buf->metadata;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
- const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp);
- u64 *buf_blk = kbase_hwcnt_dump_buffer_block_instance(buf, grp, blk, blk_inst);
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
+ u64 *buf_blk = kbase_hwcnt_dump_buffer_block_instance(buf, blk, blk_inst);
const u64 *blk_map =
- kbase_hwcnt_enable_map_block_instance(enable_map, grp, blk, blk_inst);
+ kbase_hwcnt_enable_map_block_instance(enable_map, blk, blk_inst);
- if ((enum kbase_hwcnt_gpu_group_type)grp_type == KBASE_HWCNT_GPU_GROUP_TYPE_V5) {
- const size_t map_stride =
- kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk);
- u64 prfcnt_bm[EM_COUNT] = { 0 };
- u32 prfcnt_en = 0;
- size_t map_idx;
+ const size_t map_stride =
+ kbase_hwcnt_metadata_block_enable_map_stride(metadata, blk);
+ u64 prfcnt_bm[EM_COUNT] = { 0 };
+ u32 prfcnt_en = 0;
+ size_t map_idx;
- for (map_idx = 0; map_idx < map_stride; ++map_idx) {
- if (WARN_ON(map_idx >= EM_COUNT))
- break;
+ for (map_idx = 0; map_idx < map_stride; ++map_idx) {
+ if (WARN_ON(map_idx >= EM_COUNT))
+ break;
- prfcnt_bm[map_idx] = blk_map[map_idx];
- }
+ prfcnt_bm[map_idx] = blk_map[map_idx];
+ }
- prfcnt_en = kbase_hwcnt_backend_gpu_block_map_to_physical(prfcnt_bm[EM_LO],
- prfcnt_bm[EM_HI]);
+ prfcnt_en = kbase_hwcnt_backend_gpu_block_map_to_physical(prfcnt_bm[EM_LO],
+ prfcnt_bm[EM_HI]);
- buf_blk[KBASE_HWCNT_V5_PRFCNT_EN_HEADER] = prfcnt_en;
- } else {
- WARN_ON(true);
- }
+ buf_blk[KBASE_HWCNT_V5_PRFCNT_EN_HEADER] = prfcnt_en;
}
}
diff --git a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.h b/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.h
index a49c31e..4339fdd 100644
--- a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.h
+++ b/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
*
- * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -22,6 +22,8 @@
#ifndef _KBASE_HWCNT_GPU_H_
#define _KBASE_HWCNT_GPU_H_
+#include "hwcnt/mali_kbase_hwcnt_types.h"
+
#include <linux/bug.h>
#include <linux/types.h>
@@ -31,10 +33,10 @@ struct kbase_hwcnt_enable_map;
struct kbase_hwcnt_dump_buffer;
/* Hardware counter version 5 definitions, V5 is the only supported version. */
-#define KBASE_HWCNT_V5_BLOCK_TYPE_COUNT 4
+#define KBASE_HWCNT_V5_BLOCK_TYPE_COUNT 7
#define KBASE_HWCNT_V5_HEADERS_PER_BLOCK 4
#define KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK 60
-#define KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK \
+#define KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK \
(KBASE_HWCNT_V5_HEADERS_PER_BLOCK + KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK)
/* FrontEnd block count in V5 GPU hardware counter. */
@@ -49,15 +51,6 @@ struct kbase_hwcnt_dump_buffer;
#define KBASE_HWCNT_VALUE_HW_BYTES (sizeof(u32))
/**
- * enum kbase_hwcnt_gpu_group_type - GPU hardware counter group types, used to
- * identify metadata groups.
- * @KBASE_HWCNT_GPU_GROUP_TYPE_V5: GPU V5 group type.
- */
-enum kbase_hwcnt_gpu_group_type {
- KBASE_HWCNT_GPU_GROUP_TYPE_V5,
-};
-
-/**
* enum kbase_hwcnt_gpu_v5_block_type - GPU V5 hardware counter block types,
* used to identify metadata blocks.
* @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE: Front End block (Job manager
@@ -79,6 +72,14 @@ enum kbase_hwcnt_gpu_group_type {
* @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS: Memsys block.
* @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2: Secondary Memsys block.
* @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED: Undefined Memsys block.
+ * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW: FW block.
+ * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW2: Secondary FW block.
+ * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW3: Tertiary FW block.
+ * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW_UNDEFINED: Undefined FW block.
+ * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG: CSG block.
+ * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG2: Secondary CSG block.
+ * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG3: Tertiary CSG block.
+ * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG_UNDEFINED: Undefined CSG block.
*/
enum kbase_hwcnt_gpu_v5_block_type {
KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE,
@@ -94,6 +95,14 @@ enum kbase_hwcnt_gpu_v5_block_type {
KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS,
KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2,
KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED,
+ KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW,
+ KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW2,
+ KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW3,
+ KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW_UNDEFINED,
+ KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG,
+ KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG2,
+ KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG3,
+ KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG_UNDEFINED,
};
/**
@@ -117,12 +126,34 @@ enum kbase_hwcnt_set {
* @shader_bm: Shader counters selection bitmask.
* @tiler_bm: Tiler counters selection bitmask.
* @mmu_l2_bm: MMU_L2 counters selection bitmask.
+ * @fw_bm: CSF firmware counters selection bitmask.
+ * @csg_bm: CSF CSG counters selection bitmask.
*/
struct kbase_hwcnt_physical_enable_map {
u32 fe_bm;
u32 shader_bm;
u32 tiler_bm;
u32 mmu_l2_bm;
+ u32 fw_bm;
+ u32 csg_bm;
+};
+
+/**
+ * struct kbase_hwcnt_enable_cm - 128-bit enable counter masks.
+ * @fe_bm: Front end (JM/CSHW) counters selection bitmask.
+ * @shader_bm: Shader counters selection bitmask.
+ * @tiler_bm: Tiler counters selection bitmask.
+ * @mmu_l2_bm: MMU_L2 counters selection bitmask.
+ * @fw_bm: CSF firmware counters selection bitmask.
+ * @csg_bm: CSF CSG counters selection bitmask.
+ */
+struct kbase_hwcnt_enable_cm {
+ u64 fe_bm[2];
+ u64 shader_bm[2];
+ u64 tiler_bm[2];
+ u64 mmu_l2_bm[2];
+ u64 fw_bm[2];
+ u64 csg_bm[2];
};
/*
@@ -140,14 +171,18 @@ enum kbase_hwcnt_physical_set {
* @l2_count: L2 cache count.
* @core_mask: Shader core mask. May be sparse.
* @clk_cnt: Number of clock domains available.
+ * @csg_cnt: Number of CSGs available.
* @prfcnt_values_per_block: Total entries (header + counters) of performance
* counter per block.
+ * @has_fw_counters: Whether the GPU has FW counters available.
*/
struct kbase_hwcnt_gpu_info {
size_t l2_count;
u64 core_mask;
u8 clk_cnt;
+ u8 csg_cnt;
size_t prfcnt_values_per_block;
+ bool has_fw_counters;
};
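
As an illustration only (the values are hypothetical, not taken from this diff), a CSF backend might describe its hardware to the metadata creation path like this; csg_cnt and has_fw_counters are the fields added by this change:

	struct kbase_hwcnt_gpu_info info = {
		.l2_count = 2,			/* two L2 slices */
		.core_mask = 0xF,		/* four shader cores, dense mask */
		.clk_cnt = 1,			/* one clock domain */
		.csg_cnt = 4,			/* new in this change: number of CSGs */
		.prfcnt_values_per_block = 64,	/* headers + counters per block */
		.has_fw_counters = true,	/* new in this change: FW blocks present */
	};
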
/**
@@ -197,18 +232,12 @@ struct kbase_hwcnt_curr_config {
/**
* kbase_hwcnt_is_block_type_undefined() - Check if a block type is undefined.
*
- * @grp_type: Hardware counter group type.
* @blk_type: Hardware counter block type.
*
* Return: true if the block type is undefined, else false.
*/
-static inline bool kbase_hwcnt_is_block_type_undefined(const uint64_t grp_type,
- const uint64_t blk_type)
+static inline bool kbase_hwcnt_is_block_type_undefined(const uint64_t blk_type)
{
- /* Warn on unknown group type */
- if (WARN_ON(grp_type != KBASE_HWCNT_GPU_GROUP_TYPE_V5))
- return false;
-
return (blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED ||
blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED ||
blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED ||
@@ -264,16 +293,23 @@ void kbase_hwcnt_csf_metadata_destroy(const struct kbase_hwcnt_metadata *metadat
* kbase_hwcnt_jm_dump_get() - Copy or accumulate enabled counters from the raw
* dump buffer in src into the dump buffer
* abstraction in dst.
- * @dst: Non-NULL pointer to destination dump buffer.
- * @src: Non-NULL pointer to source raw dump buffer, of same length
- * as dump_buf_bytes in the metadata of destination dump
- * buffer.
- * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values.
- * @pm_core_mask: PM state synchronized shaders core mask with the dump.
- * @curr_config: Current allocated hardware resources to correctly map the
- * source raw dump buffer to the destination dump buffer.
- * @accumulate: True if counters in source should be accumulated into
- * destination, rather than copied.
+ * @dst: Non-NULL pointer to destination dump buffer.
+ * @src: Non-NULL pointer to source raw dump buffer, of same length
+ * as dump_buf_bytes in the metadata of destination dump
+ * buffer.
+ * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values.
+ * @pm_core_mask: PM state synchronized shaders core mask with the dump.
+ * @debug_core_mask: User-set mask of cores to be used by the GPU.
+ * @max_core_mask: Core mask of all cores allocated to the GPU (non
+ * virtualized platforms) or resource group (virtualized
+ * platforms).
+ * @max_l2_slices: Maximum number of L2 slices allocated to the GPU (non
+ *                   virtualized platforms) or resource group (virtualized
+ * platforms).
+ * @curr_config: Current allocated hardware resources to correctly map the
+ * source raw dump buffer to the destination dump buffer.
+ * @accumulate: True if counters in source should be accumulated into
+ * destination, rather than copied.
*
* The dst and dst_enable_map MUST have been created from the same metadata as
* returned from the call to kbase_hwcnt_jm_metadata_create as was used to get
@@ -283,19 +319,23 @@ void kbase_hwcnt_csf_metadata_destroy(const struct kbase_hwcnt_metadata *metadat
*/
int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
const struct kbase_hwcnt_enable_map *dst_enable_map,
- const u64 pm_core_mask,
- const struct kbase_hwcnt_curr_config *curr_config, bool accumulate);
+ const u64 pm_core_mask, u64 debug_core_mask, u64 max_core_mask,
+ size_t max_l2_slices, const struct kbase_hwcnt_curr_config *curr_config,
+ bool accumulate);
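
A hedged sketch of the updated call-site shape (the surrounding local variable names are illustrative, not taken from this diff): the three new arguments describe which cores and L2 slices could legitimately contribute counters, so the dump code can treat the remainder as unavailable.

	err = kbase_hwcnt_jm_dump_get(dst, src, dst_enable_map,
				      pm_core_mask,	/* cores currently powered */
				      debug_core_mask,	/* user-requested core mask */
				      max_core_mask,	/* all cores owned by this GPU/group */
				      max_l2_slices,	/* all L2 slices owned by this GPU/group */
				      curr_config, true /* accumulate */);
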
/**
* kbase_hwcnt_csf_dump_get() - Copy or accumulate enabled counters from the raw
* dump buffer in src into the dump buffer
* abstraction in dst.
- * @dst: Non-NULL pointer to destination dump buffer.
- * @src: Non-NULL pointer to source raw dump buffer, of same length
- * as dump_buf_bytes in the metadata of dst dump buffer.
- * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values.
- * @accumulate: True if counters in src should be accumulated into
- * destination, rather than copied.
+ * @dst: Non-NULL pointer to destination dump buffer.
+ * @src: Non-NULL pointer to source raw dump buffer, of same length
+ * as dump_buf_bytes in the metadata of dst dump buffer.
+ * @src_block_stt: Non-NULL pointer to source block state buffer.
+ * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values.
+ * @num_l2_slices: Current number of L2 slices allocated to the GPU.
+ * @shader_present_bitmap: Current shader-present bitmap that is allocated to the GPU.
+ * @accumulate: True if counters in src should be accumulated into
+ * destination, rather than copied.
*
* The dst and dst_enable_map MUST have been created from the same metadata as
* returned from the call to kbase_hwcnt_csf_metadata_create as was used to get
@@ -304,7 +344,9 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
* Return: 0 on success, else error code.
*/
int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
- const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate);
+ blk_stt_t *src_block_stt,
+ const struct kbase_hwcnt_enable_map *dst_enable_map,
+ size_t num_l2_slices, u64 shader_present_bitmap, bool accumulate);
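
Correspondingly, a sketch of the CSF variant after this change (again with illustrative local names): src_block_stt carries the per-block state captured alongside the sample, and the L2/shader-present arguments describe the currently allocated hardware.

	err = kbase_hwcnt_csf_dump_get(dst, src,
				       src_block_stt,		/* per-block state for this sample */
				       dst_enable_map,
				       num_l2_slices,		/* L2 slices currently allocated */
				       shader_present_bitmap,	/* shader cores currently allocated */
				       false /* copy rather than accumulate */);
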
/**
* kbase_hwcnt_backend_gpu_block_map_to_physical() - Convert from a block
@@ -404,4 +446,23 @@ void kbase_hwcnt_gpu_enable_map_from_physical(struct kbase_hwcnt_enable_map *dst
void kbase_hwcnt_gpu_patch_dump_headers(struct kbase_hwcnt_dump_buffer *buf,
const struct kbase_hwcnt_enable_map *enable_map);
+bool kbase_hwcnt_is_block_type_shader(const enum kbase_hwcnt_gpu_v5_block_type blk_type);
+
+bool kbase_hwcnt_is_block_type_memsys(const enum kbase_hwcnt_gpu_v5_block_type blk_type);
+
+bool kbase_hwcnt_is_block_type_tiler(const enum kbase_hwcnt_gpu_v5_block_type blk_type);
+
+bool kbase_hwcnt_is_block_type_fe(const enum kbase_hwcnt_gpu_v5_block_type blk_type);
+/**
+ * kbase_hwcnt_gpu_enable_map_from_cm() - Builds enable map abstraction from
+ * counter selection bitmasks.
+ * @dst: Non-NULL pointer to destination enable map abstraction.
+ * @src: Non-NULL pointer to source counter selection bitmasks.
+ *
+ * The dst must have been created from a metadata returned from a call to
+ * kbase_hwcnt_jm_metadata_create or kbase_hwcnt_csf_metadata_create.
+ */
+void kbase_hwcnt_gpu_enable_map_from_cm(struct kbase_hwcnt_enable_map *dst,
+ const struct kbase_hwcnt_enable_cm *src);
+
#endif /* _KBASE_HWCNT_GPU_H_ */
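
A minimal usage sketch for the new conversion helper. The split of each 128-bit mask into element [0] (low 64 bits) and element [1] (high 64 bits) is an assumption made for illustration and should be checked against the implementation:

	struct kbase_hwcnt_enable_cm cm = { 0 };

	cm.fe_bm[0] = ~0ULL;		/* enable the low-half front-end counters */
	cm.shader_bm[0] = ~0ULL;	/* enable the low-half shader counters */
	cm.fw_bm[0] = ~0ULL;		/* new in this change: FW counters */
	cm.csg_bm[0] = ~0ULL;		/* new in this change: CSG counters */

	/* dst must have been created from JM or CSF metadata. */
	kbase_hwcnt_gpu_enable_map_from_cm(dst, &cm);
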
diff --git a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu_narrow.c b/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu_narrow.c
deleted file mode 100644
index 0cf2f94..0000000
--- a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu_narrow.c
+++ /dev/null
@@ -1,298 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
-/*
- *
- * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
- *
- * This program is free software and is provided to you under the terms of the
- * GNU General Public License version 2 as published by the Free Software
- * Foundation, and any use by you of this program is subject to the terms
- * of such GNU license.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- */
-
-#include "hwcnt/mali_kbase_hwcnt_gpu.h"
-#include "hwcnt/mali_kbase_hwcnt_gpu_narrow.h"
-
-#include <linux/bug.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-int kbase_hwcnt_gpu_metadata_narrow_create(const struct kbase_hwcnt_metadata_narrow **dst_md_narrow,
- const struct kbase_hwcnt_metadata *src_md)
-{
- struct kbase_hwcnt_description desc;
- struct kbase_hwcnt_group_description group;
- struct kbase_hwcnt_block_description blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT];
- size_t prfcnt_values_per_block;
- size_t blk;
- int err;
- struct kbase_hwcnt_metadata_narrow *metadata_narrow;
-
- if (!dst_md_narrow || !src_md || !src_md->grp_metadata ||
- !src_md->grp_metadata[0].blk_metadata)
- return -EINVAL;
-
- /* Only support 1 group count and KBASE_HWCNT_V5_BLOCK_TYPE_COUNT block
- * count in the metadata.
- */
- if ((kbase_hwcnt_metadata_group_count(src_md) != 1) ||
- (kbase_hwcnt_metadata_block_count(src_md, 0) != KBASE_HWCNT_V5_BLOCK_TYPE_COUNT))
- return -EINVAL;
-
- /* Get the values count in the first block. */
- prfcnt_values_per_block = kbase_hwcnt_metadata_block_values_count(src_md, 0, 0);
-
- /* check all blocks should have same values count. */
- for (blk = 1; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) {
- size_t val_cnt = kbase_hwcnt_metadata_block_values_count(src_md, 0, blk);
- if (val_cnt != prfcnt_values_per_block)
- return -EINVAL;
- }
-
- /* Only support 64 and 128 entries per block. */
- if ((prfcnt_values_per_block != 64) && (prfcnt_values_per_block != 128))
- return -EINVAL;
-
- metadata_narrow = kmalloc(sizeof(*metadata_narrow), GFP_KERNEL);
- if (!metadata_narrow)
- return -ENOMEM;
-
- /* Narrow to 64 entries per block to keep API backward compatibility. */
- prfcnt_values_per_block = 64;
-
- for (blk = 0; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) {
- size_t blk_hdr_cnt = kbase_hwcnt_metadata_block_headers_count(src_md, 0, blk);
- blks[blk] = (struct kbase_hwcnt_block_description){
- .type = kbase_hwcnt_metadata_block_type(src_md, 0, blk),
- .inst_cnt = kbase_hwcnt_metadata_block_instance_count(src_md, 0, blk),
- .hdr_cnt = blk_hdr_cnt,
- .ctr_cnt = prfcnt_values_per_block - blk_hdr_cnt,
- };
- }
-
- group = (struct kbase_hwcnt_group_description){
- .type = kbase_hwcnt_metadata_group_type(src_md, 0),
- .blk_cnt = KBASE_HWCNT_V5_BLOCK_TYPE_COUNT,
- .blks = blks,
- };
-
- desc = (struct kbase_hwcnt_description){
- .grp_cnt = kbase_hwcnt_metadata_group_count(src_md),
- .avail_mask = src_md->avail_mask,
- .clk_cnt = src_md->clk_cnt,
- .grps = &group,
- };
-
- err = kbase_hwcnt_metadata_create(&desc, &metadata_narrow->metadata);
- if (!err) {
- /* Narrow down the buffer size to half as the narrowed metadata
- * only supports 32-bit but the created metadata uses 64-bit for
- * block entry.
- */
- metadata_narrow->dump_buf_bytes = metadata_narrow->metadata->dump_buf_bytes >> 1;
- *dst_md_narrow = metadata_narrow;
- } else {
- kfree(metadata_narrow);
- }
-
- return err;
-}
-
-void kbase_hwcnt_gpu_metadata_narrow_destroy(const struct kbase_hwcnt_metadata_narrow *md_narrow)
-{
- if (!md_narrow)
- return;
-
- kbase_hwcnt_metadata_destroy(md_narrow->metadata);
- kfree(md_narrow);
-}
-
-int kbase_hwcnt_dump_buffer_narrow_alloc(const struct kbase_hwcnt_metadata_narrow *md_narrow,
- struct kbase_hwcnt_dump_buffer_narrow *dump_buf)
-{
- size_t dump_buf_bytes;
- size_t clk_cnt_buf_bytes;
- u8 *buf;
-
- if (!md_narrow || !dump_buf)
- return -EINVAL;
-
- dump_buf_bytes = md_narrow->dump_buf_bytes;
- clk_cnt_buf_bytes = sizeof(*dump_buf->clk_cnt_buf) * md_narrow->metadata->clk_cnt;
-
- /* Make a single allocation for both dump_buf and clk_cnt_buf. */
- buf = kmalloc(dump_buf_bytes + clk_cnt_buf_bytes, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- *dump_buf = (struct kbase_hwcnt_dump_buffer_narrow){
- .md_narrow = md_narrow,
- .dump_buf = (u32 *)buf,
- .clk_cnt_buf = (u64 *)(buf + dump_buf_bytes),
- };
-
- return 0;
-}
-
-void kbase_hwcnt_dump_buffer_narrow_free(struct kbase_hwcnt_dump_buffer_narrow *dump_buf_narrow)
-{
- if (!dump_buf_narrow)
- return;
-
- kfree(dump_buf_narrow->dump_buf);
- *dump_buf_narrow = (struct kbase_hwcnt_dump_buffer_narrow){ .md_narrow = NULL,
- .dump_buf = NULL,
- .clk_cnt_buf = NULL };
-}
-
-int kbase_hwcnt_dump_buffer_narrow_array_alloc(
- const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t n,
- struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs)
-{
- struct kbase_hwcnt_dump_buffer_narrow *buffers;
- size_t buf_idx;
- unsigned int order;
- unsigned long addr;
- size_t dump_buf_bytes;
- size_t clk_cnt_buf_bytes;
- size_t total_dump_buf_size;
-
- if (!md_narrow || !dump_bufs)
- return -EINVAL;
-
- dump_buf_bytes = md_narrow->dump_buf_bytes;
- clk_cnt_buf_bytes = sizeof(*dump_bufs->bufs->clk_cnt_buf) * md_narrow->metadata->clk_cnt;
-
- /* Allocate memory for the dump buffer struct array */
- buffers = kmalloc_array(n, sizeof(*buffers), GFP_KERNEL);
- if (!buffers)
- return -ENOMEM;
-
- /* Allocate pages for the actual dump buffers, as they tend to be fairly
- * large.
- */
- order = get_order((dump_buf_bytes + clk_cnt_buf_bytes) * n);
- addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-
- if (!addr) {
- kfree(buffers);
- return -ENOMEM;
- }
-
- *dump_bufs = (struct kbase_hwcnt_dump_buffer_narrow_array){
- .page_addr = addr,
- .page_order = order,
- .buf_cnt = n,
- .bufs = buffers,
- };
-
- total_dump_buf_size = dump_buf_bytes * n;
- /* Set the buffer of each dump buf */
- for (buf_idx = 0; buf_idx < n; buf_idx++) {
- const size_t dump_buf_offset = dump_buf_bytes * buf_idx;
- const size_t clk_cnt_buf_offset =
- total_dump_buf_size + (clk_cnt_buf_bytes * buf_idx);
-
- buffers[buf_idx] = (struct kbase_hwcnt_dump_buffer_narrow){
- .md_narrow = md_narrow,
- .dump_buf = (u32 *)(addr + dump_buf_offset),
- .clk_cnt_buf = (u64 *)(addr + clk_cnt_buf_offset),
- };
- }
-
- return 0;
-}
-
-void kbase_hwcnt_dump_buffer_narrow_array_free(
- struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs)
-{
- if (!dump_bufs)
- return;
-
- kfree(dump_bufs->bufs);
- free_pages(dump_bufs->page_addr, dump_bufs->page_order);
- memset(dump_bufs, 0, sizeof(*dump_bufs));
-}
-
-void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, const u64 *src_blk,
- const u64 *blk_em, size_t val_cnt)
-{
- size_t val;
-
- for (val = 0; val < val_cnt; val++) {
- bool val_enabled = kbase_hwcnt_enable_map_block_value_enabled(blk_em, val);
- u32 src_val = (src_blk[val] > U32_MAX) ? U32_MAX : (u32)src_blk[val];
-
- dst_blk[val] = val_enabled ? src_val : 0;
- }
-}
-
-void kbase_hwcnt_dump_buffer_copy_strict_narrow(struct kbase_hwcnt_dump_buffer_narrow *dst_narrow,
- const struct kbase_hwcnt_dump_buffer *src,
- const struct kbase_hwcnt_enable_map *dst_enable_map)
-{
- const struct kbase_hwcnt_metadata_narrow *metadata_narrow;
- size_t grp;
- size_t clk;
-
- if (WARN_ON(!dst_narrow) || WARN_ON(!src) || WARN_ON(!dst_enable_map) ||
- WARN_ON(dst_narrow->md_narrow->metadata == src->metadata) ||
- WARN_ON(dst_narrow->md_narrow->metadata->grp_cnt != src->metadata->grp_cnt) ||
- WARN_ON(src->metadata->grp_cnt != 1) ||
- WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_cnt !=
- src->metadata->grp_metadata[0].blk_cnt) ||
- WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_cnt !=
- KBASE_HWCNT_V5_BLOCK_TYPE_COUNT) ||
- WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt >
- src->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt))
- return;
-
- /* Don't use src metadata since src buffer is bigger than dst buffer. */
- metadata_narrow = dst_narrow->md_narrow;
-
- for (grp = 0; grp < kbase_hwcnt_metadata_narrow_group_count(metadata_narrow); grp++) {
- size_t blk;
- size_t blk_cnt = kbase_hwcnt_metadata_narrow_block_count(metadata_narrow, grp);
-
- for (blk = 0; blk < blk_cnt; blk++) {
- size_t blk_inst;
- size_t blk_inst_cnt = kbase_hwcnt_metadata_narrow_block_instance_count(
- metadata_narrow, grp, blk);
-
- for (blk_inst = 0; blk_inst < blk_inst_cnt; blk_inst++) {
- /* The narrowed down buffer is only 32-bit. */
- u32 *dst_blk = kbase_hwcnt_dump_buffer_narrow_block_instance(
- dst_narrow, grp, blk, blk_inst);
- const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance(
- src, grp, blk, blk_inst);
- const u64 *blk_em = kbase_hwcnt_enable_map_block_instance(
- dst_enable_map, grp, blk, blk_inst);
- size_t val_cnt = kbase_hwcnt_metadata_narrow_block_values_count(
- metadata_narrow, grp, blk);
- /* Align upwards to include padding bytes */
- val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(
- val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT /
- KBASE_HWCNT_VALUE_BYTES));
-
- kbase_hwcnt_dump_buffer_block_copy_strict_narrow(dst_blk, src_blk,
- blk_em, val_cnt);
- }
- }
- }
-
- for (clk = 0; clk < metadata_narrow->metadata->clk_cnt; clk++) {
- bool clk_enabled =
- kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk);
-
- dst_narrow->clk_cnt_buf[clk] = clk_enabled ? src->clk_cnt_buf[clk] : 0;
- }
-}
diff --git a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu_narrow.h b/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu_narrow.h
deleted file mode 100644
index afd236d..0000000
--- a/mali_kbase/hwcnt/mali_kbase_hwcnt_gpu_narrow.h
+++ /dev/null
@@ -1,330 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- *
- * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved.
- *
- * This program is free software and is provided to you under the terms of the
- * GNU General Public License version 2 as published by the Free Software
- * Foundation, and any use by you of this program is subject to the terms
- * of such GNU license.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- */
-
-#ifndef _KBASE_HWCNT_GPU_NARROW_H_
-#define _KBASE_HWCNT_GPU_NARROW_H_
-
-#include "hwcnt/mali_kbase_hwcnt_types.h"
-#include <linux/types.h>
-
-struct kbase_device;
-struct kbase_hwcnt_metadata;
-struct kbase_hwcnt_enable_map;
-struct kbase_hwcnt_dump_buffer;
-
-/**
- * struct kbase_hwcnt_metadata_narrow - Narrow metadata describing the physical
- * layout of narrow dump buffers.
- * For backward compatibility, the narrow
- * metadata only supports 64 counters per
- * block and 32-bit per block entry.
- * @metadata: Non-NULL pointer to the metadata before narrow down to
- * 32-bit per block entry, it has 64 counters per block and
- * 64-bit per value.
- * @dump_buf_bytes: The size in bytes after narrow 64-bit to 32-bit per block
- * entry.
- */
-struct kbase_hwcnt_metadata_narrow {
- const struct kbase_hwcnt_metadata *metadata;
- size_t dump_buf_bytes;
-};
-
-/**
- * struct kbase_hwcnt_dump_buffer_narrow - Hardware counter narrow dump buffer.
- * @md_narrow: Non-NULL pointer to narrow metadata used to identify, and to
- * describe the layout of the narrow dump buffer.
- * @dump_buf: Non-NULL pointer to an array of u32 values, the array size
- * is md_narrow->dump_buf_bytes.
- * @clk_cnt_buf: A pointer to an array of u64 values for cycle count elapsed
- * for each clock domain.
- */
-struct kbase_hwcnt_dump_buffer_narrow {
- const struct kbase_hwcnt_metadata_narrow *md_narrow;
- u32 *dump_buf;
- u64 *clk_cnt_buf;
-};
-
-/**
- * struct kbase_hwcnt_dump_buffer_narrow_array - Hardware counter narrow dump
- * buffer array.
- * @page_addr: Address of first allocated page. A single allocation is used for
- * all narrow dump buffers in the array.
- * @page_order: The allocation order of the pages, the order is on a logarithmic
- * scale.
- * @buf_cnt: The number of allocated dump buffers.
- * @bufs: Non-NULL pointer to the array of narrow dump buffer descriptors.
- */
-struct kbase_hwcnt_dump_buffer_narrow_array {
- unsigned long page_addr;
- unsigned int page_order;
- size_t buf_cnt;
- struct kbase_hwcnt_dump_buffer_narrow *bufs;
-};
-
-/**
- * kbase_hwcnt_metadata_narrow_group_count() - Get the number of groups from
- * narrow metadata.
- * @md_narrow: Non-NULL pointer to narrow metadata.
- *
- * Return: Number of hardware counter groups described by narrow metadata.
- */
-static inline size_t
-kbase_hwcnt_metadata_narrow_group_count(const struct kbase_hwcnt_metadata_narrow *md_narrow)
-{
- return kbase_hwcnt_metadata_group_count(md_narrow->metadata);
-}
-
-/**
- * kbase_hwcnt_metadata_narrow_group_type() - Get the arbitrary type of a group
- * from narrow metadata.
- * @md_narrow: Non-NULL pointer to narrow metadata.
- * @grp: Index of the group in the narrow metadata.
- *
- * Return: Type of the group grp.
- */
-static inline u64
-kbase_hwcnt_metadata_narrow_group_type(const struct kbase_hwcnt_metadata_narrow *md_narrow,
- size_t grp)
-{
- return kbase_hwcnt_metadata_group_type(md_narrow->metadata, grp);
-}
-
-/**
- * kbase_hwcnt_metadata_narrow_block_count() - Get the number of blocks in a
- * group from narrow metadata.
- * @md_narrow: Non-NULL pointer to narrow metadata.
- * @grp: Index of the group in the narrow metadata.
- *
- * Return: Number of blocks in group grp.
- */
-static inline size_t
-kbase_hwcnt_metadata_narrow_block_count(const struct kbase_hwcnt_metadata_narrow *md_narrow,
- size_t grp)
-{
- return kbase_hwcnt_metadata_block_count(md_narrow->metadata, grp);
-}
-
-/**
- * kbase_hwcnt_metadata_narrow_block_instance_count() - Get the number of
- * instances of a block
- * from narrow metadata.
- * @md_narrow: Non-NULL pointer to narrow metadata.
- * @grp: Index of the group in the narrow metadata.
- * @blk: Index of the block in the group.
- *
- * Return: Number of instances of block blk in group grp.
- */
-static inline size_t kbase_hwcnt_metadata_narrow_block_instance_count(
- const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, size_t blk)
-{
- return kbase_hwcnt_metadata_block_instance_count(md_narrow->metadata, grp, blk);
-}
-
-/**
- * kbase_hwcnt_metadata_narrow_block_headers_count() - Get the number of counter
- * headers from narrow
- * metadata.
- * @md_narrow: Non-NULL pointer to narrow metadata.
- * @grp: Index of the group in the narrow metadata.
- * @blk: Index of the block in the group.
- *
- * Return: Number of counter headers in each instance of block blk in group grp.
- */
-static inline size_t
-kbase_hwcnt_metadata_narrow_block_headers_count(const struct kbase_hwcnt_metadata_narrow *md_narrow,
- size_t grp, size_t blk)
-{
- return kbase_hwcnt_metadata_block_headers_count(md_narrow->metadata, grp, blk);
-}
-
-/**
- * kbase_hwcnt_metadata_narrow_block_counters_count() - Get the number of
- * counters from narrow
- * metadata.
- * @md_narrow: Non-NULL pointer to narrow metadata.
- * @grp: Index of the group in the narrow metadata.
- * @blk: Index of the block in the group.
- *
- * Return: Number of counters in each instance of block blk in group grp.
- */
-static inline size_t kbase_hwcnt_metadata_narrow_block_counters_count(
- const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, size_t blk)
-{
- return kbase_hwcnt_metadata_block_counters_count(md_narrow->metadata, grp, blk);
-}
-
-/**
- * kbase_hwcnt_metadata_narrow_block_values_count() - Get the number of values
- * from narrow metadata.
- * @md_narrow: Non-NULL pointer to narrow metadata.
- * @grp: Index of the group in the narrow metadata.
- * @blk: Index of the block in the group.
- *
- * Return: Number of headers plus counters in each instance of block blk
- * in group grp.
- */
-static inline size_t
-kbase_hwcnt_metadata_narrow_block_values_count(const struct kbase_hwcnt_metadata_narrow *md_narrow,
- size_t grp, size_t blk)
-{
- return kbase_hwcnt_metadata_narrow_block_counters_count(md_narrow, grp, blk) +
- kbase_hwcnt_metadata_narrow_block_headers_count(md_narrow, grp, blk);
-}
-
-/**
- * kbase_hwcnt_dump_buffer_narrow_block_instance() - Get the pointer to a
- * narrowed block instance's
- * dump buffer.
- * @buf: Non-NULL pointer to narrow dump buffer.
- * @grp: Index of the group in the narrow metadata.
- * @blk: Index of the block in the group.
- * @blk_inst: Index of the block instance in the block.
- *
- * Return: u32* to the dump buffer for the block instance.
- */
-static inline u32 *
-kbase_hwcnt_dump_buffer_narrow_block_instance(const struct kbase_hwcnt_dump_buffer_narrow *buf,
- size_t grp, size_t blk, size_t blk_inst)
-{
- return buf->dump_buf + buf->md_narrow->metadata->grp_metadata[grp].dump_buf_index +
- buf->md_narrow->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_index +
- (buf->md_narrow->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_stride *
- blk_inst);
-}
-
-/**
- * kbase_hwcnt_gpu_metadata_narrow_create() - Create HWC metadata with HWC
- * entries per block truncated to
- * 64 entries and block entry size
- * narrowed down to 32-bit.
- *
- * @dst_md_narrow: Non-NULL pointer to where created narrow metadata is stored
- * on success.
- * @src_md: Non-NULL pointer to the HWC metadata used as the source to
- * create dst_md_narrow.
- *
- * For backward compatibility of the interface to user clients, a new metadata
- * with entries per block truncated to 64 and block entry size narrowed down
- * to 32-bit will be created for dst_md_narrow.
- * The total entries per block in src_md must be 64 or 128, if it's other
- * values, function returns error since it's not supported.
- *
- * Return: 0 on success, else error code.
- */
-int kbase_hwcnt_gpu_metadata_narrow_create(const struct kbase_hwcnt_metadata_narrow **dst_md_narrow,
- const struct kbase_hwcnt_metadata *src_md);
-
-/**
- * kbase_hwcnt_gpu_metadata_narrow_destroy() - Destroy a hardware counter narrow
- * metadata object.
- * @md_narrow: Pointer to hardware counter narrow metadata.
- */
-void kbase_hwcnt_gpu_metadata_narrow_destroy(const struct kbase_hwcnt_metadata_narrow *md_narrow);
-
-/**
- * kbase_hwcnt_dump_buffer_narrow_alloc() - Allocate a narrow dump buffer.
- * @md_narrow: Non-NULL pointer to narrow metadata.
- * @dump_buf: Non-NULL pointer to narrow dump buffer to be initialised. Will be
- * initialised to undefined values, so must be used as a copy
- * destination, or cleared before use.
- *
- * Return: 0 on success, else error code.
- */
-int kbase_hwcnt_dump_buffer_narrow_alloc(const struct kbase_hwcnt_metadata_narrow *md_narrow,
- struct kbase_hwcnt_dump_buffer_narrow *dump_buf);
-
-/**
- * kbase_hwcnt_dump_buffer_narrow_free() - Free a narrow dump buffer.
- * @dump_buf: Dump buffer to be freed.
- *
- * Can be safely called on an all-zeroed narrow dump buffer structure, or on an
- * already freed narrow dump buffer.
- */
-void kbase_hwcnt_dump_buffer_narrow_free(struct kbase_hwcnt_dump_buffer_narrow *dump_buf);
-
-/**
- * kbase_hwcnt_dump_buffer_narrow_array_alloc() - Allocate an array of narrow
- * dump buffers.
- * @md_narrow: Non-NULL pointer to narrow metadata.
- * @n: Number of narrow dump buffers to allocate
- * @dump_bufs: Non-NULL pointer to a kbase_hwcnt_dump_buffer_narrow_array
- * object to be initialised.
- *
- * A single zeroed contiguous page allocation will be used for all of the
- * buffers inside the object, where:
- * dump_bufs->bufs[n].dump_buf == page_addr + n * md_narrow.dump_buf_bytes
- *
- * Return: 0 on success, else error code.
- */
-int kbase_hwcnt_dump_buffer_narrow_array_alloc(
- const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t n,
- struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs);
-
-/**
- * kbase_hwcnt_dump_buffer_narrow_array_free() - Free a narrow dump buffer
- * array.
- * @dump_bufs: Narrow Dump buffer array to be freed.
- *
- * Can be safely called on an all-zeroed narrow dump buffer array structure, or
- * on an already freed narrow dump buffer array.
- */
-void kbase_hwcnt_dump_buffer_narrow_array_free(
- struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs);
-
-/**
- * kbase_hwcnt_dump_buffer_block_copy_strict_narrow() - Copy all enabled block
- * values from source to
- * destination.
- * @dst_blk: Non-NULL pointer to destination block obtained from a call to
- * kbase_hwcnt_dump_buffer_narrow_block_instance.
- * @src_blk: Non-NULL pointer to source block obtained from a call to
- * kbase_hwcnt_dump_buffer_block_instance.
- * @blk_em: Non-NULL pointer to the block bitfield(s) obtained from a call to
- * kbase_hwcnt_enable_map_block_instance.
- * @val_cnt: Number of values in the block.
- *
- * After the copy, any disabled values in destination will be zero, the enabled
- * values in destination will be saturated at U32_MAX if the corresponding
- * source value is bigger than U32_MAX, or copy the value from source if the
- * corresponding source value is less than or equal to U32_MAX.
- */
-void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, const u64 *src_blk,
- const u64 *blk_em, size_t val_cnt);
-
-/**
- * kbase_hwcnt_dump_buffer_copy_strict_narrow() - Copy all enabled values to a
- * narrow dump buffer.
- * @dst_narrow: Non-NULL pointer to destination dump buffer.
- * @src: Non-NULL pointer to source dump buffer.
- * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values.
- *
- * After the operation, all non-enabled values (including padding bytes) will be
- * zero. Slower than the non-strict variant.
- *
- * The enabled values in dst_narrow will be saturated at U32_MAX if the
- * corresponding source value is bigger than U32_MAX, or copy the value from
- * source if the corresponding source value is less than or equal to U32_MAX.
- */
-void kbase_hwcnt_dump_buffer_copy_strict_narrow(struct kbase_hwcnt_dump_buffer_narrow *dst_narrow,
- const struct kbase_hwcnt_dump_buffer *src,
- const struct kbase_hwcnt_enable_map *dst_enable_map);
-
-#endif /* _KBASE_HWCNT_GPU_NARROW_H_ */
diff --git a/mali_kbase/hwcnt/mali_kbase_hwcnt_types.c b/mali_kbase/hwcnt/mali_kbase_hwcnt_types.c
index 763eb31..3d0ad5a 100644
--- a/mali_kbase/hwcnt/mali_kbase_hwcnt_types.c
+++ b/mali_kbase/hwcnt/mali_kbase_hwcnt_types.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
*
- * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -27,15 +27,15 @@ int kbase_hwcnt_metadata_create(const struct kbase_hwcnt_description *desc,
const struct kbase_hwcnt_metadata **out_metadata)
{
char *buf;
+ size_t blk;
struct kbase_hwcnt_metadata *metadata;
- struct kbase_hwcnt_group_metadata *grp_mds;
- size_t grp;
- size_t enable_map_count; /* Number of u64 bitfields (inc padding) */
- size_t dump_buf_count; /* Number of u64 values (inc padding) */
- size_t avail_mask_bits; /* Number of availability mask bits */
-
- size_t size;
+ struct kbase_hwcnt_block_metadata *blk_mds;
+ size_t enable_map_count = 0; /* Number of u64 bitfields (inc padding) */
+ size_t dump_buf_count = 0; /* Number of u64 values (inc padding) */
+ size_t avail_mask_bits = 0;
+ size_t state_count = 0;
size_t offset;
+ size_t size;
if (!desc || !out_metadata)
return -EINVAL;
@@ -50,13 +50,8 @@ int kbase_hwcnt_metadata_create(const struct kbase_hwcnt_description *desc,
size = 0;
size += sizeof(struct kbase_hwcnt_metadata);
- /* Group metadata */
- size += sizeof(struct kbase_hwcnt_group_metadata) * desc->grp_cnt;
-
/* Block metadata */
- for (grp = 0; grp < desc->grp_cnt; grp++) {
- size += sizeof(struct kbase_hwcnt_block_metadata) * desc->grps[grp].blk_cnt;
- }
+ size += sizeof(struct kbase_hwcnt_block_metadata) * desc->blk_cnt;
/* Single allocation for the entire metadata */
buf = kmalloc(size, GFP_KERNEL);
@@ -70,79 +65,59 @@ int kbase_hwcnt_metadata_create(const struct kbase_hwcnt_description *desc,
metadata = (struct kbase_hwcnt_metadata *)(buf + offset);
offset += sizeof(struct kbase_hwcnt_metadata);
- /* Bump allocate the group metadata */
- grp_mds = (struct kbase_hwcnt_group_metadata *)(buf + offset);
- offset += sizeof(struct kbase_hwcnt_group_metadata) * desc->grp_cnt;
-
- enable_map_count = 0;
- dump_buf_count = 0;
- avail_mask_bits = 0;
-
- for (grp = 0; grp < desc->grp_cnt; grp++) {
- size_t blk;
-
- const struct kbase_hwcnt_group_description *grp_desc = desc->grps + grp;
- struct kbase_hwcnt_group_metadata *grp_md = grp_mds + grp;
-
- size_t group_enable_map_count = 0;
- size_t group_dump_buffer_count = 0;
- size_t group_avail_mask_bits = 0;
-
- /* Bump allocate this group's block metadata */
- struct kbase_hwcnt_block_metadata *blk_mds =
- (struct kbase_hwcnt_block_metadata *)(buf + offset);
- offset += sizeof(struct kbase_hwcnt_block_metadata) * grp_desc->blk_cnt;
-
- /* Fill in each block in the group's information */
- for (blk = 0; blk < grp_desc->blk_cnt; blk++) {
- const struct kbase_hwcnt_block_description *blk_desc = grp_desc->blks + blk;
- struct kbase_hwcnt_block_metadata *blk_md = blk_mds + blk;
- const size_t n_values = blk_desc->hdr_cnt + blk_desc->ctr_cnt;
-
- blk_md->type = blk_desc->type;
- blk_md->inst_cnt = blk_desc->inst_cnt;
- blk_md->hdr_cnt = blk_desc->hdr_cnt;
- blk_md->ctr_cnt = blk_desc->ctr_cnt;
- blk_md->enable_map_index = group_enable_map_count;
- blk_md->enable_map_stride = kbase_hwcnt_bitfield_count(n_values);
- blk_md->dump_buf_index = group_dump_buffer_count;
- blk_md->dump_buf_stride = KBASE_HWCNT_ALIGN_UPWARDS(
+ /* Bump allocate the block metadata */
+ blk_mds = (struct kbase_hwcnt_block_metadata *)(buf + offset);
+ offset += sizeof(struct kbase_hwcnt_block_metadata) * desc->blk_cnt;
+
+ /* Fill in each block */
+ for (blk = 0; blk < desc->blk_cnt; blk++) {
+ const struct kbase_hwcnt_block_description *blk_desc = desc->blks + blk;
+ struct kbase_hwcnt_block_metadata *blk_md = blk_mds + blk;
+ const size_t n_values = blk_desc->hdr_cnt + blk_desc->ctr_cnt;
+
+ *blk_md = (struct kbase_hwcnt_block_metadata){
+ .type = blk_desc->type,
+ .inst_cnt = blk_desc->inst_cnt,
+ .hdr_cnt = blk_desc->hdr_cnt,
+ .ctr_cnt = blk_desc->ctr_cnt,
+ .enable_map_index = enable_map_count,
+ .enable_map_stride = kbase_hwcnt_bitfield_count(n_values),
+ .dump_buf_index = dump_buf_count,
+ .dump_buf_stride = KBASE_HWCNT_ALIGN_UPWARDS(
n_values,
- (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES));
- blk_md->avail_mask_index = group_avail_mask_bits;
-
- group_enable_map_count += blk_md->enable_map_stride * blk_md->inst_cnt;
- group_dump_buffer_count += blk_md->dump_buf_stride * blk_md->inst_cnt;
- group_avail_mask_bits += blk_md->inst_cnt;
- }
-
- /* Fill in the group's information */
- grp_md->type = grp_desc->type;
- grp_md->blk_cnt = grp_desc->blk_cnt;
- grp_md->blk_metadata = blk_mds;
- grp_md->enable_map_index = enable_map_count;
- grp_md->dump_buf_index = dump_buf_count;
- grp_md->avail_mask_index = avail_mask_bits;
-
- enable_map_count += group_enable_map_count;
- dump_buf_count += group_dump_buffer_count;
- avail_mask_bits += group_avail_mask_bits;
+ (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES)),
+ .avail_mask_index = avail_mask_bits,
+ .blk_stt_index = state_count,
+ .blk_stt_stride = KBASE_HWCNT_BLOCK_STATE_STRIDE,
+ };
+
+ enable_map_count += blk_md->enable_map_stride * blk_md->inst_cnt;
+ dump_buf_count += blk_md->dump_buf_stride * blk_md->inst_cnt;
+ avail_mask_bits += blk_md->inst_cnt;
+ state_count += blk_md->inst_cnt * blk_md->blk_stt_stride;
}
/* Fill in the top level metadata's information */
- metadata->grp_cnt = desc->grp_cnt;
- metadata->grp_metadata = grp_mds;
- metadata->enable_map_bytes = enable_map_count * KBASE_HWCNT_BITFIELD_BYTES;
- metadata->dump_buf_bytes = dump_buf_count * KBASE_HWCNT_VALUE_BYTES;
- metadata->avail_mask = desc->avail_mask;
- metadata->clk_cnt = desc->clk_cnt;
-
- WARN_ON(size != offset);
+ *metadata = (struct kbase_hwcnt_metadata){
+ .blk_cnt = desc->blk_cnt,
+ .blk_metadata = blk_mds,
+ .enable_map_bytes = enable_map_count * KBASE_HWCNT_BITFIELD_BYTES,
+ .dump_buf_bytes = dump_buf_count * KBASE_HWCNT_VALUE_BYTES,
+ .blk_stt_bytes = state_count * KBASE_HWCNT_BLOCK_STATE_BYTES,
+ .clk_cnt = desc->clk_cnt,
+ };
+
+ kbase_hwcnt_cp_avail_mask(&metadata->avail_mask, &desc->avail_mask);
+
+ if (WARN_ON(size != offset))
+ return -EINVAL;
+
/* Due to the block alignment, there should be exactly one enable map
* bit per 4 bytes in the dump buffer.
*/
- WARN_ON(metadata->dump_buf_bytes !=
- (metadata->enable_map_bytes * BITS_PER_BYTE * KBASE_HWCNT_VALUE_BYTES));
+ if (WARN_ON(metadata->dump_buf_bytes !=
+ (metadata->enable_map_bytes * BITS_PER_BYTE * KBASE_HWCNT_VALUE_BYTES)))
+ return -EINVAL;
*out_metadata = metadata;
return 0;
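
To illustrate the new flat, group-free description layout consumed above, a hedged sketch of how a caller could build metadata for two block types. The block types, counts and clock count are placeholders (the types come from the GPU layer), and the availability mask is omitted for brevity:

	struct kbase_hwcnt_block_description blks[] = {
		{ .type = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE, .inst_cnt = 1,
		  .hdr_cnt = 4, .ctr_cnt = 60 },
		{ .type = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC, .inst_cnt = 4,
		  .hdr_cnt = 4, .ctr_cnt = 60 },
	};
	struct kbase_hwcnt_description desc = {
		.blk_cnt = ARRAY_SIZE(blks),	/* blocks now sit directly in the description */
		.blks = blks,
		.clk_cnt = 1,
	};
	const struct kbase_hwcnt_metadata *md;
	int err = kbase_hwcnt_metadata_create(&desc, &md);
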
@@ -189,6 +164,7 @@ int kbase_hwcnt_dump_buffer_alloc(const struct kbase_hwcnt_metadata *metadata,
{
size_t dump_buf_bytes;
size_t clk_cnt_buf_bytes;
+ size_t block_state_bytes;
u8 *buf;
if (!metadata || !dump_buf)
@@ -196,15 +172,17 @@ int kbase_hwcnt_dump_buffer_alloc(const struct kbase_hwcnt_metadata *metadata,
dump_buf_bytes = metadata->dump_buf_bytes;
clk_cnt_buf_bytes = sizeof(*dump_buf->clk_cnt_buf) * metadata->clk_cnt;
+ block_state_bytes = metadata->blk_stt_bytes;
- /* Make a single allocation for both dump_buf and clk_cnt_buf. */
- buf = kmalloc(dump_buf_bytes + clk_cnt_buf_bytes, GFP_KERNEL);
+ /* Make a single allocation for dump_buf, clk_cnt_buf and block_state_buf. */
+ buf = kzalloc(dump_buf_bytes + clk_cnt_buf_bytes + block_state_bytes, GFP_KERNEL);
if (!buf)
return -ENOMEM;
dump_buf->metadata = metadata;
dump_buf->dump_buf = (u64 *)buf;
dump_buf->clk_cnt_buf = (u64 *)(buf + dump_buf_bytes);
+ dump_buf->blk_stt_buf = (blk_stt_t *)(buf + dump_buf_bytes + clk_cnt_buf_bytes);
return 0;
}
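
A short usage sketch: after this change a single zeroed allocation backs the counter values, the per-clock cycle counts and the new per-block state array, and all three are released together by kbase_hwcnt_dump_buffer_free().

	struct kbase_hwcnt_dump_buffer buf;
	int err = kbase_hwcnt_dump_buffer_alloc(metadata, &buf);

	if (!err) {
		/* buf.dump_buf, buf.clk_cnt_buf and buf.blk_stt_buf all point
		 * into the same kzalloc'd region and start out zeroed.
		 */
		kbase_hwcnt_dump_buffer_free(&buf);
	}
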
@@ -218,72 +196,11 @@ void kbase_hwcnt_dump_buffer_free(struct kbase_hwcnt_dump_buffer *dump_buf)
memset(dump_buf, 0, sizeof(*dump_buf));
}
-int kbase_hwcnt_dump_buffer_array_alloc(const struct kbase_hwcnt_metadata *metadata, size_t n,
- struct kbase_hwcnt_dump_buffer_array *dump_bufs)
-{
- struct kbase_hwcnt_dump_buffer *buffers;
- size_t buf_idx;
- unsigned int order;
- unsigned long addr;
- size_t dump_buf_bytes;
- size_t clk_cnt_buf_bytes;
-
- if (!metadata || !dump_bufs)
- return -EINVAL;
-
- dump_buf_bytes = metadata->dump_buf_bytes;
- clk_cnt_buf_bytes = sizeof(*dump_bufs->bufs->clk_cnt_buf) * metadata->clk_cnt;
-
- /* Allocate memory for the dump buffer struct array */
- buffers = kmalloc_array(n, sizeof(*buffers), GFP_KERNEL);
- if (!buffers)
- return -ENOMEM;
-
- /* Allocate pages for the actual dump buffers, as they tend to be fairly
- * large.
- */
- order = get_order((dump_buf_bytes + clk_cnt_buf_bytes) * n);
- addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-
- if (!addr) {
- kfree(buffers);
- return -ENOMEM;
- }
-
- dump_bufs->page_addr = addr;
- dump_bufs->page_order = order;
- dump_bufs->buf_cnt = n;
- dump_bufs->bufs = buffers;
-
- /* Set the buffer of each dump buf */
- for (buf_idx = 0; buf_idx < n; buf_idx++) {
- const size_t dump_buf_offset = dump_buf_bytes * buf_idx;
- const size_t clk_cnt_buf_offset =
- (dump_buf_bytes * n) + (clk_cnt_buf_bytes * buf_idx);
-
- buffers[buf_idx].metadata = metadata;
- buffers[buf_idx].dump_buf = (u64 *)(addr + dump_buf_offset);
- buffers[buf_idx].clk_cnt_buf = (u64 *)(addr + clk_cnt_buf_offset);
- }
-
- return 0;
-}
-
-void kbase_hwcnt_dump_buffer_array_free(struct kbase_hwcnt_dump_buffer_array *dump_bufs)
-{
- if (!dump_bufs)
- return;
-
- kfree(dump_bufs->bufs);
- free_pages(dump_bufs->page_addr, dump_bufs->page_order);
- memset(dump_bufs, 0, sizeof(*dump_bufs));
-}
-
void kbase_hwcnt_dump_buffer_zero(struct kbase_hwcnt_dump_buffer *dst,
const struct kbase_hwcnt_enable_map *dst_enable_map)
{
const struct kbase_hwcnt_metadata *metadata;
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
if (WARN_ON(!dst) || WARN_ON(!dst_enable_map) ||
WARN_ON(dst->metadata != dst_enable_map->metadata))
@@ -291,21 +208,21 @@ void kbase_hwcnt_dump_buffer_zero(struct kbase_hwcnt_dump_buffer *dst,
metadata = dst->metadata;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
u64 *dst_blk;
size_t val_cnt;
- if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst))
+ if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst))
continue;
- dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
- val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk);
+ dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, blk, blk_inst);
+ val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, blk);
kbase_hwcnt_dump_buffer_block_zero(dst_blk, val_cnt);
}
memset(dst->clk_cnt_buf, 0, sizeof(*dst->clk_cnt_buf) * metadata->clk_cnt);
+ memset(dst->blk_stt_buf, 0, metadata->blk_stt_bytes);
}
void kbase_hwcnt_dump_buffer_zero_strict(struct kbase_hwcnt_dump_buffer *dst)
@@ -314,15 +231,15 @@ void kbase_hwcnt_dump_buffer_zero_strict(struct kbase_hwcnt_dump_buffer *dst)
return;
memset(dst->dump_buf, 0, dst->metadata->dump_buf_bytes);
-
memset(dst->clk_cnt_buf, 0, sizeof(*dst->clk_cnt_buf) * dst->metadata->clk_cnt);
+ memset(dst->blk_stt_buf, 0, dst->metadata->blk_stt_bytes);
}
void kbase_hwcnt_dump_buffer_zero_non_enabled(struct kbase_hwcnt_dump_buffer *dst,
const struct kbase_hwcnt_enable_map *dst_enable_map)
{
const struct kbase_hwcnt_metadata *metadata;
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
if (WARN_ON(!dst) || WARN_ON(!dst_enable_map) ||
WARN_ON(dst->metadata != dst_enable_map->metadata))
@@ -330,23 +247,29 @@ void kbase_hwcnt_dump_buffer_zero_non_enabled(struct kbase_hwcnt_dump_buffer *ds
metadata = dst->metadata;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
- u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
+ u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, blk, blk_inst);
+ blk_stt_t *dst_blk_stt =
+ kbase_hwcnt_dump_buffer_block_state_instance(dst, blk, blk_inst);
const u64 *blk_em =
- kbase_hwcnt_enable_map_block_instance(dst_enable_map, grp, blk, blk_inst);
- size_t val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk);
+ kbase_hwcnt_enable_map_block_instance(dst_enable_map, blk, blk_inst);
+ size_t val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, blk);
/* Align upwards to include padding bytes */
val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(
val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES));
- if (kbase_hwcnt_metadata_block_instance_avail(metadata, grp, blk, blk_inst)) {
+ if (kbase_hwcnt_metadata_block_instance_avail(metadata, blk, blk_inst)) {
/* Block available, so only zero non-enabled values */
kbase_hwcnt_dump_buffer_block_zero_non_enabled(dst_blk, blk_em, val_cnt);
+
+ if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst))
+ kbase_hwcnt_block_state_set(dst_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
+
} else {
/* Block not available, so zero the entire thing */
kbase_hwcnt_dump_buffer_block_zero(dst_blk, val_cnt);
+ kbase_hwcnt_block_state_set(dst_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
}
}
}
@@ -356,7 +279,7 @@ void kbase_hwcnt_dump_buffer_copy(struct kbase_hwcnt_dump_buffer *dst,
const struct kbase_hwcnt_enable_map *dst_enable_map)
{
const struct kbase_hwcnt_metadata *metadata;
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
size_t clk;
if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) ||
@@ -366,24 +289,27 @@ void kbase_hwcnt_dump_buffer_copy(struct kbase_hwcnt_dump_buffer *dst,
metadata = dst->metadata;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
u64 *dst_blk;
const u64 *src_blk;
+ blk_stt_t *dst_blk_stt;
+ const blk_stt_t *src_blk_stt;
size_t val_cnt;
- if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst))
+ if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst))
continue;
- dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
- src_blk = kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst);
- val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk);
+ dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, blk, blk_inst);
+ src_blk = kbase_hwcnt_dump_buffer_block_instance(src, blk, blk_inst);
+ val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, blk);
+ dst_blk_stt = kbase_hwcnt_dump_buffer_block_state_instance(dst, blk, blk_inst);
+ src_blk_stt = kbase_hwcnt_dump_buffer_block_state_instance(src, blk, blk_inst);
kbase_hwcnt_dump_buffer_block_copy(dst_blk, src_blk, val_cnt);
+ kbase_hwcnt_block_state_copy(dst_blk_stt, src_blk_stt);
}
- kbase_hwcnt_metadata_for_each_clock(metadata, clk)
- {
+ kbase_hwcnt_metadata_for_each_clock(metadata, clk) {
if (kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk))
dst->clk_cnt_buf[clk] = src->clk_cnt_buf[clk];
}
@@ -394,7 +320,7 @@ void kbase_hwcnt_dump_buffer_copy_strict(struct kbase_hwcnt_dump_buffer *dst,
const struct kbase_hwcnt_enable_map *dst_enable_map)
{
const struct kbase_hwcnt_metadata *metadata;
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
size_t clk;
if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) ||
@@ -404,23 +330,30 @@ void kbase_hwcnt_dump_buffer_copy_strict(struct kbase_hwcnt_dump_buffer *dst,
metadata = dst->metadata;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
- u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
- const u64 *src_blk =
- kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst);
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
+ u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, blk, blk_inst);
+ const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance(src, blk, blk_inst);
+ blk_stt_t *dst_blk_stt =
+ kbase_hwcnt_dump_buffer_block_state_instance(dst, blk, blk_inst);
+ const blk_stt_t *src_blk_stt =
+ kbase_hwcnt_dump_buffer_block_state_instance(src, blk, blk_inst);
const u64 *blk_em =
- kbase_hwcnt_enable_map_block_instance(dst_enable_map, grp, blk, blk_inst);
- size_t val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk);
+ kbase_hwcnt_enable_map_block_instance(dst_enable_map, blk, blk_inst);
+ size_t val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, blk);
+
/* Align upwards to include padding bytes */
val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(
val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES));
kbase_hwcnt_dump_buffer_block_copy_strict(dst_blk, src_blk, blk_em, val_cnt);
+
+ if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst))
+ kbase_hwcnt_block_state_copy(dst_blk_stt, src_blk_stt);
+ else
+ kbase_hwcnt_block_state_set(dst_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
}
- kbase_hwcnt_metadata_for_each_clock(metadata, clk)
- {
+ kbase_hwcnt_metadata_for_each_clock(metadata, clk) {
bool clk_enabled =
kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk);
@@ -433,7 +366,7 @@ void kbase_hwcnt_dump_buffer_accumulate(struct kbase_hwcnt_dump_buffer *dst,
const struct kbase_hwcnt_enable_map *dst_enable_map)
{
const struct kbase_hwcnt_metadata *metadata;
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
size_t clk;
if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) ||
@@ -443,26 +376,29 @@ void kbase_hwcnt_dump_buffer_accumulate(struct kbase_hwcnt_dump_buffer *dst,
metadata = dst->metadata;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
u64 *dst_blk;
const u64 *src_blk;
+ blk_stt_t *dst_blk_stt;
+ const blk_stt_t *src_blk_stt;
size_t hdr_cnt;
size_t ctr_cnt;
- if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst))
+ if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst))
continue;
- dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
- src_blk = kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst);
- hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk);
- ctr_cnt = kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk);
+ dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, blk, blk_inst);
+ src_blk = kbase_hwcnt_dump_buffer_block_instance(src, blk, blk_inst);
+ dst_blk_stt = kbase_hwcnt_dump_buffer_block_state_instance(dst, blk, blk_inst);
+ src_blk_stt = kbase_hwcnt_dump_buffer_block_state_instance(src, blk, blk_inst);
+ hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, blk);
+ ctr_cnt = kbase_hwcnt_metadata_block_counters_count(metadata, blk);
kbase_hwcnt_dump_buffer_block_accumulate(dst_blk, src_blk, hdr_cnt, ctr_cnt);
+ kbase_hwcnt_block_state_accumulate(dst_blk_stt, src_blk_stt);
}
- kbase_hwcnt_metadata_for_each_clock(metadata, clk)
- {
+ kbase_hwcnt_metadata_for_each_clock(metadata, clk) {
if (kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk))
dst->clk_cnt_buf[clk] += src->clk_cnt_buf[clk];
}
@@ -473,7 +409,7 @@ void kbase_hwcnt_dump_buffer_accumulate_strict(struct kbase_hwcnt_dump_buffer *d
const struct kbase_hwcnt_enable_map *dst_enable_map)
{
const struct kbase_hwcnt_metadata *metadata;
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
size_t clk;
if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) ||
@@ -483,15 +419,19 @@ void kbase_hwcnt_dump_buffer_accumulate_strict(struct kbase_hwcnt_dump_buffer *d
metadata = dst->metadata;
- kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst)
- {
- u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst);
- const u64 *src_blk =
- kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst);
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
+ u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, blk, blk_inst);
+ const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance(src, blk, blk_inst);
const u64 *blk_em =
- kbase_hwcnt_enable_map_block_instance(dst_enable_map, grp, blk, blk_inst);
- size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk);
- size_t ctr_cnt = kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk);
+ kbase_hwcnt_enable_map_block_instance(dst_enable_map, blk, blk_inst);
+ blk_stt_t *dst_blk_stt =
+ kbase_hwcnt_dump_buffer_block_state_instance(dst, blk, blk_inst);
+ const blk_stt_t *src_blk_stt =
+ kbase_hwcnt_dump_buffer_block_state_instance(src, blk, blk_inst);
+
+ size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, blk);
+ size_t ctr_cnt = kbase_hwcnt_metadata_block_counters_count(metadata, blk);
+
/* Align upwards to include padding bytes */
ctr_cnt = KBASE_HWCNT_ALIGN_UPWARDS(
hdr_cnt + ctr_cnt,
@@ -499,13 +439,41 @@ void kbase_hwcnt_dump_buffer_accumulate_strict(struct kbase_hwcnt_dump_buffer *d
kbase_hwcnt_dump_buffer_block_accumulate_strict(dst_blk, src_blk, blk_em, hdr_cnt,
ctr_cnt);
+
+ if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst))
+ kbase_hwcnt_block_state_accumulate(dst_blk_stt, src_blk_stt);
+ else
+ kbase_hwcnt_block_state_set(dst_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
}
- kbase_hwcnt_metadata_for_each_clock(metadata, clk)
- {
+ kbase_hwcnt_metadata_for_each_clock(metadata, clk) {
if (kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk))
dst->clk_cnt_buf[clk] += src->clk_cnt_buf[clk];
else
dst->clk_cnt_buf[clk] = 0;
}
}
+
+void kbase_hwcnt_dump_buffer_block_state_update(struct kbase_hwcnt_dump_buffer *dst,
+ const struct kbase_hwcnt_enable_map *dst_enable_map,
+ blk_stt_t blk_stt_val)
+{
+ const struct kbase_hwcnt_metadata *metadata;
+ size_t blk, blk_inst;
+
+ if (WARN_ON(!dst) || WARN_ON(!dst_enable_map) ||
+ WARN_ON(dst->metadata != dst_enable_map->metadata))
+ return;
+
+ metadata = dst->metadata;
+
+ kbase_hwcnt_metadata_for_each_block(metadata, blk, blk_inst) {
+ if (kbase_hwcnt_metadata_block_instance_avail(metadata, blk, blk_inst) &&
+ kbase_hwcnt_enable_map_block_enabled(dst_enable_map, blk, blk_inst)) {
+ blk_stt_t *dst_blk_stt =
+ kbase_hwcnt_dump_buffer_block_state_instance(dst, blk, blk_inst);
+
+ *dst_blk_stt |= blk_stt_val;
+ }
+ }
+}
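
A brief usage sketch for the new helper, for example from a backend that has just taken a sample while the block was powered and running outside protected mode; the flag combination is illustrative:

	kbase_hwcnt_dump_buffer_block_state_update(dst, dst_enable_map,
						   KBASE_HWCNT_STATE_ON |
						   KBASE_HWCNT_STATE_AVAILABLE |
						   KBASE_HWCNT_STATE_NORMAL);
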
diff --git a/mali_kbase/hwcnt/mali_kbase_hwcnt_types.h b/mali_kbase/hwcnt/mali_kbase_hwcnt_types.h
index 5c5ada4..c7afe17 100644
--- a/mali_kbase/hwcnt/mali_kbase_hwcnt_types.h
+++ b/mali_kbase/hwcnt/mali_kbase_hwcnt_types.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
*
- * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -34,12 +34,8 @@
* Terminology:
*
* Hardware Counter System:
- * A collection of hardware counter groups, making a full hardware counter
+ * A collection of hardware counter blocks, making a full hardware counter
* system.
- * Hardware Counter Group:
- * A group of Hardware Counter Blocks (e.g. a t62x might have more than one
- * core group, so has one counter group per core group, where each group
- * may have a different number and layout of counter blocks).
* Hardware Counter Block:
* A block of hardware counters (e.g. shader block, tiler block).
* Hardware Counter Block Instance:
@@ -59,10 +55,16 @@
*
* Enable Map:
* An array of u64 bitfields, where each bit either enables exactly one
- * block value, or is unused (padding).
+ * block value, or is unused (padding). Note that this is derived from
+ * the client configuration, and is not obtained from the hardware.
* Dump Buffer:
* An array of u64 values, where each u64 corresponds either to one block
* value, or is unused (padding).
+ * Block State Buffer:
+ * An array of blk_stt_t values, where each blk_stt_t corresponds to one block
+ *  instance and is used to track the on/off power state transitions, as well as
+ * hardware resource availability, and whether the block was operating
+ * in normal or protected mode.
* Availability Mask:
* A bitfield, where each bit corresponds to whether a block instance is
* physically available (e.g. an MP3 GPU may have a sparse core mask of
@@ -74,7 +76,6 @@
* Metadata:
* Structure describing the physical layout of the enable map and dump buffers
* for a specific hardware counter system.
- *
*/
#ifndef _KBASE_HWCNT_TYPES_H_
@@ -98,10 +99,14 @@
*/
#define KBASE_HWCNT_VALUE_BYTES (sizeof(u64))
+/* Number of elements in the avail_mask array in kbase_hwcnt_metadata */
+#define KBASE_HWCNT_AVAIL_MASK_ELEM_COUNT 2
+
/* Number of bits in an availability mask (i.e. max total number of block
* instances supported in a Hardware Counter System)
*/
-#define KBASE_HWCNT_AVAIL_MASK_BITS (sizeof(u64) * BITS_PER_BYTE)
+#define KBASE_HWCNT_AVAIL_MASK_BITS \
+ (sizeof(u64) * KBASE_HWCNT_AVAIL_MASK_ELEM_COUNT * BITS_PER_BYTE)
/* Minimum alignment of each block of hardware counters */
#define KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT (KBASE_HWCNT_BITFIELD_BITS * KBASE_HWCNT_VALUE_BYTES)
@@ -114,9 +119,60 @@
* Return: Input value if already aligned to the specified boundary, or next
* (incrementing upwards) aligned value.
*/
-#define KBASE_HWCNT_ALIGN_UPWARDS(value, alignment) \
+#define KBASE_HWCNT_ALIGN_UPWARDS(value, alignment) \
(value + ((alignment - (value % alignment)) % alignment))
+typedef u8 blk_stt_t;
+
+/* Number of bytes storing the per-block state transition information. */
+#define KBASE_HWCNT_BLOCK_STATE_BYTES (sizeof(blk_stt_t))
+
+/* Number of entries of blk_stt_t used to store the block state. */
+#define KBASE_HWCNT_BLOCK_STATE_STRIDE (1)
+
+/* Block state indicating that the hardware block state was indeterminable
+ * or not set during the sampling period.
+ */
+#define KBASE_HWCNT_STATE_UNKNOWN ((blk_stt_t)(0))
+
+/* Block state indicating that the hardware block was on or transitioned to on
+ * during the sampling period.
+ */
+#define KBASE_HWCNT_STATE_ON ((blk_stt_t)(1u << 0))
+
+/* Block state indicating that the hardware block was off or transitioned to off
+ * during the sampling period.
+ */
+#define KBASE_HWCNT_STATE_OFF ((blk_stt_t)(1u << 1))
+
+/* Block state indicating that the hardware block was available to the current
+ * VM for some portion of the sampling period.
+ */
+#define KBASE_HWCNT_STATE_AVAILABLE ((blk_stt_t)(1u << 2))
+
+/* Block state indicating that the hardware block was unavailable to the current
+ * VM for some portion of the sampling period.
+ */
+#define KBASE_HWCNT_STATE_UNAVAILABLE ((blk_stt_t)(1u << 3))
+
+/* Block state indicating that the hardware block was operating in normal mode
+ * for some portion of the sampling period.
+ */
+#define KBASE_HWCNT_STATE_NORMAL ((blk_stt_t)(1u << 4))
+
+/* Block state indicating that the hardware block was operating in protected mode
+ * for some portion of the sampling period.
+ */
+#define KBASE_HWCNT_STATE_PROTECTED ((blk_stt_t)(1u << 5))
+
+/* For a valid block state built from the above masks, at most
+ * KBASE_HWCNT_STATE_BITS bits can be set.
+ */
+#define KBASE_HWCNT_STATE_BITS (6)
+
+/* Mask to detect malformed block state bitmaps. */
+#define KBASE_HWCNT_STATE_MASK ((blk_stt_t)((1u << KBASE_HWCNT_STATE_BITS) - 1))
+
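/* Editor's illustrative sketch (not part of the patch): the state flags above
 * are OR-ed together as observations accumulate over a sample period; the
 * function below is hypothetical.
 */
static blk_stt_t example_observed_state(void)
{
        blk_stt_t stt = KBASE_HWCNT_STATE_UNKNOWN;

        /* Block was seen powered on, available to this VM, in normal mode. */
        stt |= KBASE_HWCNT_STATE_ON | KBASE_HWCNT_STATE_AVAILABLE |
               KBASE_HWCNT_STATE_NORMAL;

        /* A well-formed value never sets bits outside KBASE_HWCNT_STATE_MASK. */
        WARN_ON(stt & ~KBASE_HWCNT_STATE_MASK);
        return stt;
}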
/**
* struct kbase_hwcnt_block_description - Description of one or more identical,
* contiguous, Hardware Counter Blocks.
@@ -133,31 +189,25 @@ struct kbase_hwcnt_block_description {
};
/**
- * struct kbase_hwcnt_group_description - Description of one or more identical,
- * contiguous Hardware Counter Groups.
- * @type: The arbitrary identifier used to identify the type of the group.
- * @blk_cnt: The number of types of Hardware Counter Block in the group.
- * @blks: Non-NULL pointer to an array of blk_cnt block descriptions,
- * describing each type of Hardware Counter Block in the group.
- */
-struct kbase_hwcnt_group_description {
- u64 type;
- size_t blk_cnt;
- const struct kbase_hwcnt_block_description *blks;
+ * struct kbase_hwcnt_avail_mask - Mask type for HW Counter availability.
+ * @mask: Array of bitmask elements.
+ */
+struct kbase_hwcnt_avail_mask {
+ u64 mask[KBASE_HWCNT_AVAIL_MASK_ELEM_COUNT];
};
-/**
+/*
* struct kbase_hwcnt_description - Description of a Hardware Counter System.
- * @grp_cnt: The number of Hardware Counter Groups.
- * @grps: Non-NULL pointer to an array of grp_cnt group descriptions,
- * describing each Hardware Counter Group in the system.
+ * @blk_cnt: The number of Hardware Counter Blocks.
+ * @blks: Non-NULL pointer to an array of blk_cnt block descriptions,
+ * describing each Hardware Counter Block in the system.
* @avail_mask: Flat Availability Mask for all block instances in the system.
* @clk_cnt: The number of clock domains in the system. The maximum is 64.
*/
struct kbase_hwcnt_description {
- size_t grp_cnt;
- const struct kbase_hwcnt_group_description *grps;
- u64 avail_mask;
+ size_t blk_cnt;
+ const struct kbase_hwcnt_block_description *blks;
+ struct kbase_hwcnt_avail_mask avail_mask;
u8 clk_cnt;
};
@@ -183,6 +233,12 @@ struct kbase_hwcnt_description {
* @avail_mask_index: Index in bits into the parent's Availability Mask where
* the Availability Masks of the Block Instances described
* by this metadata start.
+ * @blk_stt_index: Index in blk_stt_t units into the parent's Block State Buffer
+ * where the Block State Masks of the Block Instances described
+ * by this metadata start.
+ * @blk_stt_stride: Stride in the underlying block state tracking type between
+ * the Block State bytes corresponding to each of the
+ * Block Instances.
*/
struct kbase_hwcnt_block_metadata {
u64 type;
@@ -194,58 +250,148 @@ struct kbase_hwcnt_block_metadata {
size_t dump_buf_index;
size_t dump_buf_stride;
size_t avail_mask_index;
+ size_t blk_stt_index;
+ size_t blk_stt_stride;
};
/**
- * struct kbase_hwcnt_group_metadata - Metadata describing the physical layout
- * of a group of blocks in a Hardware
- * Counter System's Dump Buffers and Enable
- * Maps.
- * @type: The arbitrary identifier used to identify the type of the
- * group.
- * @blk_cnt: The number of types of Hardware Counter Block in the
- * group.
- * @blk_metadata: Non-NULL pointer to an array of blk_cnt block metadata,
- * describing the physical layout of each type of Hardware
- * Counter Block in the group.
- * @enable_map_index: Index in u64s into the parent's Enable Map where the
- * Enable Maps of the blocks within the group described by
- * this metadata start.
- * @dump_buf_index: Index in u64s into the parent's Dump Buffer where the
- * Dump Buffers of the blocks within the group described by
- * metadata start.
- * @avail_mask_index: Index in bits into the parent's Availability Mask where
- * the Availability Masks of the blocks within the group
- * described by this metadata start.
- */
-struct kbase_hwcnt_group_metadata {
- u64 type;
- size_t blk_cnt;
- const struct kbase_hwcnt_block_metadata *blk_metadata;
- size_t enable_map_index;
- size_t dump_buf_index;
- size_t avail_mask_index;
-};
+ * kbase_hwcnt_set_avail_mask() - Set bitfield values into a large bitmask. Convenience function.
+ *
+ * @avail_mask: Pointer to destination HWC mask, which is comprised of an array of u64 elements
+ * @u0: Value of element 0.
+ * @u1: Value of element 1.
+ */
+static inline void kbase_hwcnt_set_avail_mask(struct kbase_hwcnt_avail_mask *avail_mask, u64 u0,
+ u64 u1)
+{
+ /* If KBASE_HWCNT_AVAIL_MASK_ELEM_COUNT gets updated, we must modify the signature of
+ * kbase_hwcnt_set_avail_mask() so that all elements continue to be set.
+ */
+ BUILD_BUG_ON(KBASE_HWCNT_AVAIL_MASK_ELEM_COUNT != 2);
+
+ avail_mask->mask[0] = u0;
+ avail_mask->mask[1] = u1;
+}
+
+/**
+ * kbase_hwcnt_avail_masks_equal() - Compare two HWC availability masks
+ * @avail_mask0: First mask to compare
+ * @avail_mask1: Second mask to compare
+ *
+ * Return: true if the masks are equal, otherwise false.
+ */
+static inline bool kbase_hwcnt_avail_masks_equal(const struct kbase_hwcnt_avail_mask *avail_mask0,
+ const struct kbase_hwcnt_avail_mask *avail_mask1)
+{
+ return (!memcmp(avail_mask0, avail_mask1, sizeof(*avail_mask0)));
+}
+
+/**
+ * kbase_hwcnt_avail_masks_equal_values() - Compare a HWC availability mask against element values
+ * @avail_mask: Mask to compare
+ * @u0: First element of mask to compare against
+ * @u1: Second element of mask to compare against
+ *
+ * Return: true if the mask elements match the given values, otherwise false.
+ */
+static inline bool
+kbase_hwcnt_avail_masks_equal_values(const struct kbase_hwcnt_avail_mask *avail_mask, u64 u0,
+ u64 u1)
+{
+ BUILD_BUG_ON(KBASE_HWCNT_AVAIL_MASK_ELEM_COUNT != 2);
+ return ((avail_mask->mask[0] == u0) && (avail_mask->mask[1] == u1));
+}
+
+/**
+ * kbase_hwcnt_cp_avail_mask() - Copy one avail mask into another
+ * @dst_avail_mask: Destination mask
+ * @src_avail_mask: Source mask
+ */
+static inline void kbase_hwcnt_cp_avail_mask(struct kbase_hwcnt_avail_mask *dst_avail_mask,
+ const struct kbase_hwcnt_avail_mask *src_avail_mask)
+{
+ memcpy(dst_avail_mask, src_avail_mask, sizeof(*dst_avail_mask));
+}
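/* Editor's illustrative sketch (not part of the patch): tying the avail-mask
 * helpers above together; the function is hypothetical.
 */
static void example_avail_mask_helpers(void)
{
        struct kbase_hwcnt_avail_mask src, dst;

        /* Populate both u64 elements of the 128-bit availability mask. */
        kbase_hwcnt_set_avail_mask(&src, 0xfULL, 0x0ULL);

        kbase_hwcnt_cp_avail_mask(&dst, &src);

        WARN_ON(!kbase_hwcnt_avail_masks_equal(&src, &dst));
        WARN_ON(!kbase_hwcnt_avail_masks_equal_values(&dst, 0xfULL, 0x0ULL));
}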
+
+/**
+ * kbase_hwcnt_set_avail_mask_bits() - Set a bitfield value into a large bitmask
+ *
+ * @avail_mask: Pointer to destination HWC mask, which is comprised of an array of u64 elements
+ * @offset_in_bits: The bit offset at which to place the value in the bitmask. The value being
+ * placed is expected to be fully contained by the array of bitmask elements.
+ * @length_in_bits: The length of the value being placed in the bitmask. Assumed to be no more
+ * than 64 bits in length.
+ * @value: The source value to be written into the bitmask.
+ */
+static inline void kbase_hwcnt_set_avail_mask_bits(struct kbase_hwcnt_avail_mask *avail_mask,
+ size_t offset_in_bits, size_t length_in_bits,
+ u64 value)
+{
+ size_t arr_offset = offset_in_bits / 64;
+ size_t bits_set = 0;
+
+ if (!length_in_bits)
+ return;
+
+ WARN_ON(length_in_bits > 64);
+ if (WARN_ON((offset_in_bits + length_in_bits) > (KBASE_HWCNT_AVAIL_MASK_ELEM_COUNT << 6)))
+ return;
+
+ do {
+ size_t remaining_to_set = length_in_bits - bits_set;
+ size_t start_dest_bit_in_word = (offset_in_bits + bits_set) - (arr_offset * 64);
+ size_t bits_that_fit_into_this_word =
+ min(64 - start_dest_bit_in_word, remaining_to_set);
+
+ uint64_t dest_mask, mask, source_mask;
+ uint64_t source_fragment;
+
+ if (bits_that_fit_into_this_word == 64) {
+ mask = U64_MAX;
+ source_mask = U64_MAX;
+ dest_mask = U64_MAX;
+ } else {
+ mask = (1ULL << bits_that_fit_into_this_word) - 1;
+ source_mask = ((1ULL << (bits_that_fit_into_this_word)) - 1) << bits_set;
+ dest_mask = mask << start_dest_bit_in_word;
+ }
+
+ source_fragment = (value & source_mask) >> bits_set;
+
+ if (WARN_ON(arr_offset >= KBASE_HWCNT_AVAIL_MASK_ELEM_COUNT))
+ break;
+
+ avail_mask->mask[arr_offset] &= ~dest_mask;
+ avail_mask->mask[arr_offset] |=
+ ((source_fragment & mask) << start_dest_bit_in_word);
+
+ arr_offset++;
+ bits_set += bits_that_fit_into_this_word;
+ } while (bits_set < length_in_bits);
+}
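/* Editor's worked example (not part of the patch): a field that straddles the
 * two u64 elements. Writing the 8-bit value 0xA5 at bit offset 60 places the
 * low nibble (0x5) in bits 60..63 of mask[0] and the high nibble (0xA) in
 * bits 0..3 of mask[1]; the function is hypothetical.
 */
static void example_set_straddling_bits(struct kbase_hwcnt_avail_mask *avail)
{
        kbase_hwcnt_set_avail_mask(avail, 0, 0);
        kbase_hwcnt_set_avail_mask_bits(avail, 60, 8, 0xA5);

        WARN_ON(avail->mask[0] != (0x5ULL << 60));
        WARN_ON(avail->mask[1] != 0xAULL);
}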
/**
* struct kbase_hwcnt_metadata - Metadata describing the memory layout
* of Dump Buffers and Enable Maps within a
* Hardware Counter System.
- * @grp_cnt: The number of Hardware Counter Groups.
- * @grp_metadata: Non-NULL pointer to an array of grp_cnt group metadata,
+ * @blk_cnt: The number of Hardware Counter Blocks
+ * @blk_metadata: Non-NULL pointer to an array of blk_cnt block metadata,
* describing the physical layout of each Hardware Counter
- * Group in the system.
+ * Block in the system.
* @enable_map_bytes: The size in bytes of an Enable Map needed for the system.
* @dump_buf_bytes: The size in bytes of a Dump Buffer needed for the system.
+ * @blk_stt_bytes: The size in bytes of a Block State Buffer needed for
+ * the system.
* @avail_mask: The Availability Mask for the system.
* @clk_cnt: The number of clock domains in the system.
*/
struct kbase_hwcnt_metadata {
- size_t grp_cnt;
- const struct kbase_hwcnt_group_metadata *grp_metadata;
+ size_t blk_cnt;
+ const struct kbase_hwcnt_block_metadata *blk_metadata;
size_t enable_map_bytes;
size_t dump_buf_bytes;
- u64 avail_mask;
+ size_t blk_stt_bytes;
+ struct kbase_hwcnt_avail_mask avail_mask;
u8 clk_cnt;
};
@@ -257,7 +403,7 @@ struct kbase_hwcnt_metadata {
* @hwcnt_enable_map: Non-NULL pointer of size metadata->enable_map_bytes to an
* array of u64 bitfields, each bit of which enables one hardware
* counter.
- * @clk_enable_map: An array of u64 bitfields, each bit of which enables cycle
+ * @clk_enable_map: A u64 bitfield, each bit of which enables the cycle
* counter for a given clock domain.
*/
struct kbase_hwcnt_enable_map {
@@ -274,27 +420,14 @@ struct kbase_hwcnt_enable_map {
* metadata->dump_buf_bytes.
* @clk_cnt_buf: A pointer to an array of u64 values for cycle count elapsed
* for each clock domain.
+ * @blk_stt_buf: A pointer to an array of blk_stt_t values holding block state
+ * information for each block.
*/
struct kbase_hwcnt_dump_buffer {
const struct kbase_hwcnt_metadata *metadata;
u64 *dump_buf;
u64 *clk_cnt_buf;
-};
-
-/**
- * struct kbase_hwcnt_dump_buffer_array - Hardware Counter Dump Buffer array.
- * @page_addr: Address of allocated pages. A single allocation is used for all
- * Dump Buffers in the array.
- * @page_order: The allocation order of the pages, the order is on a logarithmic
- * scale.
- * @buf_cnt: The number of allocated Dump Buffers.
- * @bufs: Non-NULL pointer to the array of Dump Buffers.
- */
-struct kbase_hwcnt_dump_buffer_array {
- unsigned long page_addr;
- unsigned int page_order;
- size_t buf_cnt;
- struct kbase_hwcnt_dump_buffer *bufs;
+ blk_stt_t *blk_stt_buf;
};
/**
@@ -316,232 +449,229 @@ int kbase_hwcnt_metadata_create(const struct kbase_hwcnt_description *desc,
void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata);
/**
- * kbase_hwcnt_metadata_group_count() - Get the number of groups.
- * @metadata: Non-NULL pointer to metadata.
- *
- * Return: Number of hardware counter groups described by metadata.
+ * kbase_hwcnt_block_state_set() - Set one or more block states
+ * for a block instance.
+ * @blk_stt: Pointer to destination block state instance
+ * @stt: Block state bitmask
*/
-static inline size_t kbase_hwcnt_metadata_group_count(const struct kbase_hwcnt_metadata *metadata)
+static inline void kbase_hwcnt_block_state_set(blk_stt_t *blk_stt, blk_stt_t stt)
{
- if (WARN_ON(!metadata))
- return 0;
+ if (WARN_ON(stt & ~KBASE_HWCNT_STATE_MASK))
+ return;
- return metadata->grp_cnt;
+ *blk_stt = stt;
}
/**
- * kbase_hwcnt_metadata_group_type() - Get the arbitrary type of a group.
- * @metadata: Non-NULL pointer to metadata.
- * @grp: Index of the group in the metadata.
- *
- * Return: Type of the group grp.
+ * kbase_hwcnt_block_state_append() - Adds one or more block states
+ * onto a block instance.
+ * @blk_stt: Pointer to destination block state instance
+ * @stt: Block state bitmask
*/
-static inline u64 kbase_hwcnt_metadata_group_type(const struct kbase_hwcnt_metadata *metadata,
- size_t grp)
+static inline void kbase_hwcnt_block_state_append(blk_stt_t *blk_stt, blk_stt_t stt)
{
- if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt))
- return 0;
+ if (WARN_ON(stt & ~KBASE_HWCNT_STATE_MASK))
+ return;
- return metadata->grp_metadata[grp].type;
+ *blk_stt |= stt;
+}
+
+/**
+ * kbase_hwcnt_block_state_copy() - Copy block state between two block
+ * state instances.
+ * @dst_blk_stt: Pointer to destination block state instance
+ * @src_blk_stt: Pointer to source block state instance.
+ */
+static inline void kbase_hwcnt_block_state_copy(blk_stt_t *dst_blk_stt,
+ const blk_stt_t *src_blk_stt)
+{
+ kbase_hwcnt_block_state_set(dst_blk_stt, *src_blk_stt);
+}
+
+/**
+ * kbase_hwcnt_block_state_accumulate() - Accumulate block state between two block
+ * state instances.
+ * @dst_blk_stt: Pointer to destination block state instance
+ * @src_blk_stt: Pointer to source block state instance.
+ */
+static inline void kbase_hwcnt_block_state_accumulate(blk_stt_t *dst_blk_stt,
+ const blk_stt_t *src_blk_stt)
+{
+ kbase_hwcnt_block_state_append(dst_blk_stt, *src_blk_stt);
}
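/* Editor's illustrative sketch (not part of the patch): how the block state
 * helpers above compose across samples; the function is hypothetical.
 */
static blk_stt_t example_merge_block_state(void)
{
        blk_stt_t first, merged;

        /* First sample: block was powered on and available. */
        kbase_hwcnt_block_state_set(&first,
                                    KBASE_HWCNT_STATE_ON | KBASE_HWCNT_STATE_AVAILABLE);

        /* A later sample also observed a transition to off: accumulate it. */
        kbase_hwcnt_block_state_copy(&merged, &first);
        kbase_hwcnt_block_state_append(&merged, KBASE_HWCNT_STATE_OFF);

        /* merged now holds ON | OFF | AVAILABLE. */
        return merged;
}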
/**
- * kbase_hwcnt_metadata_block_count() - Get the number of blocks in a group.
+ * kbase_hwcnt_metadata_block_count() - Get the number of blocks in the metadata.
* @metadata: Non-NULL pointer to metadata.
- * @grp: Index of the group in the metadata.
*
- * Return: Number of blocks in group grp.
+ * Return: Number of blocks in the metadata.
*/
-static inline size_t kbase_hwcnt_metadata_block_count(const struct kbase_hwcnt_metadata *metadata,
- size_t grp)
+static inline size_t kbase_hwcnt_metadata_block_count(const struct kbase_hwcnt_metadata *metadata)
{
- if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt))
+ if (WARN_ON(!metadata))
return 0;
- return metadata->grp_metadata[grp].blk_cnt;
+ return metadata->blk_cnt;
}
/**
* kbase_hwcnt_metadata_block_type() - Get the arbitrary type of a block.
* @metadata: Non-NULL pointer to metadata.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block.
*
- * Return: Type of the block blk in group grp.
+ * Return: Type of the block blk.
*/
static inline u64 kbase_hwcnt_metadata_block_type(const struct kbase_hwcnt_metadata *metadata,
- size_t grp, size_t blk)
+ size_t blk)
{
- if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) ||
- WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt))
+ if (WARN_ON(!metadata) || WARN_ON(blk >= metadata->blk_cnt))
return 0;
- return metadata->grp_metadata[grp].blk_metadata[blk].type;
+ return metadata->blk_metadata[blk].type;
}
/**
* kbase_hwcnt_metadata_block_instance_count() - Get the number of instances of
* a block.
* @metadata: Non-NULL pointer to metadata.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
*
- * Return: Number of instances of block blk in group grp.
+ * Return: Number of instances of block blk.
*/
static inline size_t
-kbase_hwcnt_metadata_block_instance_count(const struct kbase_hwcnt_metadata *metadata, size_t grp,
- size_t blk)
+kbase_hwcnt_metadata_block_instance_count(const struct kbase_hwcnt_metadata *metadata, size_t blk)
{
- if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) ||
- WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt))
+ if (WARN_ON(!metadata) || WARN_ON(blk >= metadata->blk_cnt))
return 0;
- return metadata->grp_metadata[grp].blk_metadata[blk].inst_cnt;
+ return metadata->blk_metadata[blk].inst_cnt;
}
/**
* kbase_hwcnt_metadata_block_headers_count() - Get the number of counter
* headers.
* @metadata: Non-NULL pointer to metadata.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
*
- * Return: Number of counter headers in each instance of block blk in group grp.
+ * Return: Number of counter headers in each instance of block blk.
*/
static inline size_t
-kbase_hwcnt_metadata_block_headers_count(const struct kbase_hwcnt_metadata *metadata, size_t grp,
- size_t blk)
+kbase_hwcnt_metadata_block_headers_count(const struct kbase_hwcnt_metadata *metadata, size_t blk)
{
- if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) ||
- WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt))
+ if (WARN_ON(!metadata) || WARN_ON(blk >= metadata->blk_cnt))
return 0;
- return metadata->grp_metadata[grp].blk_metadata[blk].hdr_cnt;
+ return metadata->blk_metadata[blk].hdr_cnt;
}
/**
* kbase_hwcnt_metadata_block_counters_count() - Get the number of counters.
* @metadata: Non-NULL pointer to metadata.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
*
- * Return: Number of counters in each instance of block blk in group grp.
+ * Return: Number of counters in each instance of block blk.
*/
static inline size_t
-kbase_hwcnt_metadata_block_counters_count(const struct kbase_hwcnt_metadata *metadata, size_t grp,
- size_t blk)
+kbase_hwcnt_metadata_block_counters_count(const struct kbase_hwcnt_metadata *metadata, size_t blk)
{
- if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) ||
- WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt))
+ if (WARN_ON(!metadata) || WARN_ON(blk >= metadata->blk_cnt))
return 0;
- return metadata->grp_metadata[grp].blk_metadata[blk].ctr_cnt;
+ return metadata->blk_metadata[blk].ctr_cnt;
}
/**
* kbase_hwcnt_metadata_block_enable_map_stride() - Get the enable map stride.
* @metadata: Non-NULL pointer to metadata.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
*
- * Return: enable map stride in each instance of block blk in group grp.
+ * Return: enable map stride in each instance of block blk.
*/
static inline size_t
kbase_hwcnt_metadata_block_enable_map_stride(const struct kbase_hwcnt_metadata *metadata,
- size_t grp, size_t blk)
+ size_t blk)
{
- if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) ||
- WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt))
+ if (WARN_ON(!metadata) || WARN_ON(blk >= metadata->blk_cnt))
return 0;
- return metadata->grp_metadata[grp].blk_metadata[blk].enable_map_stride;
+ return metadata->blk_metadata[blk].enable_map_stride;
}
/**
* kbase_hwcnt_metadata_block_values_count() - Get the number of values.
* @metadata: Non-NULL pointer to metadata.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
*
* Return: Number of headers plus counters in each instance of block blk
- * in group grp.
+ * in the metadata.
*/
static inline size_t
-kbase_hwcnt_metadata_block_values_count(const struct kbase_hwcnt_metadata *metadata, size_t grp,
- size_t blk)
+kbase_hwcnt_metadata_block_values_count(const struct kbase_hwcnt_metadata *metadata, size_t blk)
{
- if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) ||
- WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt))
+ if (WARN_ON(!metadata) || WARN_ON(blk >= metadata->blk_cnt))
return 0;
- return kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk) +
- kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk);
+ return kbase_hwcnt_metadata_block_counters_count(metadata, blk) +
+ kbase_hwcnt_metadata_block_headers_count(metadata, blk);
}
/**
* kbase_hwcnt_metadata_for_each_block() - Iterate over each block instance in
* the metadata.
* @md: Non-NULL pointer to metadata.
- * @grp: size_t variable used as group iterator.
* @blk: size_t variable used as block iterator.
* @blk_inst: size_t variable used as block instance iterator.
*
- * Iteration order is group, then block, then block instance (i.e. linearly
- * through memory).
+ * Iteration order is block, then block instance (i.e. linearly through memory).
*/
-#define kbase_hwcnt_metadata_for_each_block(md, grp, blk, blk_inst) \
- for ((grp) = 0; (grp) < kbase_hwcnt_metadata_group_count((md)); (grp)++) \
- for ((blk) = 0; (blk) < kbase_hwcnt_metadata_block_count((md), (grp)); (blk)++) \
- for ((blk_inst) = 0; \
- (blk_inst) < \
- kbase_hwcnt_metadata_block_instance_count((md), (grp), (blk)); \
- (blk_inst)++)
+#define kbase_hwcnt_metadata_for_each_block(md, blk, blk_inst) \
+ for ((blk) = 0; (blk) < kbase_hwcnt_metadata_block_count((md)); (blk)++) \
+ for ((blk_inst) = 0; \
+ (blk_inst) < kbase_hwcnt_metadata_block_instance_count((md), (blk)); \
+ (blk_inst)++)
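/* Editor's illustrative sketch (not part of the patch): the iteration pattern
 * after the group level was removed uses two indices instead of three. The
 * function is hypothetical and assumes a valid metadata pointer.
 */
static size_t example_count_available_instances(const struct kbase_hwcnt_metadata *md)
{
        size_t blk, blk_inst, avail = 0;

        kbase_hwcnt_metadata_for_each_block(md, blk, blk_inst) {
                if (kbase_hwcnt_metadata_block_instance_avail(md, blk, blk_inst))
                        avail++;
        }
        return avail;
}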
/**
* kbase_hwcnt_metadata_block_avail_bit() - Get the bit index into the avail
* mask corresponding to the block.
* @metadata: Non-NULL pointer to metadata.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
*
* Return: The bit index into the avail mask for the block.
*/
static inline size_t
-kbase_hwcnt_metadata_block_avail_bit(const struct kbase_hwcnt_metadata *metadata, size_t grp,
- size_t blk)
+kbase_hwcnt_metadata_block_avail_bit(const struct kbase_hwcnt_metadata *metadata, size_t blk)
{
- if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) ||
- WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt))
+ if (WARN_ON(!metadata) || WARN_ON(blk >= metadata->blk_cnt))
return 0;
- return metadata->grp_metadata[grp].avail_mask_index +
- metadata->grp_metadata[grp].blk_metadata[blk].avail_mask_index;
+ return metadata->blk_metadata[blk].avail_mask_index;
}
/**
* kbase_hwcnt_metadata_block_instance_avail() - Check if a block instance is
* available.
* @metadata: Non-NULL pointer to metadata.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
* @blk_inst: Index of the block instance in the block.
*
* Return: true if the block instance is available, else false.
*/
static inline bool
-kbase_hwcnt_metadata_block_instance_avail(const struct kbase_hwcnt_metadata *metadata, size_t grp,
- size_t blk, size_t blk_inst)
+kbase_hwcnt_metadata_block_instance_avail(const struct kbase_hwcnt_metadata *metadata, size_t blk,
+ size_t blk_inst)
{
size_t bit;
+ size_t mask_index;
u64 mask;
if (WARN_ON(!metadata))
return false;
- bit = kbase_hwcnt_metadata_block_avail_bit(metadata, grp, blk) + blk_inst;
- mask = 1ull << bit;
+ bit = kbase_hwcnt_metadata_block_avail_bit(metadata, blk) + blk_inst;
+ mask_index = bit >> 6;
+ mask = 1ull << (bit & 0x3f);
- return (metadata->avail_mask & mask) != 0;
+ return (metadata->avail_mask.mask[mask_index] & mask) != 0;
}
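/* Editor's worked example (not part of the patch): the flat bit position is
 * split across the two u64 elements of the availability mask. An avail bit of
 * 68 with blk_inst == 2 gives bit 70, so mask_index == 70 >> 6 == 1 and the
 * tested bit is 1ull << (70 & 0x3f) == 1ull << 6, i.e. the check becomes
 * metadata->avail_mask.mask[1] & (1ull << 6).
 */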
/**
@@ -568,31 +698,28 @@ void kbase_hwcnt_enable_map_free(struct kbase_hwcnt_enable_map *enable_map);
* kbase_hwcnt_enable_map_block_instance() - Get the pointer to a block
* instance's enable map.
* @map: Non-NULL pointer to enable map.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
* @blk_inst: Index of the block instance in the block.
*
* Return: u64* to the bitfield(s) used as the enable map for the
* block instance.
*/
static inline u64 *kbase_hwcnt_enable_map_block_instance(const struct kbase_hwcnt_enable_map *map,
- size_t grp, size_t blk, size_t blk_inst)
+ size_t blk, size_t blk_inst)
{
if (WARN_ON(!map) || WARN_ON(!map->hwcnt_enable_map))
return NULL;
- if (WARN_ON(!map->metadata) || WARN_ON(grp >= map->metadata->grp_cnt) ||
- WARN_ON(blk >= map->metadata->grp_metadata[grp].blk_cnt) ||
- WARN_ON(blk_inst >= map->metadata->grp_metadata[grp].blk_metadata[blk].inst_cnt))
+ if (WARN_ON(!map->metadata) || WARN_ON(blk >= map->metadata->blk_cnt) ||
+ WARN_ON(blk_inst >= map->metadata->blk_metadata[blk].inst_cnt))
return map->hwcnt_enable_map;
- return map->hwcnt_enable_map + map->metadata->grp_metadata[grp].enable_map_index +
- map->metadata->grp_metadata[grp].blk_metadata[blk].enable_map_index +
- (map->metadata->grp_metadata[grp].blk_metadata[blk].enable_map_stride * blk_inst);
+ return map->hwcnt_enable_map + map->metadata->blk_metadata[blk].enable_map_index +
+ (map->metadata->blk_metadata[blk].enable_map_stride * blk_inst);
}
/**
- * kbase_hwcnt_bitfield_count() - Calculate the number of u64 bitfields required
+ * kbase_hwcnt_bitfield_count - Calculate the number of u64 bitfields required
* to have at minimum one bit per value.
* @val_cnt: Number of values.
*
@@ -604,24 +731,22 @@ static inline size_t kbase_hwcnt_bitfield_count(size_t val_cnt)
}
/**
- * kbase_hwcnt_enable_map_block_disable_all() - Disable all values in a block.
+ * kbase_hwcnt_enable_map_block_disable_all - Disable all values in a block.
* @dst: Non-NULL pointer to enable map.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
* @blk_inst: Index of the block instance in the block.
*/
static inline void kbase_hwcnt_enable_map_block_disable_all(struct kbase_hwcnt_enable_map *dst,
- size_t grp, size_t blk, size_t blk_inst)
+ size_t blk, size_t blk_inst)
{
size_t val_cnt;
size_t bitfld_cnt;
- u64 *const block_enable_map =
- kbase_hwcnt_enable_map_block_instance(dst, grp, blk, blk_inst);
+ u64 *const block_enable_map = kbase_hwcnt_enable_map_block_instance(dst, blk, blk_inst);
if (WARN_ON(!dst))
return;
- val_cnt = kbase_hwcnt_metadata_block_values_count(dst->metadata, grp, blk);
+ val_cnt = kbase_hwcnt_metadata_block_values_count(dst->metadata, blk);
bitfld_cnt = kbase_hwcnt_bitfield_count(val_cnt);
memset(block_enable_map, 0, bitfld_cnt * KBASE_HWCNT_BITFIELD_BYTES);
@@ -645,23 +770,21 @@ static inline void kbase_hwcnt_enable_map_disable_all(struct kbase_hwcnt_enable_
/**
* kbase_hwcnt_enable_map_block_enable_all() - Enable all values in a block.
* @dst: Non-NULL pointer to enable map.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
* @blk_inst: Index of the block instance in the block.
*/
static inline void kbase_hwcnt_enable_map_block_enable_all(struct kbase_hwcnt_enable_map *dst,
- size_t grp, size_t blk, size_t blk_inst)
+ size_t blk, size_t blk_inst)
{
size_t val_cnt;
size_t bitfld_cnt;
- u64 *const block_enable_map =
- kbase_hwcnt_enable_map_block_instance(dst, grp, blk, blk_inst);
+ u64 *const block_enable_map = kbase_hwcnt_enable_map_block_instance(dst, blk, blk_inst);
size_t bitfld_idx;
if (WARN_ON(!dst))
return;
- val_cnt = kbase_hwcnt_metadata_block_values_count(dst->metadata, grp, blk);
+ val_cnt = kbase_hwcnt_metadata_block_values_count(dst->metadata, blk);
bitfld_cnt = kbase_hwcnt_bitfield_count(val_cnt);
for (bitfld_idx = 0; bitfld_idx < bitfld_cnt; bitfld_idx++) {
@@ -682,13 +805,13 @@ static inline void kbase_hwcnt_enable_map_block_enable_all(struct kbase_hwcnt_en
*/
static inline void kbase_hwcnt_enable_map_enable_all(struct kbase_hwcnt_enable_map *dst)
{
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
if (WARN_ON(!dst) || WARN_ON(!dst->metadata))
return;
- kbase_hwcnt_metadata_for_each_block(dst->metadata, grp, blk, blk_inst)
- kbase_hwcnt_enable_map_block_enable_all(dst, grp, blk, blk_inst);
+ kbase_hwcnt_metadata_for_each_block(dst->metadata, blk, blk_inst)
+ kbase_hwcnt_enable_map_block_enable_all(dst, blk, blk_inst);
dst->clk_enable_map = (1ull << dst->metadata->clk_cnt) - 1;
}
@@ -751,27 +874,26 @@ static inline void kbase_hwcnt_enable_map_union(struct kbase_hwcnt_enable_map *d
* kbase_hwcnt_enable_map_block_enabled() - Check if any values in a block
* instance are enabled.
* @enable_map: Non-NULL pointer to enable map.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
* @blk_inst: Index of the block instance in the block.
*
* Return: true if any values in the block are enabled, else false.
*/
static inline bool
-kbase_hwcnt_enable_map_block_enabled(const struct kbase_hwcnt_enable_map *enable_map, size_t grp,
- size_t blk, size_t blk_inst)
+kbase_hwcnt_enable_map_block_enabled(const struct kbase_hwcnt_enable_map *enable_map, size_t blk,
+ size_t blk_inst)
{
bool any_enabled = false;
size_t val_cnt;
size_t bitfld_cnt;
const u64 *const block_enable_map =
- kbase_hwcnt_enable_map_block_instance(enable_map, grp, blk, blk_inst);
+ kbase_hwcnt_enable_map_block_instance(enable_map, blk, blk_inst);
size_t bitfld_idx;
if (WARN_ON(!enable_map))
return false;
- val_cnt = kbase_hwcnt_metadata_block_values_count(enable_map->metadata, grp, blk);
+ val_cnt = kbase_hwcnt_metadata_block_values_count(enable_map->metadata, blk);
bitfld_cnt = kbase_hwcnt_bitfield_count(val_cnt);
for (bitfld_idx = 0; bitfld_idx < bitfld_cnt; bitfld_idx++) {
@@ -796,7 +918,7 @@ kbase_hwcnt_enable_map_block_enabled(const struct kbase_hwcnt_enable_map *enable
static inline bool
kbase_hwcnt_enable_map_any_enabled(const struct kbase_hwcnt_enable_map *enable_map)
{
- size_t grp, blk, blk_inst;
+ size_t blk, blk_inst;
u64 clk_enable_map_mask;
if (WARN_ON(!enable_map) || WARN_ON(!enable_map->metadata))
@@ -807,9 +929,8 @@ kbase_hwcnt_enable_map_any_enabled(const struct kbase_hwcnt_enable_map *enable_m
if (enable_map->metadata->clk_cnt > 0 && (enable_map->clk_enable_map & clk_enable_map_mask))
return true;
- kbase_hwcnt_metadata_for_each_block(enable_map->metadata, grp, blk, blk_inst)
- {
- if (kbase_hwcnt_enable_map_block_enabled(enable_map, grp, blk, blk_inst))
+ kbase_hwcnt_metadata_for_each_block(enable_map->metadata, blk, blk_inst) {
+ if (kbase_hwcnt_enable_map_block_enabled(enable_map, blk, blk_inst))
return true;
}
@@ -869,9 +990,8 @@ static inline void kbase_hwcnt_enable_map_block_disable_value(u64 *bitfld, size_
/**
* kbase_hwcnt_dump_buffer_alloc() - Allocate a dump buffer.
* @metadata: Non-NULL pointer to metadata describing the system.
- * @dump_buf: Non-NULL pointer to dump buffer to be initialised. Will be
- * initialised to undefined values, so must be used as a copy dest,
- * or cleared before use.
+ * @dump_buf: Non-NULL pointer to the dump buffer to be initialised. The
+ * backing memory will be zero-allocated.
*
* Return: 0 on success, else error code.
*/
@@ -888,53 +1008,51 @@ int kbase_hwcnt_dump_buffer_alloc(const struct kbase_hwcnt_metadata *metadata,
void kbase_hwcnt_dump_buffer_free(struct kbase_hwcnt_dump_buffer *dump_buf);
/**
- * kbase_hwcnt_dump_buffer_array_alloc() - Allocate an array of dump buffers.
- * @metadata: Non-NULL pointer to metadata describing the system.
- * @n: Number of dump buffers to allocate
- * @dump_bufs: Non-NULL pointer to dump buffer array to be initialised.
- *
- * A single zeroed contiguous page allocation will be used for all of the
- * buffers inside the array, where:
- * dump_bufs[n].dump_buf == page_addr + n * metadata.dump_buf_bytes
- *
- * Return: 0 on success, else error code.
- */
-int kbase_hwcnt_dump_buffer_array_alloc(const struct kbase_hwcnt_metadata *metadata, size_t n,
- struct kbase_hwcnt_dump_buffer_array *dump_bufs);
-
-/**
- * kbase_hwcnt_dump_buffer_array_free() - Free a dump buffer array.
- * @dump_bufs: Dump buffer array to be freed.
- *
- * Can be safely called on an all-zeroed dump buffer array structure, or on an
- * already freed dump buffer array.
- */
-void kbase_hwcnt_dump_buffer_array_free(struct kbase_hwcnt_dump_buffer_array *dump_bufs);
-
-/**
* kbase_hwcnt_dump_buffer_block_instance() - Get the pointer to a block
* instance's dump buffer.
* @buf: Non-NULL pointer to dump buffer.
- * @grp: Index of the group in the metadata.
- * @blk: Index of the block in the group.
+ * @blk: Index of the block in the metadata.
* @blk_inst: Index of the block instance in the block.
*
* Return: u64* to the dump buffer for the block instance.
*/
static inline u64 *kbase_hwcnt_dump_buffer_block_instance(const struct kbase_hwcnt_dump_buffer *buf,
- size_t grp, size_t blk, size_t blk_inst)
+ size_t blk, size_t blk_inst)
{
if (WARN_ON(!buf) || WARN_ON(!buf->dump_buf))
return NULL;
- if (WARN_ON(!buf->metadata) || WARN_ON(grp >= buf->metadata->grp_cnt) ||
- WARN_ON(blk >= buf->metadata->grp_metadata[grp].blk_cnt) ||
- WARN_ON(blk_inst >= buf->metadata->grp_metadata[grp].blk_metadata[blk].inst_cnt))
+ if (WARN_ON(!buf->metadata) || WARN_ON(blk >= buf->metadata->blk_cnt) ||
+ WARN_ON(blk_inst >= buf->metadata->blk_metadata[blk].inst_cnt))
return buf->dump_buf;
- return buf->dump_buf + buf->metadata->grp_metadata[grp].dump_buf_index +
- buf->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_index +
- (buf->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_stride * blk_inst);
+ return buf->dump_buf + buf->metadata->blk_metadata[blk].dump_buf_index +
+ (buf->metadata->blk_metadata[blk].dump_buf_stride * blk_inst);
+}
+
+/**
+ * kbase_hwcnt_dump_buffer_block_state_instance() - Get the pointer to a block
+ * instance's block state mask.
+ * @buf: Non-NULL pointer to dump buffer.
+ * @blk: Index of the block in the metadata.
+ * @blk_inst: Index of the block instance in the block.
+ *
+ * Return: blk_stt_t* to the block state mask of the block instance in the dump
+ * buffer.
+ */
+static inline blk_stt_t *
+kbase_hwcnt_dump_buffer_block_state_instance(const struct kbase_hwcnt_dump_buffer *buf, size_t blk,
+ size_t blk_inst)
+{
+ if (WARN_ON(!buf) || WARN_ON(!buf->dump_buf))
+ return NULL;
+
+ if (WARN_ON(!buf->metadata) || WARN_ON(blk >= buf->metadata->blk_cnt) ||
+ WARN_ON(blk_inst >= buf->metadata->blk_metadata[blk].inst_cnt))
+ return buf->blk_stt_buf;
+
+ return buf->blk_stt_buf + buf->metadata->blk_metadata[blk].blk_stt_index +
+ (buf->metadata->blk_metadata[blk].blk_stt_stride * blk_inst);
}
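/* Editor's illustrative sketch (not part of the patch): fetching a block
 * instance's state mask and appending an observation to it; buf, blk and
 * blk_inst are assumed valid for the same metadata.
 */
static void example_append_block_state(struct kbase_hwcnt_dump_buffer *buf, size_t blk,
                                       size_t blk_inst)
{
        blk_stt_t *blk_stt = kbase_hwcnt_dump_buffer_block_state_instance(buf, blk, blk_inst);

        if (blk_stt)
                kbase_hwcnt_block_state_append(blk_stt, KBASE_HWCNT_STATE_ON);
}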
/**
@@ -1228,4 +1346,19 @@ static inline bool kbase_hwcnt_clk_enable_map_enabled(const u64 clk_enable_map,
return false;
}
+/**
+ * kbase_hwcnt_dump_buffer_block_state_update() - Update the enabled block instances' block states
+ * in dst. After the operation, all non-enabled or
+ * unavailable block instances will be unchanged.
+ * @dst: Non-NULL pointer to dump buffer.
+ * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values.
+ * @blk_stt_val: Mask of block states to update. Block states not set in this mask will still be
+ * preserved in dst.
+ *
+ * The dst and dst_enable_map MUST have been created from the same metadata.
+ */
+void kbase_hwcnt_dump_buffer_block_state_update(struct kbase_hwcnt_dump_buffer *dst,
+ const struct kbase_hwcnt_enable_map *dst_enable_map,
+ blk_stt_t blk_stt_val);
+
#endif /* _KBASE_HWCNT_TYPES_H_ */
diff --git a/mali_kbase/hwcnt/mali_kbase_hwcnt_virtualizer.c b/mali_kbase/hwcnt/mali_kbase_hwcnt_virtualizer.c
index d618764..89cca45 100644
--- a/mali_kbase/hwcnt/mali_kbase_hwcnt_virtualizer.c
+++ b/mali_kbase/hwcnt/mali_kbase_hwcnt_virtualizer.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
*
- * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved.
*
* This program is free software and is provided to you under the terms of the
* GNU General Public License version 2 as published by the Free Software
@@ -255,7 +255,7 @@ static int kbasep_hwcnt_virtualizer_client_add(struct kbase_hwcnt_virtualizer *h
/* Make the scratch enable map the union of all enable maps */
kbase_hwcnt_enable_map_copy(&hvirt->scratch_map, enable_map);
- list_for_each_entry (pos, &hvirt->clients, node)
+ list_for_each_entry(pos, &hvirt->clients, node)
kbase_hwcnt_enable_map_union(&hvirt->scratch_map, &pos->enable_map);
/* Set the counters with the new union enable map */
@@ -264,7 +264,7 @@ static int kbasep_hwcnt_virtualizer_client_add(struct kbase_hwcnt_virtualizer *h
&hvirt->scratch_buf);
/* Accumulate into only existing clients' accumulation bufs */
if (!errcode)
- list_for_each_entry (pos, &hvirt->clients, node)
+ list_for_each_entry(pos, &hvirt->clients, node)
kbasep_hwcnt_virtualizer_client_accumulate(pos,
&hvirt->scratch_buf);
}
@@ -315,7 +315,7 @@ static void kbasep_hwcnt_virtualizer_client_remove(struct kbase_hwcnt_virtualize
struct kbase_hwcnt_virtualizer_client *pos;
/* Make the scratch enable map the union of all enable maps */
kbase_hwcnt_enable_map_disable_all(&hvirt->scratch_map);
- list_for_each_entry (pos, &hvirt->clients, node)
+ list_for_each_entry(pos, &hvirt->clients, node)
kbase_hwcnt_enable_map_union(&hvirt->scratch_map, &pos->enable_map);
/* Set the counters with the new union enable map */
errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, &hvirt->scratch_map,
@@ -323,7 +323,7 @@ static void kbasep_hwcnt_virtualizer_client_remove(struct kbase_hwcnt_virtualize
&hvirt->scratch_buf);
/* Accumulate into remaining clients' accumulation bufs */
if (!errcode) {
- list_for_each_entry (pos, &hvirt->clients, node)
+ list_for_each_entry(pos, &hvirt->clients, node)
kbasep_hwcnt_virtualizer_client_accumulate(pos,
&hvirt->scratch_buf);
@@ -373,7 +373,7 @@ static int kbasep_hwcnt_virtualizer_client_set_counters(
/* Make the scratch enable map the union of all enable maps */
kbase_hwcnt_enable_map_copy(&hvirt->scratch_map, enable_map);
- list_for_each_entry (pos, &hvirt->clients, node)
+ list_for_each_entry(pos, &hvirt->clients, node)
/* Ignore the enable map of the selected client */
if (pos != hvcli)
kbase_hwcnt_enable_map_union(&hvirt->scratch_map, &pos->enable_map);
@@ -385,7 +385,7 @@ static int kbasep_hwcnt_virtualizer_client_set_counters(
return errcode;
/* Accumulate into all accumulation bufs except the selected client's */
- list_for_each_entry (pos, &hvirt->clients, node)
+ list_for_each_entry(pos, &hvirt->clients, node)
if (pos != hvcli)
kbasep_hwcnt_virtualizer_client_accumulate(pos, &hvirt->scratch_buf);
@@ -503,7 +503,7 @@ static int kbasep_hwcnt_virtualizer_client_dump(struct kbase_hwcnt_virtualizer *
return errcode;
/* Accumulate into all accumulation bufs except the selected client's */
- list_for_each_entry (pos, &hvirt->clients, node)
+ list_for_each_entry(pos, &hvirt->clients, node)
if (pos != hvcli)
kbasep_hwcnt_virtualizer_client_accumulate(pos, &hvirt->scratch_buf);
@@ -724,7 +724,7 @@ void kbase_hwcnt_virtualizer_term(struct kbase_hwcnt_virtualizer *hvirt)
if (WARN_ON(hvirt->client_count != 0)) {
struct kbase_hwcnt_virtualizer_client *pos, *n;
- list_for_each_entry_safe (pos, n, &hvirt->clients, node)
+ list_for_each_entry_safe(pos, n, &hvirt->clients, node)
kbase_hwcnt_virtualizer_client_destroy(pos);
}