Diffstat (limited to 'mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c')
-rw-r--r--	mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c	| 397
1 file changed, 333 insertions(+), 64 deletions(-)
diff --git a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c
index 27acfc6..d7911ae 100644
--- a/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c
+++ b/mali_kbase/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c
@@ -44,6 +44,9 @@
 #define HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS ((u32)1000)
 #endif /* IS_FPGA && !NO_MALI */
 
+/* Used to check for a sample in which all counters in the block are disabled */
+#define HWCNT_BLOCK_EMPTY_SAMPLE (2)
+
 /**
  * enum kbase_hwcnt_backend_csf_dump_state - HWC CSF backend dumping states.
  *
@@ -172,15 +175,16 @@ struct kbase_hwcnt_backend_csf_info {
 
 /**
  * struct kbase_hwcnt_csf_physical_layout - HWC sample memory physical layout
- *                                          information.
- * @hw_block_cnt: Total number of hardware counters blocks. The hw counters blocks are
- *                sub-categorized into 4 classes: front-end, tiler, memory system, and shader.
- *                hw_block_cnt = fe_cnt + tiler_cnt + mmu_l2_cnt + shader_cnt.
+ *                                          information, as defined by the spec.
 * @fe_cnt:        Front end block count.
 * @tiler_cnt:     Tiler block count.
 * @mmu_l2_cnt:    Memory system (MMU and L2 cache) block count.
 * @shader_cnt:    Shader Core block count.
- * @fw_block_cnt: Total number of firmware counters blocks.
+ * @fw_block_cnt: Total number of firmware counter blocks, with a single
+ *                global FW block and a block per CSG.
+ * @hw_block_cnt: Total number of hardware counter blocks. The hw counters blocks are
+ *                sub-categorized into 4 classes: front-end, tiler, memory system, and shader.
+ *                hw_block_cnt = fe_cnt + tiler_cnt + mmu_l2_cnt + shader_cnt.
 * @block_cnt:     Total block count (sum of all counter blocks: hw_block_cnt + fw_block_cnt).
 * @shader_avail_mask: Bitmap of all shader cores in the system.
 * @enable_mask_offset: Offset in array elements of enable mask in each block
@@ -190,12 +194,12 @@ struct kbase_hwcnt_backend_csf_info {
 * @values_per_block: For any block, the number of counters in total (header + payload).
 */
 struct kbase_hwcnt_csf_physical_layout {
-	u8 hw_block_cnt;
 	u8 fe_cnt;
 	u8 tiler_cnt;
 	u8 mmu_l2_cnt;
 	u8 shader_cnt;
 	u8 fw_block_cnt;
+	u8 hw_block_cnt;
 	u8 block_cnt;
 	u64 shader_avail_mask;
 	size_t enable_mask_offset;
@@ -220,6 +224,13 @@ struct kbase_hwcnt_csf_physical_layout {
 * @old_sample_buf:       HWC sample buffer to save the previous values
 *                        for delta calculation, size
 *                        prfcnt_info.dump_bytes.
+ * @block_states:        Pointer to array of block_state values for all
+ *                       blocks.
+ * @to_user_block_states: Block state buffer for client user.
+ * @accum_all_blk_stt:   Block state to accumulate for all known blocks
+ *                       on next sample.
+ * @sampled_all_blk_stt: Block state to accumulate for all known blocks
+ *                       into the current sample.
 * @watchdog_last_seen_insert_idx: The insert index which watchdog has last
 *                                 seen, to check any new firmware automatic
 *                                 samples generated during the watchdog
@@ -243,6 +254,8 @@
 * @hwc_dump_work:        Worker to accumulate samples.
 * @hwc_threshold_work:   Worker for consuming available samples when
 *                        threshold interrupt raised.
+ * @num_l2_slices:       Current number of L2 slices allocated to the GPU.
+ * @shader_present_bitmap: Current shader-present bitmap that is allocated to the GPU.
 */
 struct kbase_hwcnt_backend_csf {
 	struct kbase_hwcnt_backend_csf_info *info;
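The reordered fields above keep the documented relationship between the counts intact. As an illustrative sketch only (not part of the patch; the function and parameter names are hypothetical, and the 1 + csg_cnt term follows the new @fw_block_cnt documentation of one global FW block plus one block per CSG):

#include <stddef.h>

/* Sketch only: mirrors the @fw_block_cnt/@hw_block_cnt/@block_cnt docs above. */
static size_t example_total_block_cnt(size_t fe_cnt, size_t tiler_cnt, size_t mmu_l2_cnt,
				      size_t shader_cnt, size_t csg_cnt)
{
	size_t fw_block_cnt = 1 + csg_cnt; /* one global FW block + one per CSG */
	size_t hw_block_cnt = fe_cnt + tiler_cnt + mmu_l2_cnt + shader_cnt;

	return fw_block_cnt + hw_block_cnt; /* block_cnt */
}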
@@ -253,6 +266,10 @@ struct kbase_hwcnt_backend_csf {
 	u64 *to_user_buf;
 	u64 *accum_buf;
 	u32 *old_sample_buf;
+	blk_stt_t *block_states;
+	blk_stt_t *to_user_block_states;
+	blk_stt_t accum_all_blk_stt;
+	blk_stt_t sampled_all_blk_stt;
 	u32 watchdog_last_seen_insert_idx;
 	struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf;
 	void *ring_buf_cpu_base;
@@ -265,15 +282,45 @@ struct kbase_hwcnt_backend_csf {
 	struct workqueue_struct *hwc_dump_workq;
 	struct work_struct hwc_dump_work;
 	struct work_struct hwc_threshold_work;
+	size_t num_l2_slices;
+	u64 shader_present_bitmap;
 };
 
 static bool kbasep_hwcnt_backend_csf_backend_exists(struct kbase_hwcnt_backend_csf_info *csf_info)
 {
-	WARN_ON(!csf_info);
+	if (WARN_ON(!csf_info))
+		return false;
+
 	csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx);
 	return (csf_info->backend != NULL);
 }
 
+void kbase_hwcnt_backend_csf_set_hw_availability(struct kbase_hwcnt_backend_interface *iface,
+						 size_t num_l2_slices, u64 shader_present_bitmap)
+{
+	struct kbase_hwcnt_backend_csf_info *csf_info;
+
+	if (!iface)
+		return;
+
+	csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info;
+
+	/* Early out if the backend does not exist. */
+	if (!csf_info || !csf_info->backend)
+		return;
+
+	if (WARN_ON(csf_info->backend->enable_state != KBASE_HWCNT_BACKEND_CSF_DISABLED))
+		return;
+
+	if (WARN_ON(num_l2_slices > csf_info->backend->phys_layout.mmu_l2_cnt) ||
+	    WARN_ON((shader_present_bitmap & csf_info->backend->phys_layout.shader_avail_mask) !=
+		    shader_present_bitmap))
+		return;
+
+	csf_info->backend->num_l2_slices = num_l2_slices;
+	csf_info->backend->shader_present_bitmap = shader_present_bitmap;
+}
+
 /**
  * kbasep_hwcnt_backend_csf_cc_initial_sample() - Initialize cycle count
  *                                                tracking.
@@ -295,8 +342,7 @@ kbasep_hwcnt_backend_csf_cc_initial_sample(struct kbase_hwcnt_backend_csf *backe
 	backend_csf->info->csf_if->get_gpu_cycle_count(backend_csf->info->csf_if->ctx, cycle_counts,
 						       clk_enable_map);
 
-	kbase_hwcnt_metadata_for_each_clock(enable_map->metadata, clk)
-	{
+	kbase_hwcnt_metadata_for_each_clock(enable_map->metadata, clk) {
 		if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, clk))
 			backend_csf->prev_cycle_count[clk] = cycle_counts[clk];
 	}
@@ -317,8 +363,7 @@ static void kbasep_hwcnt_backend_csf_cc_update(struct kbase_hwcnt_backend_csf *b
 	backend_csf->info->csf_if->get_gpu_cycle_count(backend_csf->info->csf_if->ctx, cycle_counts,
 						       backend_csf->clk_enable_map);
 
-	kbase_hwcnt_metadata_for_each_clock(backend_csf->info->metadata, clk)
-	{
+	kbase_hwcnt_metadata_for_each_clock(backend_csf->info->metadata, clk) {
 		if (kbase_hwcnt_clk_enable_map_enabled(backend_csf->clk_enable_map, clk)) {
 			backend_csf->cycle_count_elapsed[clk] =
 				cycle_counts[clk] - backend_csf->prev_cycle_count[clk];
@@ -340,29 +385,29 @@ static u64 kbasep_hwcnt_backend_csf_timestamp_ns(struct kbase_hwcnt_backend *bac
 
 /** kbasep_hwcnt_backend_csf_process_enable_map() - Process the enable_map to
  *                                                  guarantee headers are
- *                                                  enabled if any counter is
- *                                                  required.
+ *                                                  enabled.
 *@phys_enable_map: HWC physical enable map to be processed.
 */
-static void
-kbasep_hwcnt_backend_csf_process_enable_map(struct kbase_hwcnt_physical_enable_map *phys_enable_map)
+void kbasep_hwcnt_backend_csf_process_enable_map(
+	struct kbase_hwcnt_physical_enable_map *phys_enable_map)
 {
 	WARN_ON(!phys_enable_map);
 
-	/* Enable header if any counter is required from user, the header is
-	 * controlled by bit 0 of the enable mask.
+	/* Unconditionally enable each block header and first counter,
+	 * the header is controlled by bit 0 of the enable mask.
 	 */
-	if (phys_enable_map->fe_bm)
-		phys_enable_map->fe_bm |= 1;
+	phys_enable_map->fe_bm |= 3;
 
-	if (phys_enable_map->tiler_bm)
-		phys_enable_map->tiler_bm |= 1;
+	phys_enable_map->tiler_bm |= 3;
 
-	if (phys_enable_map->mmu_l2_bm)
-		phys_enable_map->mmu_l2_bm |= 1;
+	phys_enable_map->mmu_l2_bm |= 3;
+
+	phys_enable_map->shader_bm |= 3;
+
+	phys_enable_map->fw_bm |= 3;
+
+	phys_enable_map->csg_bm |= 3;
 
-	if (phys_enable_map->shader_bm)
-		phys_enable_map->shader_bm |= 1;
 }
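The switch from a conditional |= 1 to an unconditional |= 3 sets bit 0 (the header) and bit 1 (the first counter) in every block's enable mask. That is what makes the new HWCNT_BLOCK_EMPTY_SAMPLE test meaningful later in the accumulation path: per the patch's own comments, a dumped enable mask with bit 1 clear means the block was unavailable or produced no counter data. A minimal userspace sketch of that predicate (illustrative only, not part of the patch):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define HWCNT_BLOCK_EMPTY_SAMPLE (2) /* same value as the new define above */

/* Sketch only: true when a block's dumped enable mask carries counter data. */
static bool example_block_has_data(uint32_t enable_mask)
{
	return (enable_mask & HWCNT_BLOCK_EMPTY_SAMPLE) != 0;
}

int main(void)
{
	assert(example_block_has_data(0x3));  /* header + first counter enabled */
	assert(!example_block_has_data(0x0)); /* block off or empty sample */
	return 0;
}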
@@ -371,32 +416,35 @@ static void kbasep_hwcnt_backend_csf_init_layout(
 {
 	size_t shader_core_cnt;
 	size_t values_per_block;
-	size_t fw_blocks_count;
-	size_t hw_blocks_count;
+	size_t fw_block_cnt;
+	size_t hw_block_cnt;
+	size_t core_cnt;
+
 	WARN_ON(!prfcnt_info);
 	WARN_ON(!phys_layout);
 
-	shader_core_cnt = fls64(prfcnt_info->core_mask);
+	shader_core_cnt = (size_t)fls64(prfcnt_info->core_mask);
 	values_per_block = prfcnt_info->prfcnt_block_size / KBASE_HWCNT_VALUE_HW_BYTES;
-	fw_blocks_count = div_u64(prfcnt_info->prfcnt_fw_size, prfcnt_info->prfcnt_block_size);
-	hw_blocks_count = div_u64(prfcnt_info->prfcnt_hw_size, prfcnt_info->prfcnt_block_size);
+	fw_block_cnt = div_u64(prfcnt_info->prfcnt_fw_size, prfcnt_info->prfcnt_block_size);
+	hw_block_cnt = div_u64(prfcnt_info->prfcnt_hw_size, prfcnt_info->prfcnt_block_size);
+
+	core_cnt = shader_core_cnt;
 
 	/* The number of hardware counters reported by the GPU matches the legacy guess-work we
 	 * have done in the past
 	 */
-	WARN_ON(hw_blocks_count != KBASE_HWCNT_V5_FE_BLOCK_COUNT +
-					   KBASE_HWCNT_V5_TILER_BLOCK_COUNT +
-					   prfcnt_info->l2_count + shader_core_cnt);
+	WARN_ON(hw_block_cnt != KBASE_HWCNT_V5_FE_BLOCK_COUNT + KBASE_HWCNT_V5_TILER_BLOCK_COUNT +
+					prfcnt_info->l2_count + core_cnt);
 
 	*phys_layout = (struct kbase_hwcnt_csf_physical_layout){
 		.fe_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT,
 		.tiler_cnt = KBASE_HWCNT_V5_TILER_BLOCK_COUNT,
 		.mmu_l2_cnt = prfcnt_info->l2_count,
 		.shader_cnt = shader_core_cnt,
-		.fw_block_cnt = fw_blocks_count,
-		.hw_block_cnt = hw_blocks_count,
-		.block_cnt = fw_blocks_count + hw_blocks_count,
+		.fw_block_cnt = fw_block_cnt,
+		.hw_block_cnt = hw_block_cnt,
+		.block_cnt = fw_block_cnt + hw_block_cnt,
 		.shader_avail_mask = prfcnt_info->core_mask,
 		.headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK,
 		.values_per_block = values_per_block,
@@ -409,10 +457,14 @@ static void
 kbasep_hwcnt_backend_csf_reset_internal_buffers(struct kbase_hwcnt_backend_csf *backend_csf)
 {
 	size_t user_buf_bytes = backend_csf->info->metadata->dump_buf_bytes;
+	size_t block_state_bytes = backend_csf->phys_layout.block_cnt *
+				   KBASE_HWCNT_BLOCK_STATE_BYTES * KBASE_HWCNT_BLOCK_STATE_STRIDE;
 
 	memset(backend_csf->to_user_buf, 0, user_buf_bytes);
 	memset(backend_csf->accum_buf, 0, user_buf_bytes);
 	memset(backend_csf->old_sample_buf, 0, backend_csf->info->prfcnt_info.dump_bytes);
+	memset(backend_csf->block_states, 0, block_state_bytes);
+	memset(backend_csf->to_user_block_states, 0, block_state_bytes);
 }
 
 static void
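To make the arithmetic in kbasep_hwcnt_backend_csf_init_layout() and the new block_state_bytes computation concrete, here is a worked example. The 256-byte block size and the block counts are assumptions chosen for illustration, not values taken from the patch:

#include <assert.h>
#include <stddef.h>

int main(void)
{
	/* Illustrative inputs only. */
	size_t prfcnt_block_size = 256; /* bytes per counter block (assumed) */
	size_t value_hw_bytes = 4;	/* stands in for KBASE_HWCNT_VALUE_HW_BYTES */
	size_t prfcnt_fw_size = 2 * prfcnt_block_size;	/* e.g. 1 FW block + 1 CSG block */
	size_t prfcnt_hw_size = 12 * prfcnt_block_size; /* fe + tiler + L2 slices + cores */

	size_t values_per_block = prfcnt_block_size / value_hw_bytes;
	size_t fw_block_cnt = prfcnt_fw_size / prfcnt_block_size;
	size_t hw_block_cnt = prfcnt_hw_size / prfcnt_block_size;
	size_t block_cnt = fw_block_cnt + hw_block_cnt;

	assert(values_per_block == 64);
	assert(block_cnt == 14);
	return 0;
}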
@@ -450,40 +502,130 @@ kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header(struct kbase_hwcnt_backend_cs
 static void kbasep_hwcnt_backend_csf_update_user_sample(struct kbase_hwcnt_backend_csf *backend_csf)
 {
 	size_t user_buf_bytes = backend_csf->info->metadata->dump_buf_bytes;
+	size_t block_state_bytes = backend_csf->phys_layout.block_cnt *
+				   KBASE_HWCNT_BLOCK_STATE_BYTES * KBASE_HWCNT_BLOCK_STATE_STRIDE;
 
 	/* Copy the data into the sample and wait for the user to get it. */
 	memcpy(backend_csf->to_user_buf, backend_csf->accum_buf, user_buf_bytes);
+	memcpy(backend_csf->to_user_block_states, backend_csf->block_states, block_state_bytes);
 
 	/* After copied data into user sample, clear the accumulator values to
 	 * prepare for the next accumulator, such as the next request or
 	 * threshold.
 	 */
 	memset(backend_csf->accum_buf, 0, user_buf_bytes);
+	memset(backend_csf->block_states, 0, block_state_bytes);
+}
+
+/**
+ * kbasep_hwcnt_backend_csf_update_block_state - Update block state of a block instance with
+ *                                               information from a sample.
+ * @phys_layout:          Physical memory layout information of HWC
+ *                        sample buffer.
+ * @enable_mask:          Counter enable mask for the block whose state is being updated.
+ * @enable_state:         The CSF backend internal enabled state.
+ * @exiting_protm:        Whether or not the sample is taken when the GPU is exiting
+ *                        protected mode.
+ * @block_idx:            Index of block within the ringbuffer.
+ * @block_state:          Pointer to existing block state of the block whose state is being
+ *                        updated.
+ * @fw_in_protected_mode: Whether or not GPU is in protected mode during sampling.
+ */
+static void kbasep_hwcnt_backend_csf_update_block_state(
+	const struct kbase_hwcnt_csf_physical_layout *phys_layout, const u32 enable_mask,
+	enum kbase_hwcnt_backend_csf_enable_state enable_state, bool exiting_protm,
+	size_t block_idx, blk_stt_t *const block_state, bool fw_in_protected_mode)
+{
+	/* Offset of shader core blocks from the start of the HW blocks in the sample */
+	size_t shader_core_block_offset =
+		(size_t)(phys_layout->hw_block_cnt - phys_layout->shader_cnt);
+	bool is_shader_core_block;
+
+	is_shader_core_block = block_idx >= shader_core_block_offset;
+
+	/* Set power bits for the block state for the block, for the sample */
+	switch (enable_state) {
+	/* Disabled states */
+	case KBASE_HWCNT_BACKEND_CSF_DISABLED:
+	case KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED:
+	case KBASE_HWCNT_BACKEND_CSF_DISABLED_WAIT_FOR_WORKER:
+		kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_OFF);
+		break;
+	/* Enabled states */
+	case KBASE_HWCNT_BACKEND_CSF_ENABLED:
+	case KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED:
+		if (!is_shader_core_block)
+			kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_ON);
+		else if (!exiting_protm) {
+			/* When not exiting protected mode, a zero enable mask on a shader core
+			 * counter block indicates the block was powered off for the sample, and
+			 * a non-zero counter enable mask indicates the block was powered on for
+			 * the sample.
+			 */
+			kbase_hwcnt_block_state_append(block_state,
						       (enable_mask ? KBASE_HWCNT_STATE_ON :
								      KBASE_HWCNT_STATE_OFF));
+		}
+		break;
+	/* Error states */
+	case KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR_WAIT_FOR_WORKER:
+	case KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR:
+	default:
+		/* Do nothing */
+		break;
+	}
+
+	/* The following four cases apply to a block state in either normal mode or protected mode:
+	 * 1. GPU executing in normal mode: Only set normal mode bit.
+	 * 2. First sample request after GPU enters protected mode: Set both normal mode and
+	 *    protected mode bit. In this case, there will at least be one sample to accumulate
+	 *    in the ring buffer which was automatically triggered before GPU entered protected
+	 *    mode.
+	 * 3. Subsequent sample requests while GPU remains in protected mode: Only set protected
+	 *    mode bit. In this case, the ring buffer should be empty and dump should return 0s but
+	 *    block state should be updated accordingly. This case is not handled here.
+	 * 4. Samples requested after GPU exits protected mode: Set both protected mode and normal
+	 *    mode bits.
+	 */
+	if (exiting_protm || fw_in_protected_mode)
+		kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_PROTECTED |
+							    KBASE_HWCNT_STATE_NORMAL);
+	else
+		kbase_hwcnt_block_state_append(block_state, KBASE_HWCNT_STATE_NORMAL);
 }
 
 static void kbasep_hwcnt_backend_csf_accumulate_sample(
 	const struct kbase_hwcnt_csf_physical_layout *phys_layout, size_t dump_bytes,
-	u64 *accum_buf, const u32 *old_sample_buf, const u32 *new_sample_buf, bool clearing_samples)
+	u64 *accum_buf, const u32 *old_sample_buf, const u32 *new_sample_buf,
+	blk_stt_t *const block_states, bool clearing_samples,
+	enum kbase_hwcnt_backend_csf_enable_state enable_state, bool fw_in_protected_mode)
 {
 	size_t block_idx;
 	const u32 *old_block = old_sample_buf;
 	const u32 *new_block = new_sample_buf;
 	u64 *acc_block = accum_buf;
+	/* Flag to indicate whether current sample is exiting protected mode. */
+	bool exiting_protm = false;
 	const size_t values_per_block = phys_layout->values_per_block;
 
-	/* Performance counter blocks for firmware are stored before blocks for hardware.
-	 * We skip over the firmware's performance counter blocks (counters dumping is not
-	 * supported for firmware blocks, only hardware ones).
+	/* The block pointers now point to the first HW block, which is always a CSHW/front-end
+	 * block. The counter enable mask for this block can be checked to determine whether this
+	 * sample is taken after leaving protected mode - this is the only scenario where the CSHW
+	 * block counter enable mask has only the first bit set, and no others. In this case,
+	 * the values in this sample would not be meaningful, so they don't need to be accumulated.
 	 */
-	old_block += values_per_block * phys_layout->fw_block_cnt;
-	new_block += values_per_block * phys_layout->fw_block_cnt;
+	exiting_protm = (new_block[phys_layout->enable_mask_offset] == 1);
 
-	for (block_idx = phys_layout->fw_block_cnt; block_idx < phys_layout->block_cnt;
-	     block_idx++) {
+	for (block_idx = 0; block_idx < phys_layout->block_cnt; block_idx++) {
 		const u32 old_enable_mask = old_block[phys_layout->enable_mask_offset];
 		const u32 new_enable_mask = new_block[phys_layout->enable_mask_offset];
+		/* Update block state with information of the current sample */
+		kbasep_hwcnt_backend_csf_update_block_state(phys_layout, new_enable_mask,
+							    enable_state, exiting_protm, block_idx,
							    &block_states[block_idx],
+							    fw_in_protected_mode);
 
-		if (new_enable_mask == 0) {
+		if (!(new_enable_mask & HWCNT_BLOCK_EMPTY_SAMPLE)) {
 			/* Hardware block was unavailable or we didn't turn on
 			 * any counters. Do nothing.
 			 */
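The four protected-mode cases listed in the comment above reduce to a small decision over two booleans: this function implements cases 1, 2 and 4, while case 3 is handled in the empty-ring-buffer path of the accumulation entry point. A reduced sketch of that decision, with stand-in EX_* bit values that are assumptions rather than the driver's real constants:

#include <stdbool.h>
#include <stdint.h>

#define EX_STATE_NORMAL	   (1u << 0) /* stands in for KBASE_HWCNT_STATE_NORMAL */
#define EX_STATE_PROTECTED (1u << 1) /* stands in for KBASE_HWCNT_STATE_PROTECTED */

/* Sketch only: the mode bits appended for a sample. */
static uint32_t example_mode_bits(bool exiting_protm, bool fw_in_protected_mode)
{
	if (exiting_protm || fw_in_protected_mode)
		return EX_STATE_PROTECTED | EX_STATE_NORMAL; /* cases 2 and 4 */
	return EX_STATE_NORMAL; /* case 1 */
}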
@@ -492,7 +634,6 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample(
 			 * enabled. We need to update the accumulation buffer.
 			 */
 			size_t ctr_idx;
-
 			/* Unconditionally copy the headers. */
 			for (ctr_idx = 0; ctr_idx < phys_layout->headers_per_block; ctr_idx++) {
 				acc_block[ctr_idx] = new_block[ctr_idx];
@@ -517,8 +658,8 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample(
 				 * saturating at their maximum value.
 				 */
 				if (!clearing_samples) {
-					if (old_enable_mask == 0) {
-						/* Hardware block was previously
+					if (!(old_enable_mask & HWCNT_BLOCK_EMPTY_SAMPLE)) {
+						/* Block was previously
 						 * unavailable. Accumulate the new
 						 * counters only, as we know previous
 						 * values are zeroes.
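The clearing_samples distinction above comes down to whether each sample already holds a delta or a free-running total. A reduced sketch of the non-saturating arithmetic (the real code additionally guards against counters pegged at their maximum; the names here are illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Sketch only: accumulate one counter value from a new sample. */
static uint64_t example_accumulate(uint64_t acc, uint32_t old_v, uint32_t new_v,
				   bool clearing_samples)
{
	if (clearing_samples)
		return acc + new_v; /* each sample is already a delta */

	/* Free-running counter: unsigned wrap-around subtraction gives the delta. */
	return acc + (uint32_t)(new_v - old_v);
}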
@@ -545,15 +686,14 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample(
 				}
 			}
 		}
+
 		old_block += values_per_block;
 		new_block += values_per_block;
 		acc_block += values_per_block;
 	}
-
 	WARN_ON(old_block != old_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES));
 	WARN_ON(new_block != new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES));
-	WARN_ON(acc_block != accum_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES) -
-				    (values_per_block * phys_layout->fw_block_cnt));
+	WARN_ON(acc_block != accum_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES));
 	(void)dump_bytes;
 }
@@ -569,10 +709,23 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples(struct kbase_hwcnt_backe
 	bool clearing_samples = backend_csf->info->prfcnt_info.clearing_samples;
 	u32 *old_sample_buf = backend_csf->old_sample_buf;
 	u32 *new_sample_buf = old_sample_buf;
+	const struct kbase_hwcnt_csf_physical_layout *phys_layout = &backend_csf->phys_layout;
+
+	if (extract_index_to_start == insert_index_to_stop) {
+		/* No samples to accumulate but block states need to be updated for dump. */
+		size_t block_idx;
 
-	if (extract_index_to_start == insert_index_to_stop)
-		/* No samples to accumulate. Early out. */
+		for (block_idx = 0; block_idx < phys_layout->block_cnt; block_idx++) {
+			/* Set protected mode bit for block state if GPU is in protected mode,
+			 * otherwise set the normal mode bit.
+			 */
+			kbase_hwcnt_block_state_append(&backend_csf->block_states[block_idx],
+						       backend_csf->info->fw_in_protected_mode ?
+							       KBASE_HWCNT_STATE_PROTECTED :
+							       KBASE_HWCNT_STATE_NORMAL);
+		}
 		return;
+	}
 
 	/* Sync all the buffers to CPU side before read the data. */
 	backend_csf->info->csf_if->ring_buf_sync(backend_csf->info->csf_if->ctx,
@@ -587,11 +740,10 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples(struct kbase_hwcnt_backe
 		const u32 buf_idx = raw_idx & (ring_buf_cnt - 1);
 
 		new_sample_buf = (u32 *)&cpu_dump_base[buf_idx * buf_dump_bytes];
-
-		kbasep_hwcnt_backend_csf_accumulate_sample(&backend_csf->phys_layout,
-							   buf_dump_bytes, backend_csf->accum_buf,
-							   old_sample_buf, new_sample_buf,
-							   clearing_samples);
+		kbasep_hwcnt_backend_csf_accumulate_sample(
+			phys_layout, buf_dump_bytes, backend_csf->accum_buf, old_sample_buf,
+			new_sample_buf, backend_csf->block_states, clearing_samples,
+			backend_csf->enable_state, backend_csf->info->fw_in_protected_mode);
 
 		old_sample_buf = new_sample_buf;
 	}
@@ -875,6 +1027,8 @@ kbasep_hwcnt_backend_csf_get_physical_enable(struct kbase_hwcnt_backend_csf *bac
 	enable->shader_bm = phys_enable_map.shader_bm;
 	enable->tiler_bm = phys_enable_map.tiler_bm;
 	enable->mmu_l2_bm = phys_enable_map.mmu_l2_bm;
+	enable->fw_bm = phys_enable_map.fw_bm;
+	enable->csg_bm = phys_enable_map.csg_bm;
 	enable->counter_set = phys_counter_set;
 	enable->clk_enable_map = enable_map->clk_enable_map;
 }
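In the accumulation loop above, buf_idx = raw_idx & (ring_buf_cnt - 1) relies on the ring buffer count being a power of two, so the AND is a cheap modulo on the free-running index. A standalone check of that identity (the values are illustrative):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t ring_buf_cnt = 8; /* assumed power of two */
	uint32_t raw_idx = 13;	   /* free-running extract/insert index */
	uint32_t buf_idx = raw_idx & (ring_buf_cnt - 1);

	assert(buf_idx == raw_idx % ring_buf_cnt);
	return 0;
}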
@@ -893,6 +1047,17 @@ kbasep_hwcnt_backend_csf_dump_enable_nolock(struct kbase_hwcnt_backend *backend,
 
 	backend_csf->info->csf_if->assert_lock_held(backend_csf->info->csf_if->ctx);
 
+	/* Enabling counters is an indication that the power may have previously been off for all
+	 * blocks.
+	 *
+	 * In any case, the counters would not have been counting recently, so an 'off' block state
+	 * is an approximation for this.
+	 *
+	 * This will be transferred to the dump only after a dump_wait(), or dump_disable() in
+	 * cases where the caller requested such information. This is to handle when a
+	 * dump_enable() happens in between dump_wait() and dump_get().
+	 */
+	kbase_hwcnt_block_state_append(&backend_csf->accum_all_blk_stt, KBASE_HWCNT_STATE_OFF);
 
 	kbasep_hwcnt_backend_csf_get_physical_enable(backend_csf, enable_map, &enable);
 
 	/* enable_state should be DISABLED before we transfer it to enabled */
@@ -956,13 +1121,19 @@ static void kbasep_hwcnt_backend_csf_wait_enable_transition_complete(
 }
 
 /* CSF backend implementation of kbase_hwcnt_backend_dump_disable_fn */
-static void kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend)
+static void kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend,
+						  struct kbase_hwcnt_dump_buffer *dump_buffer,
+						  const struct kbase_hwcnt_enable_map *enable_map)
 {
 	unsigned long flags = 0UL;
 	struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend;
 	bool do_disable = false;
 
-	WARN_ON(!backend_csf);
+	if (WARN_ON(!backend_csf ||
+		    (dump_buffer && (backend_csf->info->metadata != dump_buffer->metadata)) ||
+		    (enable_map && (backend_csf->info->metadata != enable_map->metadata)) ||
+		    (dump_buffer && !enable_map)))
+		return;
 
 	backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags);
@@ -1048,6 +1219,42 @@ static void kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *ba
 	 * for next enable.
 	 */
 	kbasep_hwcnt_backend_csf_reset_internal_buffers(backend_csf);
+
+	/* Disabling HWCNT is an indication that blocks have been powered off. This is important to
+	 * know for L2, CSHW, and Tiler blocks, as this is currently the only way a backend can
+	 * know if they are being powered off.
+	 *
+	 * In any case, even if they weren't really powered off, we won't be counting whilst
+	 * disabled.
+	 *
+	 * Update the block state information in the block state accumulator to show this, so that
+	 * in the next dump blocks will have been seen as powered off for some of the time.
+	 */
+	kbase_hwcnt_block_state_append(&backend_csf->accum_all_blk_stt, KBASE_HWCNT_STATE_OFF);
+
+	if (dump_buffer) {
+		/* In some use-cases, the caller will need the information whilst the counters are
+		 * disabled, but will not be able to call into the backend to dump them. Instead,
+		 * they have an opportunity here to request them to be accumulated into their
+		 * buffer immediately.
+		 *
+		 * This consists of taking a sample of the accumulated block state (as though a
+		 * real dump_get() had happened), then transfer ownership of that to the caller
+		 * (i.e. erasing our copy of it).
+		 */
+		kbase_hwcnt_block_state_accumulate(&backend_csf->sampled_all_blk_stt,
+						   &backend_csf->accum_all_blk_stt);
+		kbase_hwcnt_dump_buffer_block_state_update(dump_buffer, enable_map,
+							   backend_csf->sampled_all_blk_stt);
+		/* Now the block state has been passed out into the caller's own accumulation
+		 * buffer, clear our own accumulated and sampled block state - ownership has been
+		 * transferred.
+		 */
+		kbase_hwcnt_block_state_set(&backend_csf->sampled_all_blk_stt,
+					    KBASE_HWCNT_STATE_UNKNOWN);
+		kbase_hwcnt_block_state_set(&backend_csf->accum_all_blk_stt,
+					    KBASE_HWCNT_STATE_UNKNOWN);
+	}
 }
 
 /* CSF backend implementation of kbase_hwcnt_backend_dump_request_fn */
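The dump_buffer branch above implements a sample-then-clear hand-over: accumulate into the sampled copy, publish it to the caller, then reset both internal copies to 'unknown' because ownership has moved. Reduced to plain integers (EX_UNKNOWN and the function name are illustrative stand-ins, not the driver's API):

#include <stdint.h>

#define EX_UNKNOWN 0u /* stands in for KBASE_HWCNT_STATE_UNKNOWN */

/* Sketch only: hand accumulated state to the caller and clear our copies. */
static void example_transfer_state(uint32_t *caller_stt, uint32_t *sampled, uint32_t *accum)
{
	*sampled |= *accum;	 /* accumulate into the sampled copy */
	*caller_stt |= *sampled; /* publish combined state to the caller */
	*sampled = EX_UNKNOWN;	 /* ownership transferred ... */
	*accum = EX_UNKNOWN;	 /* ... so erase both internal copies */
}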
@@ -1183,6 +1390,16 @@ static int kbasep_hwcnt_backend_csf_dump_wait(struct kbase_hwcnt_backend *backen
 
 	backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags);
 
+	/* Now that we've completed a sample, also sample+clear the accumulated block state.
+	 *
+	 * This is to ensure that a dump_enable() that happens in between dump_wait() and
+	 * dump_get() is reported on the _next_ dump, not the _current_ dump. That is, the block
+	 * state is reported at the actual time that counters are being sampled.
+	 */
+	kbase_hwcnt_block_state_accumulate(&backend_csf->sampled_all_blk_stt,
+					   &backend_csf->accum_all_blk_stt);
+	kbase_hwcnt_block_state_set(&backend_csf->accum_all_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
+
 	return errcode;
 }
@@ -1223,8 +1440,7 @@ static int kbasep_hwcnt_backend_csf_dump_get(struct kbase_hwcnt_backend *backend
 		return -EINVAL;
 
 	/* Extract elapsed cycle count for each clock domain if enabled. */
-	kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk)
-	{
+	kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) {
 		if (!kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk))
 			continue;
@@ -1238,7 +1454,20 @@ static int kbasep_hwcnt_backend_csf_dump_get(struct kbase_hwcnt_backend *backend
 	 * as it is undefined to call this function without a prior succeeding
 	 * one to dump_wait().
 	 */
-	ret = kbase_hwcnt_csf_dump_get(dst, backend_csf->to_user_buf, dst_enable_map, accumulate);
+	ret = kbase_hwcnt_csf_dump_get(dst, backend_csf->to_user_buf,
+				       backend_csf->to_user_block_states, dst_enable_map,
+				       backend_csf->num_l2_slices,
+				       backend_csf->shader_present_bitmap, accumulate);
+
+	/* If no error occurred (zero ret value), then update block state for all blocks in the
+	 * accumulation with the current sample's block state.
+	 */
+	if (!ret) {
+		kbase_hwcnt_dump_buffer_block_state_update(dst, dst_enable_map,
+							   backend_csf->sampled_all_blk_stt);
+		kbase_hwcnt_block_state_set(&backend_csf->sampled_all_blk_stt,
+					    KBASE_HWCNT_STATE_UNKNOWN);
+	}
 
 	return ret;
 }
@@ -1269,6 +1498,12 @@ static void kbasep_hwcnt_backend_csf_destroy(struct kbase_hwcnt_backend_csf *bac
 	kfree(backend_csf->to_user_buf);
 	backend_csf->to_user_buf = NULL;
 
+	kfree(backend_csf->block_states);
+	backend_csf->block_states = NULL;
+
+	kfree(backend_csf->to_user_block_states);
+	backend_csf->to_user_block_states = NULL;
+
 	kfree(backend_csf);
 }
@@ -1285,6 +1520,7 @@ static int kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *
 {
 	struct kbase_hwcnt_backend_csf *backend_csf = NULL;
 	int errcode = -ENOMEM;
+	size_t block_state_bytes;
 
 	WARN_ON(!csf_info);
 	WARN_ON(!out_backend);
@@ -1308,6 +1544,17 @@ static int kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *
 	if (!backend_csf->to_user_buf)
 		goto err_alloc_user_sample_buf;
 
+	/* Allocate space to store block state values for each block */
+	block_state_bytes = backend_csf->phys_layout.block_cnt * KBASE_HWCNT_BLOCK_STATE_BYTES *
+			    KBASE_HWCNT_BLOCK_STATE_STRIDE;
+	backend_csf->block_states = kzalloc(block_state_bytes, GFP_KERNEL);
+	if (!backend_csf->block_states)
+		goto err_alloc_block_states_buf;
+
+	backend_csf->to_user_block_states = kzalloc(block_state_bytes, GFP_KERNEL);
+	if (!backend_csf->to_user_block_states)
+		goto err_alloc_user_block_state_buf;
+
 	errcode = csf_info->csf_if->ring_buf_alloc(csf_info->csf_if->ctx, csf_info->ring_buf_cnt,
 						   &backend_csf->ring_buf_cpu_base,
 						   &backend_csf->ring_buf);
@@ -1343,6 +1590,8 @@ static int kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *
 	complete_all(&backend_csf->dump_completed);
 	backend_csf->user_requested = false;
 	backend_csf->watchdog_last_seen_insert_idx = 0;
+	kbase_hwcnt_block_state_set(&backend_csf->accum_all_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
+	kbase_hwcnt_block_state_set(&backend_csf->sampled_all_blk_stt, KBASE_HWCNT_STATE_UNKNOWN);
 
 	*out_backend = backend_csf;
 	return 0;
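The new error labels in the next hunk follow the usual kernel unwind idiom: each goto target frees exactly what was allocated before the failure point, in reverse order, which is why err_alloc_user_block_state_buf sits between the two new kfree() pairs. A minimal shape of the idiom (illustrative only; userspace malloc stands in for kzalloc):

#include <stdlib.h>

/* Sketch only: allocate a then b; on failure, unwind in reverse order. */
static int example_create(void **a, void **b)
{
	*a = malloc(16);
	if (!*a)
		goto err_a;

	*b = malloc(16);
	if (!*b)
		goto err_b;

	return 0;

err_b:
	free(*a);
	*a = NULL;
err_a:
	return -1; /* -ENOMEM in the real code */
}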
@@ -1351,6 +1600,12 @@ err_alloc_workqueue:
 	backend_csf->info->csf_if->ring_buf_free(backend_csf->info->csf_if->ctx,
 						 backend_csf->ring_buf);
 err_ring_buf_alloc:
+	kfree(backend_csf->to_user_block_states);
+	backend_csf->to_user_block_states = NULL;
+err_alloc_user_block_state_buf:
+	kfree(backend_csf->block_states);
+	backend_csf->block_states = NULL;
+err_alloc_block_states_buf:
 	kfree(backend_csf->to_user_buf);
 	backend_csf->to_user_buf = NULL;
 err_alloc_user_sample_buf:
@@ -1417,7 +1672,7 @@ static void kbasep_hwcnt_backend_csf_term(struct kbase_hwcnt_backend *backend)
 	if (!backend)
 		return;
 
-	kbasep_hwcnt_backend_csf_dump_disable(backend);
+	kbasep_hwcnt_backend_csf_dump_disable(backend, NULL, NULL);
 
 	/* Set the backend in csf_info to NULL so we won't handle any external
 	 * notification anymore since we are terminating.
@@ -1828,7 +2083,21 @@ int kbase_hwcnt_backend_csf_metadata_init(struct kbase_hwcnt_backend_interface *
 	if (csf_info->prfcnt_info.clk_cnt > BASE_MAX_NR_CLOCKS_REGULATORS)
 		return -EIO;
 
+	/* We should reject initializing the metadata for any malformed
+	 * firmware size. The legitimate firmware sizes are as follows:
+	 * 1. fw_size == 0 on older GPUs
+	 * 2. fw_size == block_size on GPUs that support FW counters but not CSG counters
+	 * 3. fw_size == (1 + #CSG) * block_size on GPUs that support CSG counters
+	 */
+	if ((csf_info->prfcnt_info.prfcnt_fw_size != 0) &&
+	    (csf_info->prfcnt_info.prfcnt_fw_size != csf_info->prfcnt_info.prfcnt_block_size) &&
+	    (csf_info->prfcnt_info.prfcnt_fw_size !=
+	     ((csf_info->prfcnt_info.csg_count + 1) * csf_info->prfcnt_info.prfcnt_block_size)))
+		return -EINVAL;
+
+	gpu_info.has_fw_counters = csf_info->prfcnt_info.prfcnt_fw_size > 0;
 	gpu_info.l2_count = csf_info->prfcnt_info.l2_count;
+	gpu_info.csg_cnt = csf_info->prfcnt_info.csg_count;
 	gpu_info.core_mask = csf_info->prfcnt_info.core_mask;
 	gpu_info.clk_cnt = csf_info->prfcnt_info.clk_cnt;
 	gpu_info.prfcnt_values_per_block =
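The firmware-size validation above accepts exactly three shapes of prfcnt_fw_size. As a standalone predicate (illustrative names; csg_count mirrors prfcnt_info.csg_count, and the 256-byte block size in the checks is an assumption):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Sketch only: the three legitimate firmware counter buffer sizes. */
static bool example_fw_size_valid(uint32_t fw_size, uint32_t block_size, uint32_t csg_count)
{
	return fw_size == 0 ||		/* 1. older GPUs, no FW counters */
	       fw_size == block_size || /* 2. FW counters, no CSG counters */
	       fw_size == (csg_count + 1) * block_size; /* 3. global FW + per-CSG */
}

int main(void)
{
	assert(example_fw_size_valid(0, 256, 8));
	assert(example_fw_size_valid(256, 256, 8));
	assert(example_fw_size_valid(9 * 256, 256, 8));
	assert(!example_fw_size_valid(5 * 256, 256, 8));
	return 0;
}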