/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
 *
 * (C) COPYRIGHT 2018-2023 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
 * Foundation, and any use by you of this program is subject to the terms
 * of such GNU license.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 */

#ifndef _KBASE_HWCNT_GPU_H_
#define _KBASE_HWCNT_GPU_H_

#include "hwcnt/mali_kbase_hwcnt_types.h"

#include <linux/bug.h>
#include <linux/types.h>

struct kbase_device;
struct kbase_hwcnt_metadata;
struct kbase_hwcnt_enable_map;
struct kbase_hwcnt_dump_buffer;

/* Hardware counter version 5 definitions, V5 is the only supported version. */
#define KBASE_HWCNT_V5_BLOCK_TYPE_COUNT 7
#define KBASE_HWCNT_V5_HEADERS_PER_BLOCK 4
#define KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK 60
#define KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK \
	(KBASE_HWCNT_V5_HEADERS_PER_BLOCK + KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK)

/* FrontEnd block count in V5 GPU hardware counter. */
#define KBASE_HWCNT_V5_FE_BLOCK_COUNT 1
/* Tiler block count in V5 GPU hardware counter. */
#define KBASE_HWCNT_V5_TILER_BLOCK_COUNT 1
/* Index of the PRFCNT_EN header into a V5 counter block */
#define KBASE_HWCNT_V5_PRFCNT_EN_HEADER 2

/* Number of bytes for each counter value in hardware. */
#define KBASE_HWCNT_VALUE_HW_BYTES (sizeof(u32))
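/*
 * Illustrative sizing arithmetic (a sketch, not part of the driver API):
 * with the default layout above, one block holds 4 headers + 60 counters =
 * KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK (64) values, each
 * KBASE_HWCNT_VALUE_HW_BYTES (4) bytes wide in the hardware dump, i.e.
 * 256 bytes per block:
 *
 *	size_t block_bytes = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK *
 *			     KBASE_HWCNT_VALUE_HW_BYTES;
 */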
/**
 * enum kbase_hwcnt_gpu_v5_block_type - GPU V5 hardware counter block types,
 *                                      used to identify metadata blocks.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE:               Front End block (Job manager
 *                                                       or CSF HW).
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2:              Secondary Front End block (Job
 *                                                       manager or CSF HW).
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3:              Tertiary Front End block (Job
 *                                                       manager or CSF HW).
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED:     Undefined Front End block
 *                                                       (e.g. if a counter set that
 *                                                       a block doesn't support is
 *                                                       used).
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER:            Tiler block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED:  Undefined Tiler block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC:               Shader Core block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2:              Secondary Shader Core block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3:              Tertiary Shader Core block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED:     Undefined Shader Core block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS:           Memsys block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2:          Secondary Memsys block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED: Undefined Memsys block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW:               FW block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW2:              Secondary FW block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW3:              Tertiary FW block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW_UNDEFINED:     Undefined FW block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG:              CSG block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG2:             Secondary CSG block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG3:             Tertiary CSG block.
 * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG_UNDEFINED:    Undefined CSG block.
 */
enum kbase_hwcnt_gpu_v5_block_type {
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW2,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW3,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW_UNDEFINED,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG2,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG3,
	KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG_UNDEFINED,
};

/**
 * enum kbase_hwcnt_set - GPU hardware counter sets
 * @KBASE_HWCNT_SET_PRIMARY:   The Primary set of counters
 * @KBASE_HWCNT_SET_SECONDARY: The Secondary set of counters
 * @KBASE_HWCNT_SET_TERTIARY:  The Tertiary set of counters
 * @KBASE_HWCNT_SET_UNDEFINED: Undefined set of counters
 */
enum kbase_hwcnt_set {
	KBASE_HWCNT_SET_PRIMARY,
	KBASE_HWCNT_SET_SECONDARY,
	KBASE_HWCNT_SET_TERTIARY,
	KBASE_HWCNT_SET_UNDEFINED = 255,
};

/**
 * struct kbase_hwcnt_physical_enable_map - Representation of enable map
 *                                          directly used by GPU.
 * @fe_bm:     Front end (JM/CSHW) counters selection bitmask.
 * @shader_bm: Shader counters selection bitmask.
 * @tiler_bm:  Tiler counters selection bitmask.
 * @mmu_l2_bm: MMU_L2 counters selection bitmask.
 * @fw_bm:     CSF firmware counters selection bitmask.
 * @csg_bm:    CSF CSG counters selection bitmask.
 */
struct kbase_hwcnt_physical_enable_map {
	u32 fe_bm;
	u32 shader_bm;
	u32 tiler_bm;
	u32 mmu_l2_bm;
	u32 fw_bm;
	u32 csg_bm;
};

/**
 * struct kbase_hwcnt_enable_cm - 128-bit enable counter masks.
 * @fe_bm:     Front end (JM/CSHW) counters selection bitmask.
 * @shader_bm: Shader counters selection bitmask.
 * @tiler_bm:  Tiler counters selection bitmask.
 * @mmu_l2_bm: MMU_L2 counters selection bitmask.
 * @fw_bm:     CSF firmware counters selection bitmask.
 * @csg_bm:    CSF CSG counters selection bitmask.
 */
struct kbase_hwcnt_enable_cm {
	u64 fe_bm[2];
	u64 shader_bm[2];
	u64 tiler_bm[2];
	u64 mmu_l2_bm[2];
	u64 fw_bm[2];
	u64 csg_bm[2];
};

/*
 * Values for Hardware Counter SET_SELECT value.
 * Directly passed to HW.
 */
enum kbase_hwcnt_physical_set {
	KBASE_HWCNT_PHYSICAL_SET_PRIMARY = 0,
	KBASE_HWCNT_PHYSICAL_SET_SECONDARY = 1,
	KBASE_HWCNT_PHYSICAL_SET_TERTIARY = 2,
};

/**
 * struct kbase_hwcnt_gpu_info - Information about hwcnt blocks on the GPUs.
 * @l2_count:                L2 cache count.
 * @core_mask:               Shader core mask. May be sparse.
 * @clk_cnt:                 Number of clock domains available.
 * @csg_cnt:                 Number of CSGs available.
 * @prfcnt_values_per_block: Total entries (header + counters) of performance
 *                           counter per block.
 * @has_fw_counters:         Whether the GPU has FW counters available.
 */
struct kbase_hwcnt_gpu_info {
	size_t l2_count;
	u64 core_mask;
	u8 clk_cnt;
	u8 csg_cnt;
	size_t prfcnt_values_per_block;
	bool has_fw_counters;
};
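/*
 * Illustrative sketch of how the 128-bit masks above are laid out, assuming
 * (consistently with the lo/hi split taken by
 * kbase_hwcnt_backend_gpu_block_map_to_physical() below) that index 0 holds
 * the low 64 bits and index 1 the high 64 bits:
 *
 *	struct kbase_hwcnt_enable_cm cm = { 0 };
 *
 *	cm.fe_bm[0] = GENMASK_ULL(63, 0);	// FE counters 0..63
 *	cm.fe_bm[1] = BIT_ULL(0);		// FE counter 64
 */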
/**
 * struct kbase_hwcnt_curr_config - Current configuration of HW allocated to
 *                                  the GPU.
 * @num_l2_slices:  Current number of L2 slices allocated to the GPU.
 * @shader_present: Current shader present bitmap that is allocated to the GPU.
 *
 * For architectures with the max_config interface available from the Arbiter,
 * the resources allocated to the GPU may change at runtime due to a
 * re-partitioning (possible with partition manager). The HWC code must
 * therefore be prepared to report any possible set of counters. For this
 * reason, the memory layout seen by userspace is based on the maximum possible
 * allocation, while each partition only has a view of its currently allocated
 * resources. The dumped HWC values must therefore be correctly mapped from the
 * registers into this maximum memory layout so that they can be exposed to
 * userspace correctly.
 *
 * For the L2 cache, the slice count alone is sufficient, since the allocated
 * slices are accumulated into the first available L2 slots in the destination
 * buffer.
 *
 * For the correct mapping of the shader cores, it is necessary to skip over
 * all the L2 cache slots in the destination buffer that are not allocated.
 * However, no extra logic is needed to map the shader core bitmap into the
 * memory layout, because the allocated shader_present will always be a subset
 * of the maximum shader_present. This holds because:
 * 1 - Partitions are made of slices, and slices are always ordered from the
 *     ones with more shader cores to the ones with fewer.
 * 2 - The shader cores in a slice are always contiguous.
 * 3 - A partition can only have a contiguous set of slices allocated to it.
 * So, for example, suppose 4 slices are available in total: 1 with 4 cores,
 * 2 with 3 cores, and 1 with 2 cores. The maximum possible shader_present
 * would be:
 * 0b0011|0111|0111|1111 -> note the order, and that the shader cores are
 * contiguous in every slice.
 * Supposing that a partition takes the two slices in the middle, the current
 * config shader_present for this partition would be:
 * 0b0111|0111 -> note that this is a subset of the maximum above, and that
 * the slices are contiguous.
 * Therefore, directly copying any subset of the maximum possible
 * shader_present already achieves the mapping.
 */
struct kbase_hwcnt_curr_config {
	size_t num_l2_slices;
	u64 shader_present;
};
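/*
 * Worked example of the subset property described above (an illustrative
 * sketch only; the values are taken from the comment, the variable names are
 * hypothetical):
 *
 *	const u64 max_shader_present = 0x377F;	// 0b0011|0111|0111|1111
 *	const u64 curr_shader_present = 0x77;	// 0b0111|0111, middle slices
 *
 *	// The current mask is a bitwise subset of the maximum, so the dumped
 *	// shader core blocks can be copied directly into the maximum layout.
 *	WARN_ON((curr_shader_present & max_shader_present) != curr_shader_present);
 */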
/**
 * kbase_hwcnt_is_block_type_undefined() - Check if a block type is undefined.
 *
 * @blk_type: Hardware counter block type.
 *
 * Return: true if the block type is undefined, else false.
 */
static inline bool kbase_hwcnt_is_block_type_undefined(const uint64_t blk_type)
{
	return (blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE_UNDEFINED ||
		blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER_UNDEFINED ||
		blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC_UNDEFINED ||
		blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS_UNDEFINED ||
		blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FW_UNDEFINED ||
		blk_type == KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_CSG_UNDEFINED);
}

/**
 * kbase_hwcnt_jm_metadata_create() - Create hardware counter metadata for the
 *                                    JM GPUs.
 * @info:           Non-NULL pointer to info struct.
 * @counter_set:    The performance counter set used.
 * @out_metadata:   Non-NULL pointer to where created metadata is stored on
 *                  success.
 * @out_dump_bytes: Non-NULL pointer to where the size of the GPU counter dump
 *                  buffer is stored on success.
 *
 * Return: 0 on success, else error code.
 */
int kbase_hwcnt_jm_metadata_create(const struct kbase_hwcnt_gpu_info *info,
				   enum kbase_hwcnt_set counter_set,
				   const struct kbase_hwcnt_metadata **out_metadata,
				   size_t *out_dump_bytes);

/**
 * kbase_hwcnt_jm_metadata_destroy() - Destroy JM GPU hardware counter metadata.
 *
 * @metadata: Pointer to metadata to destroy.
 */
void kbase_hwcnt_jm_metadata_destroy(const struct kbase_hwcnt_metadata *metadata);

/**
 * kbase_hwcnt_csf_metadata_create() - Create hardware counter metadata for the
 *                                     CSF GPUs.
 * @info:         Non-NULL pointer to info struct.
 * @counter_set:  The performance counter set used.
 * @out_metadata: Non-NULL pointer to where created metadata is stored on
 *                success.
 *
 * Return: 0 on success, else error code.
 */
int kbase_hwcnt_csf_metadata_create(const struct kbase_hwcnt_gpu_info *info,
				    enum kbase_hwcnt_set counter_set,
				    const struct kbase_hwcnt_metadata **out_metadata);

/**
 * kbase_hwcnt_csf_metadata_destroy() - Destroy CSF GPU hardware counter
 *                                      metadata.
 * @metadata: Pointer to metadata to destroy.
 */
void kbase_hwcnt_csf_metadata_destroy(const struct kbase_hwcnt_metadata *metadata);
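/*
 * Typical create/use/destroy flow for the metadata API above (an illustrative
 * sketch; the kbase_hwcnt_gpu_info values are hypothetical):
 *
 *	const struct kbase_hwcnt_metadata *metadata;
 *	size_t dump_bytes;
 *	const struct kbase_hwcnt_gpu_info info = {
 *		.l2_count = 2,
 *		.core_mask = 0xF,
 *		.clk_cnt = 1,
 *		.prfcnt_values_per_block = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK,
 *	};
 *	int err;
 *
 *	err = kbase_hwcnt_jm_metadata_create(&info, KBASE_HWCNT_SET_PRIMARY,
 *					     &metadata, &dump_bytes);
 *	if (err)
 *		return err;
 *	// ... allocate dump buffers of dump_bytes bytes, perform dumps ...
 *	kbase_hwcnt_jm_metadata_destroy(metadata);
 */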
/**
 * kbase_hwcnt_jm_dump_get() - Copy or accumulate enabled counters from the raw
 *                             dump buffer in src into the dump buffer
 *                             abstraction in dst.
 * @dst:             Non-NULL pointer to destination dump buffer.
 * @src:             Non-NULL pointer to source raw dump buffer, of same length
 *                   as dump_buf_bytes in the metadata of destination dump
 *                   buffer.
 * @dst_enable_map:  Non-NULL pointer to enable map specifying enabled values.
 * @pm_core_mask:    Shader core mask synchronized with the PM state at dump
 *                   time.
 * @debug_core_mask: User-set mask of cores to be used by the GPU.
 * @max_core_mask:   Core mask of all cores allocated to the GPU
 *                   (non-virtualized platforms) or resource group (virtualized
 *                   platforms).
 * @max_l2_slices:   Maximum number of L2 slices allocated to the GPU
 *                   (non-virtualized platforms) or resource group (virtualized
 *                   platforms).
 * @curr_config:     Current allocated hardware resources to correctly map the
 *                   source raw dump buffer to the destination dump buffer.
 * @accumulate:      True if counters in source should be accumulated into
 *                   destination, rather than copied.
 *
 * The dst and dst_enable_map MUST have been created from the same metadata as
 * returned from the call to kbase_hwcnt_jm_metadata_create as was used to get
 * the length of src.
 *
 * Return: 0 on success, else error code.
 */
int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
			    const struct kbase_hwcnt_enable_map *dst_enable_map,
			    const u64 pm_core_mask, u64 debug_core_mask, u64 max_core_mask,
			    size_t max_l2_slices,
			    const struct kbase_hwcnt_curr_config *curr_config, bool accumulate);

/**
 * kbase_hwcnt_csf_dump_get() - Copy or accumulate enabled counters from the raw
 *                              dump buffer in src into the dump buffer
 *                              abstraction in dst.
 * @dst:                   Non-NULL pointer to destination dump buffer.
 * @src:                   Non-NULL pointer to source raw dump buffer, of same
 *                         length as dump_buf_bytes in the metadata of dst dump
 *                         buffer.
 * @src_block_stt:         Non-NULL pointer to source block state buffer.
 * @dst_enable_map:        Non-NULL pointer to enable map specifying enabled
 *                         values.
 * @num_l2_slices:         Current number of L2 slices allocated to the GPU.
 * @shader_present_bitmap: Current shader-present bitmap that is allocated to
 *                         the GPU.
 * @accumulate:            True if counters in src should be accumulated into
 *                         destination, rather than copied.
 *
 * The dst and dst_enable_map MUST have been created from the same metadata as
 * returned from the call to kbase_hwcnt_csf_metadata_create as was used to get
 * the length of src.
 *
 * Return: 0 on success, else error code.
 */
int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src,
			     blk_stt_t *src_block_stt,
			     const struct kbase_hwcnt_enable_map *dst_enable_map,
			     size_t num_l2_slices, u64 shader_present_bitmap, bool accumulate);

/**
 * kbase_hwcnt_backend_gpu_block_map_to_physical() - Convert from a block
 *                                                   enable map abstraction to
 *                                                   a physical block enable
 *                                                   map.
 * @lo: Low 64 bits of block enable map abstraction.
 * @hi: High 64 bits of block enable map abstraction.
 *
 * The abstraction uses 128 bits to enable 128 block values, whereas the
 * physical uses just 32 bits, as bit n enables values [n*4, n*4+3].
 * Therefore, this conversion is lossy.
 *
 * Return: 32-bit physical block enable map.
 */
static inline u32 kbase_hwcnt_backend_gpu_block_map_to_physical(u64 lo, u64 hi)
{
	u32 phys = 0;
	u64 dwords[2] = { lo, hi };
	size_t dword_idx;

	for (dword_idx = 0; dword_idx < 2; dword_idx++) {
		const u64 dword = dwords[dword_idx];
		u16 packed = 0;
		size_t hword_bit;

		/* Set packed bit n if any of the abstraction bits
		 * [n*4, n*4+3] in this dword are set.
		 */
		for (hword_bit = 0; hword_bit < 16; hword_bit++) {
			const size_t dword_bit = hword_bit * 4;
			const u16 mask = ((dword >> (dword_bit + 0)) & 0x1) |
					 ((dword >> (dword_bit + 1)) & 0x1) |
					 ((dword >> (dword_bit + 2)) & 0x1) |
					 ((dword >> (dword_bit + 3)) & 0x1);
			packed |= (mask << hword_bit);
		}
		phys |= ((u32)packed) << (16 * dword_idx);
	}
	return phys;
}
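/*
 * Illustration of the lossy conversion above (a sketch, not driver code):
 * enabling only counter 5 in the abstraction sets physical bit 1, which on
 * re-expansion enables the whole group of counters 4..7.
 *
 *	u32 phys = kbase_hwcnt_backend_gpu_block_map_to_physical(1ULL << 5, 0);
 *	// phys == 0x2: bit 1 covers values [4, 7], not just counter 5.
 */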
/**
 * kbase_hwcnt_gpu_enable_map_to_physical() - Convert an enable map abstraction
 *                                            into a physical enable map.
 * @dst: Non-NULL pointer to destination physical enable map.
 * @src: Non-NULL pointer to source enable map abstraction.
 *
 * The src must have been created from a metadata returned from a call to
 * kbase_hwcnt_jm_metadata_create or kbase_hwcnt_csf_metadata_create.
 *
 * This is a lossy conversion, as the enable map abstraction has one bit per
 * individual counter block value, but the physical enable map uses 1 bit for
 * every 4 counters, shared over all instances of a block.
 */
void kbase_hwcnt_gpu_enable_map_to_physical(struct kbase_hwcnt_physical_enable_map *dst,
					    const struct kbase_hwcnt_enable_map *src);

/**
 * kbase_hwcnt_gpu_set_to_physical() - Map counter set selection to physical
 *                                     SET_SELECT value.
 *
 * @dst: Non-NULL pointer to destination physical SET_SELECT value.
 * @src: Source counter set selection.
 */
void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, enum kbase_hwcnt_set src);

/**
 * kbase_hwcnt_gpu_enable_map_from_physical() - Convert a physical enable map to
 *                                              an enable map abstraction.
 * @dst: Non-NULL pointer to destination enable map abstraction.
 * @src: Non-NULL pointer to source physical enable map.
 *
 * The dst must have been created from a metadata returned from a call to
 * kbase_hwcnt_jm_metadata_create or kbase_hwcnt_csf_metadata_create.
 *
 * This is a lossy conversion, as the physical enable map can technically
 * support counter blocks with 128 counters each, but no hardware actually uses
 * more than 64, so the enable map abstraction has nowhere to store the enable
 * information for the 64 non-existent counters.
 */
void kbase_hwcnt_gpu_enable_map_from_physical(struct kbase_hwcnt_enable_map *dst,
					      const struct kbase_hwcnt_physical_enable_map *src);

/**
 * kbase_hwcnt_gpu_patch_dump_headers() - Patch all the performance counter
 *                                        enable headers in a dump buffer to
 *                                        reflect the specified enable map.
 * @buf:        Non-NULL pointer to dump buffer to patch.
 * @enable_map: Non-NULL pointer to enable map.
 *
 * The buf and enable_map must have been created from a metadata returned from
 * a call to kbase_hwcnt_jm_metadata_create or kbase_hwcnt_csf_metadata_create.
 *
 * This function should be used before handing off a dump buffer over the
 * kernel-user boundary, to ensure the header is accurate for the enable map
 * used by the user.
 */
void kbase_hwcnt_gpu_patch_dump_headers(struct kbase_hwcnt_dump_buffer *buf,
					const struct kbase_hwcnt_enable_map *enable_map);

/**
 * kbase_hwcnt_is_block_type_shader() - Check if a block type is a Shader Core
 *                                      block.
 * @blk_type: Hardware counter block type.
 *
 * Return: true if the block type is a Shader Core block, else false.
 */
bool kbase_hwcnt_is_block_type_shader(const enum kbase_hwcnt_gpu_v5_block_type blk_type);

/**
 * kbase_hwcnt_is_block_type_memsys() - Check if a block type is a Memsys block.
 * @blk_type: Hardware counter block type.
 *
 * Return: true if the block type is a Memsys block, else false.
 */
bool kbase_hwcnt_is_block_type_memsys(const enum kbase_hwcnt_gpu_v5_block_type blk_type);

/**
 * kbase_hwcnt_is_block_type_tiler() - Check if a block type is a Tiler block.
 * @blk_type: Hardware counter block type.
 *
 * Return: true if the block type is a Tiler block, else false.
 */
bool kbase_hwcnt_is_block_type_tiler(const enum kbase_hwcnt_gpu_v5_block_type blk_type);

/**
 * kbase_hwcnt_is_block_type_fe() - Check if a block type is a Front End block.
 * @blk_type: Hardware counter block type.
 *
 * Return: true if the block type is a Front End block, else false.
 */
bool kbase_hwcnt_is_block_type_fe(const enum kbase_hwcnt_gpu_v5_block_type blk_type);

/**
 * kbase_hwcnt_gpu_enable_map_from_cm() - Build an enable map abstraction from
 *                                        counter selection bitmasks.
 * @dst: Non-NULL pointer to destination enable map abstraction.
 * @src: Non-NULL pointer to source counter selection bitmasks.
 *
 * The dst must have been created from a metadata returned from a call to
 * kbase_hwcnt_jm_metadata_create or kbase_hwcnt_csf_metadata_create.
 */
void kbase_hwcnt_gpu_enable_map_from_cm(struct kbase_hwcnt_enable_map *dst,
					const struct kbase_hwcnt_enable_cm *src);

#endif /* _KBASE_HWCNT_GPU_H_ */