author     Jesse Hall <jessehall@google.com>   2021-11-23 14:38:46 -0800
committer  Jesse Hall <jessehall@google.com>   2021-11-23 14:38:46 -0800
commit     0c596dc70431fa2c70021fa1685e3efc969a852d
tree       8c6cfe8da5d3bea214e991cc4438988f65d9081e
parent     bbbb1cf6bb211bb2094dd66656966277c326867f
download   gpu-0c596dc70431fa2c70021fa1685e3efc969a852d.tar.gz
Mali Valhall Android DDK r34p0-00dev1
Provenance:
046d23c969 (collaborate/google/android/v_r34p0-00dev1)
VX504X08X-BU-00000-r34p0-00dev1 - Valhall Android DDK
VX504X08X-SW-99006-r34p0-00dev1 - Valhall Android Renderscript AOSP parts
Documentation from VX504X08X-BU-00000 omitted.
Signed-off-by: Jesse Hall <jessehall@google.com>
Change-Id: I4ebbb3a3af709bd39f883eed3b35bf4657a95797
140 files changed, 9841 insertions, 2569 deletions
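
Both uAPI version-history entries added in this drop (CSF 1.6 and JM 11.32) describe the same headline change: the new HW performance counters interface exposed through KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO and KBASE_IOCTL_KINSTR_PRFCNT_SETUP further down in the diff. As an orientation aid only (not part of the patch), the sketch below shows the two-phase enumeration pattern documented in the kbase_ioctl_kinstr_prfcnt_enum_info kernel-doc: a first call with info_list_ptr == NULL to learn the item size and count, then a second call with a buffer to receive the prfcnt_enum_item list. The /dev/mali0 node, the local copies of the uAPI headers, and the omission of the usual kbase version-check handshake are all assumptions here; a real client would follow up with KBASE_IOCTL_KINSTR_PRFCNT_SETUP, which returns a new fd on success.

/*
 * Minimal userspace sketch of the two-phase ENUM_INFO call (assumptions:
 * /dev/mali0 device node, headers copied locally from
 * common/include/uapi/gpu/arm/midgard/, no version-check handshake shown).
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include "mali_kbase_ioctl.h"        /* KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO */
#include "mali_kbase_hwcnt_reader.h" /* struct prfcnt_enum_item */

int main(void)
{
	struct kbase_ioctl_kinstr_prfcnt_enum_info enum_info = { 0 };
	struct prfcnt_enum_item *items;
	int fd = open("/dev/mali0", O_RDWR); /* assumed device node */

	if (fd < 0)
		return 1;

	/* Pass 1: info_list_ptr is NULL, kernel fills item size and count. */
	if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO, &enum_info) < 0)
		return 1;

	items = calloc(enum_info.info_item_count, enum_info.info_item_size);
	if (!items)
		return 1;
	enum_info.info_list_ptr = (__u64)(uintptr_t)items;

	/* Pass 2: same sizes, non-NULL pointer, kernel copies the enum list. */
	if (ioctl(fd, KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO, &enum_info) < 0)
		return 1;

	printf("%u enumeration items of %u bytes each\n",
	       enum_info.info_item_count, enum_info.info_item_size);
	free(items);
	return 0;
}
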
diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h b/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h index 78c328c..f5f859e 100644 --- a/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_base_csf_kernel.h @@ -186,17 +186,17 @@ #define BASE_MEM_FLAGS_RESERVED \ BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_20 -#define BASEP_MEM_INVALID_HANDLE (0ull << 12) -#define BASE_MEM_MMU_DUMP_HANDLE (1ull << 12) -#define BASE_MEM_TRACE_BUFFER_HANDLE (2ull << 12) -#define BASE_MEM_MAP_TRACKING_HANDLE (3ull << 12) -#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ull << 12) +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) /* reserved handles ..-47<<PAGE_SHIFT> for future special handles */ -#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << 12) -#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << 12) -#define BASE_MEM_COOKIE_BASE (64ul << 12) -#define BASE_MEM_FIRST_FREE_ADDRESS ((BITS_PER_LONG << 12) + \ - BASE_MEM_COOKIE_BASE) +#define BASEP_MEM_CSF_USER_REG_PAGE_HANDLE (47ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_CSF_USER_IO_PAGES_HANDLE (48ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_FIRST_FREE_ADDRESS \ + ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) #define KBASE_CSF_NUM_USER_IO_PAGES_HANDLE \ ((BASE_MEM_COOKIE_BASE - BASEP_MEM_CSF_USER_IO_PAGES_HANDLE) >> \ @@ -301,7 +301,6 @@ typedef __u32 base_context_create_flags; */ #define BASEP_KCPU_CQS_MAX_NUM_OBJS ((size_t)32) -#if MALI_UNIT_TEST /** * enum base_kcpu_command_type - Kernel CPU queue command type. * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, @@ -331,42 +330,8 @@ enum base_kcpu_command_type { BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, BASE_KCPU_COMMAND_TYPE_JIT_FREE, BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, - BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER, - BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME, + BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER }; -#else -/** - * enum base_kcpu_command_type - Kernel CPU queue command type. 
- * @BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL: fence_signal, - * @BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: fence_wait, - * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT: cqs_wait, - * @BASE_KCPU_COMMAND_TYPE_CQS_SET: cqs_set, - * @BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION: cqs_wait_operation, - * @BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION: cqs_set_operation, - * @BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: map_import, - * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: unmap_import, - * @BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE: unmap_import_force, - * @BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: jit_alloc, - * @BASE_KCPU_COMMAND_TYPE_JIT_FREE: jit_free, - * @BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND: group_suspend, - * @BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER: error_barrier, - */ -enum base_kcpu_command_type { - BASE_KCPU_COMMAND_TYPE_FENCE_SIGNAL, - BASE_KCPU_COMMAND_TYPE_FENCE_WAIT, - BASE_KCPU_COMMAND_TYPE_CQS_WAIT, - BASE_KCPU_COMMAND_TYPE_CQS_SET, - BASE_KCPU_COMMAND_TYPE_CQS_WAIT_OPERATION, - BASE_KCPU_COMMAND_TYPE_CQS_SET_OPERATION, - BASE_KCPU_COMMAND_TYPE_MAP_IMPORT, - BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT, - BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT_FORCE, - BASE_KCPU_COMMAND_TYPE_JIT_ALLOC, - BASE_KCPU_COMMAND_TYPE_JIT_FREE, - BASE_KCPU_COMMAND_TYPE_GROUP_SUSPEND, - BASE_KCPU_COMMAND_TYPE_ERROR_BARRIER, -}; -#endif /* MALI_UNIT_TEST */ /** * enum base_queue_group_priority - Priority of a GPU Command Queue Group. @@ -568,11 +533,6 @@ struct base_kcpu_command_group_suspend_info { __u8 padding[3]; }; -#if MALI_UNIT_TEST -struct base_kcpu_command_sample_time_info { - __u64 time; -}; -#endif /* MALI_UNIT_TEST */ /** * struct base_kcpu_command - kcpu command. @@ -603,9 +563,6 @@ struct base_kcpu_command { struct base_kcpu_command_jit_alloc_info jit_alloc; struct base_kcpu_command_jit_free_info jit_free; struct base_kcpu_command_group_suspend_info suspend_buf_copy; -#if MALI_UNIT_TEST - struct base_kcpu_command_sample_time_info sample_time; -#endif /* MALI_UNIT_TEST */ __u64 padding[2]; /* No sub-struct should be larger */ } info; }; diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h b/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h index 06cc4c2..a5dc745 100644 --- a/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h @@ -20,7 +20,8 @@ */ /* - * This header was autogenerated, it should not be edited. + * This header was originally autogenerated, but it is now ok (and + * expected) to have to add to it. 
*/ #ifndef _UAPI_GPU_CSF_REGISTERS_H_ @@ -212,7 +213,6 @@ #define GLB_PWROFF_TIMER 0x0014 /* () Global shader core power off timer */ #define GLB_ALLOC_EN_LO 0x0018 /* () Global shader core allocation enable mask, low word */ #define GLB_ALLOC_EN_HI 0x001C /* () Global shader core allocation enable mask, high word */ -#define GLB_PROTM_COHERENCY 0x0020 /* () Configure COHERENCY_ENABLE register value to use in protected mode execution */ #define GLB_PRFCNT_JASID 0x0024 /* () Performance counter address space */ #define GLB_PRFCNT_BASE_LO 0x0028 /* () Performance counter buffer address, low word */ @@ -653,7 +653,9 @@ (((reg_val) & ~CS_FAULT_EXCEPTION_TYPE_MASK) | \ (((value) << CS_FAULT_EXCEPTION_TYPE_SHIFT) & CS_FAULT_EXCEPTION_TYPE_MASK)) /* CS_FAULT_EXCEPTION_TYPE values */ +#define CS_FAULT_EXCEPTION_TYPE_KABOOM 0x05 #define CS_FAULT_EXCEPTION_TYPE_CS_RESOURCE_TERMINATED 0x0F +#define CS_FAULT_EXCEPTION_TYPE_CS_BUS_FAULT 0x48 #define CS_FAULT_EXCEPTION_TYPE_CS_INHERIT_FAULT 0x4B #define CS_FAULT_EXCEPTION_TYPE_INSTR_INVALID_PC 0x50 #define CS_FAULT_EXCEPTION_TYPE_INSTR_INVALID_ENC 0x51 @@ -1164,6 +1166,13 @@ (((reg_val) & ~GLB_REQ_FIRMWARE_CONFIG_UPDATE_MASK) | \ (((value) << GLB_REQ_FIRMWARE_CONFIG_UPDATE_SHIFT) & \ GLB_REQ_FIRMWARE_CONFIG_UPDATE_MASK)) +#define GLB_REQ_SLEEP_SHIFT 12 +#define GLB_REQ_SLEEP_MASK (0x1 << GLB_REQ_SLEEP_SHIFT) +#define GLB_REQ_SLEEP_GET(reg_val) \ + (((reg_val) & GLB_REQ_SLEEP_MASK) >> GLB_REQ_SLEEP_SHIFT) +#define GLB_REQ_SLEEP_SET(reg_val, value) \ + (((reg_val) & ~GLB_REQ_SLEEP_MASK) | \ + (((value) << GLB_REQ_SLEEP_SHIFT) & GLB_REQ_SLEEP_MASK)) #define GLB_REQ_INACTIVE_COMPUTE_SHIFT 20 #define GLB_REQ_INACTIVE_COMPUTE_MASK (0x1 << GLB_REQ_INACTIVE_COMPUTE_SHIFT) #define GLB_REQ_INACTIVE_COMPUTE_GET(reg_val) \ @@ -1391,19 +1400,6 @@ #define GLB_ALLOC_EN_MASK_SET(reg_val, value) \ (((reg_val) & ~GLB_ALLOC_EN_MASK_MASK) | (((value) << GLB_ALLOC_EN_MASK_SHIFT) & GLB_ALLOC_EN_MASK_MASK)) -/* GLB_PROTM_COHERENCY register */ -#define GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_SHIFT 0 -#define GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_MASK \ - (0xFFFFFFFF << GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_SHIFT) -#define GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_GET(reg_val) \ - (((reg_val)&GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_MASK) >> \ - GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_SHIFT) -#define GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_SET(reg_val, value) \ - (((reg_val) & ~GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_MASK) | \ - (((value) << GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_SHIFT) & \ - GLB_PROTM_COHERENCY_L2_CACHE_PROTOCOL_SELECT_MASK)) -/* End of GLB_INPUT_BLOCK register set definitions */ - /* GLB_OUTPUT_BLOCK register set definitions */ /* GLB_ACK register */ @@ -1485,4 +1481,28 @@ (((reg_val) & ~CSG_STATUS_STATE_IDLE_MASK) | \ (((value) << CSG_STATUS_STATE_IDLE_SHIFT) & CSG_STATUS_STATE_IDLE_MASK)) +/* GLB_FEATURES_ITER_TRACE_SUPPORTED register */ +#define GLB_FEATURES_ITER_TRACE_SUPPORTED_SHIFT GPU_U(4) +#define GLB_FEATURES_ITER_TRACE_SUPPORTED_MASK \ + (GPU_U(0x1) << GLB_FEATURES_ITER_TRACE_SUPPORTED_SHIFT) +#define GLB_FEATURES_ITER_TRACE_SUPPORTED_GET(reg_val) \ + (((reg_val)&GLB_FEATURES_ITER_TRACE_SUPPORTED_MASK) >> \ + GLB_FEATURES_ITER_TRACE_SUPPORTED_SHIFT) +#define GLB_FEATURES_ITER_TRACE_SUPPORTED_SET(reg_val, value) \ + (((reg_val) & ~GLB_FEATURES_ITER_TRACE_SUPPORTED_MASK) | \ + (((value) << GLB_FEATURES_ITER_TRACE_SUPPORTED_SHIFT) & \ + GLB_FEATURES_ITER_TRACE_SUPPORTED_MASK)) + +/* 
GLB_REQ_ITER_TRACE_ENABLE register */ +#define GLB_REQ_ITER_TRACE_ENABLE_SHIFT GPU_U(11) +#define GLB_REQ_ITER_TRACE_ENABLE_MASK \ + (GPU_U(0x1) << GLB_REQ_ITER_TRACE_ENABLE_SHIFT) +#define GLB_REQ_ITER_TRACE_ENABLE_GET(reg_val) \ + (((reg_val)&GLB_REQ_ITER_TRACE_ENABLE_MASK) >> \ + GLB_REQ_ITER_TRACE_ENABLE_SHIFT) +#define GLB_REQ_ITER_TRACE_ENABLE_SET(reg_val, value) \ + (((reg_val) & ~GLB_REQ_ITER_TRACE_ENABLE_MASK) | \ + (((value) << GLB_REQ_ITER_TRACE_ENABLE_SHIFT) & \ + GLB_REQ_ITER_TRACE_ENABLE_MASK)) + #endif /* _UAPI_GPU_CSF_REGISTERS_H_ */ diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h b/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h index d2d7ce2..ec4870c 100644 --- a/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/csf/mali_kbase_csf_ioctl.h @@ -44,6 +44,8 @@ * - Add ioctl 40: kbase_ioctl_cs_queue_register_ex, this is a new * queue registration call with extended format for supporting CS * trace configurations with CSF trace_command. + * 1.6: + * - Added new HW performance counters interface to all GPUs. */ #define BASE_UK_VERSION_MAJOR 1 diff --git a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h index 2041739..4001a4c 100644 --- a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_csf.h @@ -28,8 +28,13 @@ #error "Cannot be compiled with JM" #endif -/* IPA control registers */ +/* GPU_CONTROL_MCU base address */ +#define GPU_CONTROL_MCU_BASE 0x3000 + +/* MCU_SUBSYSTEM base address */ +#define MCU_SUBSYSTEM_BASE 0x20000 +/* IPA control registers */ #define IPA_CONTROL_BASE 0x40000 #define IPA_CONTROL_REG(r) (IPA_CONTROL_BASE+(r)) #define COMMAND 0x000 /* (WO) Command register */ @@ -63,8 +68,6 @@ #define VALUE_SHADER_REG_LO(n) (VALUE_SHADER_BASE + ((n) << 3)) /* (RO) Counter value #n, low word */ #define VALUE_SHADER_REG_HI(n) (VALUE_SHADER_BASE + ((n) << 3) + 4) /* (RO) Counter value #n, high word */ -#include "../../csf/mali_gpu_csf_control_registers.h" - /* Set to implementation defined, outer caching */ #define AS_MEMATTR_AARCH64_OUTER_IMPL_DEF 0x88ull /* Set to write back memory, outer caching */ @@ -117,6 +120,9 @@ #define MCU_CNTRL_AUTO (1 << 1) #define MCU_CNTRL_DISABLE (0) +#define MCU_CNTRL_DOORBELL_DISABLE_SHIFT (31) +#define MCU_CNTRL_DOORBELL_DISABLE_MASK (1 << MCU_CNTRL_DOORBELL_DISABLE_SHIFT) + #define MCU_STATUS_HALTED (1 << 1) #define PRFCNT_BASE_LO 0x060 /* (RW) Performance counter memory @@ -181,11 +187,19 @@ #define GPU_COMMAND_TIME_DISABLE 0x00 /* Disable cycle counter */ #define GPU_COMMAND_TIME_ENABLE 0x01 /* Enable cycle counter */ -/* GPU_COMMAND_FLUSH_CACHES payloads */ -#define GPU_COMMAND_FLUSH_PAYLOAD_NONE 0x00 /* No flush */ -#define GPU_COMMAND_FLUSH_PAYLOAD_CLEAN 0x01 /* Clean the caches */ -#define GPU_COMMAND_FLUSH_PAYLOAD_INVALIDATE 0x02 /* Invalidate the caches */ -#define GPU_COMMAND_FLUSH_PAYLOAD_CLEAN_INVALIDATE 0x03 /* Clean and invalidate the caches */ +/* GPU_COMMAND_FLUSH_CACHES payloads bits for L2 caches */ +#define GPU_COMMAND_FLUSH_PAYLOAD_L2_NONE 0x000 /* No flush */ +#define GPU_COMMAND_FLUSH_PAYLOAD_L2_CLEAN 0x001 /* CLN only */ +#define GPU_COMMAND_FLUSH_PAYLOAD_L2_CLEAN_INVALIDATE 0x003 /* CLN + INV */ + +/* GPU_COMMAND_FLUSH_CACHES payloads bits for Load-store caches */ +#define GPU_COMMAND_FLUSH_PAYLOAD_LSC_NONE 0x000 /* 
No flush */ +#define GPU_COMMAND_FLUSH_PAYLOAD_LSC_CLEAN 0x010 /* CLN only */ +#define GPU_COMMAND_FLUSH_PAYLOAD_LSC_CLEAN_INVALIDATE 0x030 /* CLN + INV */ + +/* GPU_COMMAND_FLUSH_CACHES payloads bits for Other caches */ +#define GPU_COMMAND_FLUSH_PAYLOAD_OTHER_NONE 0x000 /* No flush */ +#define GPU_COMMAND_FLUSH_PAYLOAD_OTHER_INVALIDATE 0x200 /* INV only */ /* GPU_COMMAND command + payload */ #define GPU_COMMAND_CODE_PAYLOAD(opcode, payload) \ @@ -220,13 +234,21 @@ #define GPU_COMMAND_CYCLE_COUNT_STOP \ GPU_COMMAND_CODE_PAYLOAD(GPU_COMMAND_CODE_TIME, GPU_COMMAND_TIME_DISABLE) -/* Clean all caches */ -#define GPU_COMMAND_CLEAN_CACHES \ - GPU_COMMAND_CODE_PAYLOAD(GPU_COMMAND_CODE_FLUSH_CACHES, GPU_COMMAND_FLUSH_PAYLOAD_CLEAN) - -/* Clean and invalidate all caches */ -#define GPU_COMMAND_CLEAN_INV_CACHES \ - GPU_COMMAND_CODE_PAYLOAD(GPU_COMMAND_CODE_FLUSH_CACHES, GPU_COMMAND_FLUSH_PAYLOAD_CLEAN_INVALIDATE) +/* Clean and invalidate L2 cache (Equivalent to FLUSH_PT) */ +#define GPU_COMMAND_CACHE_CLN_INV_L2 \ + GPU_COMMAND_CODE_PAYLOAD( \ + GPU_COMMAND_CODE_FLUSH_CACHES, \ + (GPU_COMMAND_FLUSH_PAYLOAD_L2_CLEAN_INVALIDATE | \ + GPU_COMMAND_FLUSH_PAYLOAD_LSC_NONE | \ + GPU_COMMAND_FLUSH_PAYLOAD_OTHER_NONE)) + +/* Clean and invalidate L2 and LSC caches (Equivalent to FLUSH_MEM) */ +#define GPU_COMMAND_CACHE_CLN_INV_L2_LSC \ + GPU_COMMAND_CODE_PAYLOAD( \ + GPU_COMMAND_CODE_FLUSH_CACHES, \ + (GPU_COMMAND_FLUSH_PAYLOAD_L2_CLEAN_INVALIDATE | \ + GPU_COMMAND_FLUSH_PAYLOAD_LSC_CLEAN_INVALIDATE | \ + GPU_COMMAND_FLUSH_PAYLOAD_OTHER_NONE)) /* Places the GPU in protected mode */ #define GPU_COMMAND_SET_PROTECTED_MODE \ diff --git a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h index 1be3541..dcadcc7 100644 --- a/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/backend/mali_kbase_gpu_regmap_jm.h @@ -261,6 +261,10 @@ #define GPU_COMMAND_CLEAN_INV_CACHES 0x08 /* Clean and invalidate all caches */ #define GPU_COMMAND_SET_PROTECTED_MODE 0x09 /* Places the GPU in protected mode */ +/* GPU_COMMAND cache flush alias to CSF command payload */ +#define GPU_COMMAND_CACHE_CLN_INV_L2 GPU_COMMAND_CLEAN_INV_CACHES +#define GPU_COMMAND_CACHE_CLN_INV_L2_LSC GPU_COMMAND_CLEAN_INV_CACHES + /* IRQ flags */ #define GPU_FAULT (1 << 0) /* A GPU Fault has occurred */ #define MULTIPLE_GPU_FAULTS (1 << 7) /* More than one GPU Fault occurred. */ diff --git a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h index d093ce4..666b0af 100644 --- a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_id.h @@ -53,6 +53,20 @@ GPU_ID2_VERSION_MINOR | \ GPU_ID2_VERSION_STATUS) +/* Helper macro to construct a value consisting of arch major and revision + * using the value of gpu_id. + */ +#define ARCH_MAJOR_REV_REG(gpu_id) \ + ((((__u32)gpu_id) & GPU_ID2_ARCH_MAJOR) | \ + (((__u32)gpu_id) & GPU_ID2_ARCH_REV)) + +/* Helper macro to create a partial GPU_ID (new format) that defines + * a arch major and revision. + */ +#define GPU_ID2_ARCH_MAJOR_REV_MAKE(arch_major, arch_rev) \ + ((((__u32)arch_major) << GPU_ID2_ARCH_MAJOR_SHIFT) | \ + (((__u32)arch_rev) << GPU_ID2_ARCH_REV_SHIFT)) + /* Helper macro to create a partial GPU_ID (new format) that defines * a product ignoring its version. 
*/ @@ -109,6 +123,8 @@ #define GPU_ID2_PRODUCT_TGRX GPU_ID2_MODEL_MAKE(10, 3) #define GPU_ID2_PRODUCT_TVAX GPU_ID2_MODEL_MAKE(10, 4) #define GPU_ID2_PRODUCT_LODX GPU_ID2_MODEL_MAKE(10, 7) +#define GPU_ID2_PRODUCT_TTUX GPU_ID2_MODEL_MAKE(11, 2) +#define GPU_ID2_PRODUCT_LTUX GPU_ID2_MODEL_MAKE(11, 3) /* Helper macro to create a GPU_ID assuming valid values for id, major, * minor, status diff --git a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h index 84fad8d..e223220 100644 --- a/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h +++ b/common/include/uapi/gpu/arm/midgard/gpu/mali_kbase_gpu_regmap.h @@ -30,6 +30,13 @@ #include "backend/mali_kbase_gpu_regmap_jm.h" #endif +/* GPU_U definition */ +#ifdef __ASSEMBLER__ +#define GPU_U(x) x +#else +#define GPU_U(x) x##u +#endif /* __ASSEMBLER__ */ + /* Begin Register Offsets */ /* GPU control registers */ @@ -149,6 +156,10 @@ #define ASN_HASH(n) (ASN_HASH_0 + (n)*4) #define ASN_HASH_COUNT 3 +#define SYSC_ALLOC0 0x0340 /* (RW) System cache allocation hint from source ID */ +#define SYSC_ALLOC(n) (SYSC_ALLOC0 + (n)*4) +#define SYSC_ALLOC_COUNT 8 + #define STACK_PWRTRANS_LO 0xE40 /* (RO) Core stack power transition bitmap, low word */ #define STACK_PWRTRANS_HI 0xE44 /* (RO) Core stack power transition bitmap, high word */ @@ -164,6 +175,7 @@ #define COHERENCY_FEATURES 0x300 /* (RO) Coherency features present */ #define COHERENCY_ENABLE 0x304 /* (RW) Coherency enable */ + #define SHADER_CONFIG 0xF04 /* (RW) Shader core configuration (implementation-specific) */ #define TILER_CONFIG 0xF08 /* (RW) Tiler core configuration (implementation-specific) */ #define L2_MMU_CONFIG 0xF0C /* (RW) L2 cache and MMU configuration (implementation-specific) */ @@ -327,10 +339,6 @@ #define AS_COMMAND_UPDATE 0x01 /* Broadcasts the values in AS_TRANSTAB and ASn_MEMATTR to all MMUs */ #define AS_COMMAND_LOCK 0x02 /* Issue a lock region command to all MMUs */ #define AS_COMMAND_UNLOCK 0x03 /* Issue a flush region command to all MMUs */ -/* Flush all L2 caches then issue a flush region command to all MMUs - * (deprecated - only for use with T60x) - */ -#define AS_COMMAND_FLUSH 0x04 /* Flush all L2 caches then issue a flush region command to all MMUs */ #define AS_COMMAND_FLUSH_PT 0x04 /* Wait for memory accesses to complete, flush all the L1s cache then flush all @@ -338,6 +346,28 @@ */ #define AS_COMMAND_FLUSH_MEM 0x05 +/* AS_LOCKADDR register */ +#define AS_LOCKADDR_LOCKADDR_SIZE_SHIFT GPU_U(0) +#define AS_LOCKADDR_LOCKADDR_SIZE_MASK \ + (GPU_U(0x3F) << AS_LOCKADDR_LOCKADDR_SIZE_SHIFT) +#define AS_LOCKADDR_LOCKADDR_SIZE_GET(reg_val) \ + (((reg_val)&AS_LOCKADDR_LOCKADDR_SIZE_MASK) >> \ + AS_LOCKADDR_LOCKADDR_SIZE_SHIFT) +#define AS_LOCKADDR_LOCKADDR_SIZE_SET(reg_val, value) \ + (((reg_val) & ~AS_LOCKADDR_LOCKADDR_SIZE_MASK) | \ + (((value) << AS_LOCKADDR_LOCKADDR_SIZE_SHIFT) & \ + AS_LOCKADDR_LOCKADDR_SIZE_MASK)) +#define AS_LOCKADDR_LOCKADDR_BASE_SHIFT GPU_U(12) +#define AS_LOCKADDR_LOCKADDR_BASE_MASK \ + (GPU_U(0xFFFFFFFFFFFFF) << AS_LOCKADDR_LOCKADDR_BASE_SHIFT) +#define AS_LOCKADDR_LOCKADDR_BASE_GET(reg_val) \ + (((reg_val)&AS_LOCKADDR_LOCKADDR_BASE_MASK) >> \ + AS_LOCKADDR_LOCKADDR_BASE_SHIFT) +#define AS_LOCKADDR_LOCKADDR_BASE_SET(reg_val, value) \ + (((reg_val) & ~AS_LOCKADDR_LOCKADDR_BASE_MASK) | \ + (((value) << AS_LOCKADDR_LOCKADDR_BASE_SHIFT) & \ + AS_LOCKADDR_LOCKADDR_BASE_MASK)) + /* GPU_STATUS values */ #define GPU_STATUS_PRFCNT_ACTIVE (1 << 2) /* Set if the 
performance counters are active. */ #define GPU_STATUS_CYCLE_COUNT_ACTIVE (1 << 6) /* Set if the cycle counter is active. */ @@ -427,8 +457,133 @@ #define L2_CONFIG_ASN_HASH_ENABLE_MASK (1ul << L2_CONFIG_ASN_HASH_ENABLE_SHIFT) /* End L2_CONFIG register */ + /* IDVS_GROUP register */ #define IDVS_GROUP_SIZE_SHIFT (16) #define IDVS_GROUP_MAX_SIZE (0x3F) +/* SYSC_ALLOC read IDs */ +#define SYSC_ALLOC_ID_R_OTHER 0x00 +#define SYSC_ALLOC_ID_R_CSF 0x02 +#define SYSC_ALLOC_ID_R_MMU 0x04 +#define SYSC_ALLOC_ID_R_TILER_VERT 0x08 +#define SYSC_ALLOC_ID_R_TILER_PTR 0x09 +#define SYSC_ALLOC_ID_R_TILER_INDEX 0x0A +#define SYSC_ALLOC_ID_R_TILER_OTHER 0x0B +#define SYSC_ALLOC_ID_R_IC 0x10 +#define SYSC_ALLOC_ID_R_ATTR 0x11 +#define SYSC_ALLOC_ID_R_SCM 0x12 +#define SYSC_ALLOC_ID_R_FSDC 0x13 +#define SYSC_ALLOC_ID_R_VL 0x14 +#define SYSC_ALLOC_ID_R_PLR 0x15 +#define SYSC_ALLOC_ID_R_TEX 0x18 +#define SYSC_ALLOC_ID_R_LSC 0x1c + +/* SYSC_ALLOC write IDs */ +#define SYSC_ALLOC_ID_W_OTHER 0x00 +#define SYSC_ALLOC_ID_W_CSF 0x02 +#define SYSC_ALLOC_ID_W_PCB 0x07 +#define SYSC_ALLOC_ID_W_TILER_PTR 0x09 +#define SYSC_ALLOC_ID_W_TILER_VERT_PLIST 0x0A +#define SYSC_ALLOC_ID_W_TILER_OTHER 0x0B +#define SYSC_ALLOC_ID_W_L2_EVICT 0x0C +#define SYSC_ALLOC_ID_W_L2_FLUSH 0x0D +#define SYSC_ALLOC_ID_W_TIB_COLOR 0x10 +#define SYSC_ALLOC_ID_W_TIB_COLOR_AFBCH 0x11 +#define SYSC_ALLOC_ID_W_TIB_COLOR_AFBCB 0x12 +#define SYSC_ALLOC_ID_W_TIB_CRC 0x13 +#define SYSC_ALLOC_ID_W_TIB_DS 0x14 +#define SYSC_ALLOC_ID_W_TIB_DS_AFBCH 0x15 +#define SYSC_ALLOC_ID_W_TIB_DS_AFBCB 0x16 +#define SYSC_ALLOC_ID_W_LSC 0x1C + +/* SYSC_ALLOC values */ +#define SYSC_ALLOC_L2_ALLOC 0x0 +#define SYSC_ALLOC_NEVER_ALLOC 0x2 +#define SYSC_ALLOC_ALWAYS_ALLOC 0x3 +#define SYSC_ALLOC_PTL_ALLOC 0x4 +#define SYSC_ALLOC_L2_PTL_ALLOC 0x5 + +/* SYSC_ALLOC register */ +#define SYSC_ALLOC_R_SYSC_ALLOC0_SHIFT (0) +#define SYSC_ALLOC_R_SYSC_ALLOC0_MASK ((0xF) << SYSC_ALLOC_R_SYSC_ALLOC0_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC0_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_R_SYSC_ALLOC0_MASK) >> \ + SYSC_ALLOC_R_SYSC_ALLOC0_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC0_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_R_SYSC_ALLOC0_MASK) | \ + (((value) << SYSC_ALLOC_R_SYSC_ALLOC0_SHIFT) & \ + SYSC_ALLOC_R_SYSC_ALLOC0_MASK)) +/* End of SYSC_ALLOC_R_SYSC_ALLOC0 values */ +#define SYSC_ALLOC_W_SYSC_ALLOC0_SHIFT (4) +#define SYSC_ALLOC_W_SYSC_ALLOC0_MASK ((0xF) << SYSC_ALLOC_W_SYSC_ALLOC0_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC0_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_W_SYSC_ALLOC0_MASK) >> \ + SYSC_ALLOC_W_SYSC_ALLOC0_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC0_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_W_SYSC_ALLOC0_MASK) | \ + (((value) << SYSC_ALLOC_W_SYSC_ALLOC0_SHIFT) & \ + SYSC_ALLOC_W_SYSC_ALLOC0_MASK)) +/* End of SYSC_ALLOC_W_SYSC_ALLOC0 values */ +#define SYSC_ALLOC_R_SYSC_ALLOC1_SHIFT (8) +#define SYSC_ALLOC_R_SYSC_ALLOC1_MASK ((0xF) << SYSC_ALLOC_R_SYSC_ALLOC1_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC1_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_R_SYSC_ALLOC1_MASK) >> \ + SYSC_ALLOC_R_SYSC_ALLOC1_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC1_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_R_SYSC_ALLOC1_MASK) | \ + (((value) << SYSC_ALLOC_R_SYSC_ALLOC1_SHIFT) & \ + SYSC_ALLOC_R_SYSC_ALLOC1_MASK)) +/* End of SYSC_ALLOC_R_SYSC_ALLOC1 values */ +#define SYSC_ALLOC_W_SYSC_ALLOC1_SHIFT (12) +#define SYSC_ALLOC_W_SYSC_ALLOC1_MASK ((0xF) << SYSC_ALLOC_W_SYSC_ALLOC1_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC1_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_W_SYSC_ALLOC1_MASK) >> \ + 
SYSC_ALLOC_W_SYSC_ALLOC1_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC1_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_W_SYSC_ALLOC1_MASK) | \ + (((value) << SYSC_ALLOC_W_SYSC_ALLOC1_SHIFT) & \ + SYSC_ALLOC_W_SYSC_ALLOC1_MASK)) +/* End of SYSC_ALLOC_W_SYSC_ALLOC1 values */ +#define SYSC_ALLOC_R_SYSC_ALLOC2_SHIFT (16) +#define SYSC_ALLOC_R_SYSC_ALLOC2_MASK ((0xF) << SYSC_ALLOC_R_SYSC_ALLOC2_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC2_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_R_SYSC_ALLOC2_MASK) >> \ + SYSC_ALLOC_R_SYSC_ALLOC2_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC2_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_R_SYSC_ALLOC2_MASK) | \ + (((value) << SYSC_ALLOC_R_SYSC_ALLOC2_SHIFT) & \ + SYSC_ALLOC_R_SYSC_ALLOC2_MASK)) +/* End of SYSC_ALLOC_R_SYSC_ALLOC2 values */ +#define SYSC_ALLOC_W_SYSC_ALLOC2_SHIFT (20) +#define SYSC_ALLOC_W_SYSC_ALLOC2_MASK ((0xF) << SYSC_ALLOC_W_SYSC_ALLOC2_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC2_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_W_SYSC_ALLOC2_MASK) >> \ + SYSC_ALLOC_W_SYSC_ALLOC2_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC2_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_W_SYSC_ALLOC2_MASK) | \ + (((value) << SYSC_ALLOC_W_SYSC_ALLOC2_SHIFT) & \ + SYSC_ALLOC_W_SYSC_ALLOC2_MASK)) +/* End of SYSC_ALLOC_W_SYSC_ALLOC2 values */ +#define SYSC_ALLOC_R_SYSC_ALLOC3_SHIFT (24) +#define SYSC_ALLOC_R_SYSC_ALLOC3_MASK ((0xF) << SYSC_ALLOC_R_SYSC_ALLOC3_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC3_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_R_SYSC_ALLOC3_MASK) >> \ + SYSC_ALLOC_R_SYSC_ALLOC3_SHIFT) +#define SYSC_ALLOC_R_SYSC_ALLOC3_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_R_SYSC_ALLOC3_MASK) | \ + (((value) << SYSC_ALLOC_R_SYSC_ALLOC3_SHIFT) & \ + SYSC_ALLOC_R_SYSC_ALLOC3_MASK)) +/* End of SYSC_ALLOC_R_SYSC_ALLOC3 values */ +#define SYSC_ALLOC_W_SYSC_ALLOC3_SHIFT (28) +#define SYSC_ALLOC_W_SYSC_ALLOC3_MASK ((0xF) << SYSC_ALLOC_W_SYSC_ALLOC3_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC3_GET(reg_val) \ + (((reg_val)&SYSC_ALLOC_W_SYSC_ALLOC3_MASK) >> \ + SYSC_ALLOC_W_SYSC_ALLOC3_SHIFT) +#define SYSC_ALLOC_W_SYSC_ALLOC3_SET(reg_val, value) \ + (((reg_val) & ~SYSC_ALLOC_W_SYSC_ALLOC3_MASK) | \ + (((value) << SYSC_ALLOC_W_SYSC_ALLOC3_SHIFT) & \ + SYSC_ALLOC_W_SYSC_ALLOC3_MASK)) +/* End of SYSC_ALLOC_W_SYSC_ALLOC3 values */ + #endif /* _UAPI_KBASE_GPU_REGMAP_H_ */ diff --git a/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h b/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h index 749e1fa..7a52fbf 100644 --- a/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/jm/mali_base_jm_kernel.h @@ -192,15 +192,15 @@ #define BASE_MEM_FLAGS_RESERVED \ (BASE_MEM_RESERVED_BIT_8 | BASE_MEM_RESERVED_BIT_19) -#define BASEP_MEM_INVALID_HANDLE (0ull << 12) -#define BASE_MEM_MMU_DUMP_HANDLE (1ull << 12) -#define BASE_MEM_TRACE_BUFFER_HANDLE (2ull << 12) -#define BASE_MEM_MAP_TRACKING_HANDLE (3ull << 12) -#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ull << 12) +#define BASEP_MEM_INVALID_HANDLE (0ul) +#define BASE_MEM_MMU_DUMP_HANDLE (1ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_TRACE_BUFFER_HANDLE (2ul << LOCAL_PAGE_SHIFT) +#define BASE_MEM_MAP_TRACKING_HANDLE (3ul << LOCAL_PAGE_SHIFT) +#define BASEP_MEM_WRITE_ALLOC_PAGES_HANDLE (4ul << LOCAL_PAGE_SHIFT) /* reserved handles ..-47<<PAGE_SHIFT> for future special handles */ -#define BASE_MEM_COOKIE_BASE (64ul << 12) -#define BASE_MEM_FIRST_FREE_ADDRESS ((BITS_PER_LONG << 12) + \ - BASE_MEM_COOKIE_BASE) +#define BASE_MEM_COOKIE_BASE (64ul << LOCAL_PAGE_SHIFT) +#define 
BASE_MEM_FIRST_FREE_ADDRESS \ + ((BITS_PER_LONG << LOCAL_PAGE_SHIFT) + BASE_MEM_COOKIE_BASE) /* Similar to BASE_MEM_TILER_ALIGN_TOP, memory starting from the end of the * initial commit is aligned to 'extension' pages, where 'extension' must be a power diff --git a/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h b/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h index 72d75cb..2598e20 100644 --- a/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/jm/mali_kbase_jm_ioctl.h @@ -119,6 +119,8 @@ * 11.31: * - Added BASE_JD_REQ_LIMITED_CORE_MASK. * - Added ioctl 55: set_limited_core_count. + * 11.32: + * - Added new HW performance counters interface to all GPUs. */ #define BASE_UK_VERSION_MAJOR 11 #define BASE_UK_VERSION_MINOR 31 diff --git a/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h b/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h index a46c41f..410d54e 100644 --- a/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h +++ b/common/include/uapi/gpu/arm/midgard/mali_base_kernel.h @@ -42,18 +42,6 @@ struct base_mem_handle { #define BASE_MAX_COHERENT_GROUPS 16 -#if defined(CDBG_ASSERT) -#define LOCAL_ASSERT CDBG_ASSERT -#elif defined(KBASE_DEBUG_ASSERT) -#define LOCAL_ASSERT KBASE_DEBUG_ASSERT -#else -#if defined(__KERNEL__) -#error assert macro not defined! -#else -#define LOCAL_ASSERT(...) ((void)#__VA_ARGS__) -#endif -#endif - #if defined(PAGE_MASK) && defined(PAGE_SHIFT) #define LOCAL_PAGE_SHIFT PAGE_SHIFT #define LOCAL_PAGE_LSB ~PAGE_MASK @@ -635,7 +623,7 @@ struct mali_base_gpu_coherent_group_info { * @thread_max_barrier_size: Maximum number of threads per barrier * @thread_features: Thread features * @coherency_mode: Note: This is the _selected_ coherency mode rather than the - * available modes as exposed in the coherency_features register + * available modes as exposed in the coherency_features register * @thread_tls_alloc: Number of threads per core that TLS must be allocated for * @gpu_features: GPU features * @@ -699,7 +687,7 @@ struct gpu_raw_gpu_props { * values from which the value of the other members are derived. The derived * members exist to allow for efficient access and/or shielding the details * of the layout of the registers. - * */ + */ struct base_gpu_props { struct mali_base_gpu_core_props core_props; struct mali_base_gpu_l2_cache_props l2_props; @@ -716,82 +704,24 @@ struct base_gpu_props { #include "jm/mali_base_jm_kernel.h" #endif -/** - * base_mem_group_id_get() - Get group ID from flags - * @flags: Flags to pass to base_mem_alloc - * - * This inline function extracts the encoded group ID from flags - * and converts it into numeric value (0~15). - * - * Return: group ID(0~15) extracted from the parameter - */ -static __inline__ int base_mem_group_id_get(base_mem_alloc_flags flags) -{ - LOCAL_ASSERT((flags & ~BASE_MEM_FLAGS_INPUT_MASK) == 0); - return (int)((flags & BASE_MEM_GROUP_ID_MASK) >> - BASEP_MEM_GROUP_ID_SHIFT); -} - -/** - * base_mem_group_id_set() - Set group ID into base_mem_alloc_flags - * @id: group ID(0~15) you want to encode - * - * This inline function encodes specific group ID into base_mem_alloc_flags. - * Parameter 'id' should lie in-between 0 to 15. - * - * Return: base_mem_alloc_flags with the group ID (id) encoded - * - * The return value can be combined with other flags against base_mem_alloc - * to identify a specific memory group. 
- */ -static __inline__ base_mem_alloc_flags base_mem_group_id_set(int id) -{ - if ((id < 0) || (id >= BASE_MEM_GROUP_COUNT)) { - /* Set to default value when id is out of range. */ - id = BASE_MEM_GROUP_DEFAULT; - } +#define BASE_MEM_GROUP_ID_GET(flags) \ + ((flags & BASE_MEM_GROUP_ID_MASK) >> BASEP_MEM_GROUP_ID_SHIFT) - return ((base_mem_alloc_flags)id << BASEP_MEM_GROUP_ID_SHIFT) & - BASE_MEM_GROUP_ID_MASK; -} +#define BASE_MEM_GROUP_ID_SET(id) \ + (((base_mem_alloc_flags)((id < 0 || id >= BASE_MEM_GROUP_COUNT) ? \ + BASE_MEM_GROUP_DEFAULT : \ + id) \ + << BASEP_MEM_GROUP_ID_SHIFT) & \ + BASE_MEM_GROUP_ID_MASK) -/** - * base_context_mmu_group_id_set - Encode a memory group ID in - * base_context_create_flags - * - * Memory allocated for GPU page tables will come from the specified group. - * - * @group_id: Physical memory group ID. Range is 0..(BASE_MEM_GROUP_COUNT-1). - * - * Return: Bitmask of flags to pass to base_context_init. - */ -static __inline__ base_context_create_flags base_context_mmu_group_id_set( - int const group_id) -{ - LOCAL_ASSERT(group_id >= 0); - LOCAL_ASSERT(group_id < BASE_MEM_GROUP_COUNT); - return BASEP_CONTEXT_MMU_GROUP_ID_MASK & - ((base_context_create_flags)group_id << - BASEP_CONTEXT_MMU_GROUP_ID_SHIFT); -} +#define BASE_CONTEXT_MMU_GROUP_ID_SET(group_id) \ + (BASEP_CONTEXT_MMU_GROUP_ID_MASK & \ + ((base_context_create_flags)(group_id) \ + << BASEP_CONTEXT_MMU_GROUP_ID_SHIFT)) -/** - * base_context_mmu_group_id_get - Decode a memory group ID from - * base_context_create_flags - * - * Memory allocated for GPU page tables will come from the returned group. - * - * @flags: Bitmask of flags to pass to base_context_init. - * - * Return: Physical memory group ID. Valid range is 0..(BASE_MEM_GROUP_COUNT-1). - */ -static __inline__ int base_context_mmu_group_id_get( - base_context_create_flags const flags) -{ - LOCAL_ASSERT(flags == (flags & BASEP_CONTEXT_CREATE_ALLOWED_FLAGS)); - return (int)((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> - BASEP_CONTEXT_MMU_GROUP_ID_SHIFT); -} +#define BASE_CONTEXT_MMU_GROUP_ID_GET(flags) \ + ((flags & BASEP_CONTEXT_MMU_GROUP_ID_MASK) >> \ + BASEP_CONTEXT_MMU_GROUP_ID_SHIFT) /* * A number of bit flags are defined for requesting cpu_gpu_timeinfo. These diff --git a/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h b/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h index 9baaec1..15843ee 100644 --- a/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h +++ b/common/include/uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h @@ -91,6 +91,7 @@ enum base_hwcnt_reader_event { #define KBASE_HWCNT_READER_API_VERSION_NO_FEATURE (0) #define KBASE_HWCNT_READER_API_VERSION_FEATURE_CYCLES_TOP (1 << 0) #define KBASE_HWCNT_READER_API_VERSION_FEATURE_CYCLES_SHADER_CORES (1 << 1) + /** * struct kbase_hwcnt_reader_api_version - hwcnt reader API version * @version: API version @@ -101,5 +102,263 @@ struct kbase_hwcnt_reader_api_version { __u32 features; }; +/** Hardware counters reader API version */ +#define PRFCNT_READER_API_VERSION (0) + +/** + * enum prfcnt_list_type - Type of list item + * @PRFCNT_LIST_TYPE_ENUM: Enumeration of performance counters. + * @PRFCNT_LIST_TYPE_REQUEST: Request for configuration setup. + * @PRFCNT_LIST_TYPE_SAMPLE_META: Sample metadata. 
+ */ +enum prfcnt_list_type { + PRFCNT_LIST_TYPE_ENUM, + PRFCNT_LIST_TYPE_REQUEST, + PRFCNT_LIST_TYPE_SAMPLE_META, +}; + +#define FLEX_LIST_TYPE(type, subtype) \ + (__u16)(((type & 0xf) << 12) | (subtype & 0xfff)) +#define FLEX_LIST_TYPE_NONE FLEX_LIST_TYPE(0, 0) + +#define PRFCNT_ENUM_TYPE_BLOCK FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_ENUM, 0) +#define PRFCNT_ENUM_TYPE_REQUEST FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_ENUM, 1) + +#define PRFCNT_REQUEST_TYPE_MODE FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_REQUEST, 0) +#define PRFCNT_REQUEST_TYPE_ENABLE FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_REQUEST, 1) + +#define PRFCNT_SAMPLE_META_TYPE_SAMPLE \ + FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_SAMPLE_META, 0) +#define PRFCNT_SAMPLE_META_TYPE_CLOCK \ + FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_SAMPLE_META, 1) +#define PRFCNT_SAMPLE_META_TYPE_BLOCK \ + FLEX_LIST_TYPE(PRFCNT_LIST_TYPE_SAMPLE_META, 2) + +/** + * struct prfcnt_item_header - Header for an item of the list. + * @item_type: Type of item. + * @item_version: Protocol version. + */ +struct prfcnt_item_header { + __u16 item_type; + __u16 item_version; +}; + +/** + * enum prfcnt_block_type - Type of performance counter block. + * @PRFCNT_BLOCK_TYPE_FE: Front End. + * @PRFCNT_BLOCK_TYPE_TILER: Tiler. + * @PRFCNT_BLOCK_TYPE_MEMORY: Memory System. + * @PRFCNT_BLOCK_TYPE_SHADER_CORE: Shader Core. + */ +enum prfcnt_block_type { + PRFCNT_BLOCK_TYPE_FE, + PRFCNT_BLOCK_TYPE_TILER, + PRFCNT_BLOCK_TYPE_MEMORY, + PRFCNT_BLOCK_TYPE_SHADER_CORE, + PRFCNT_BLOCK_TYPE_RESERVED = 255, +}; + +/** + * enum prfcnt_block_set - Type of performance counter block set. + * @PRFCNT_SET_PRIMARY: Primary. + * @PRFCNT_SET_SECONDARY: Secondary. + * @PRFCNT_SET_TERTIARY: Tertiary. + */ +enum prfcnt_set { + PRFCNT_SET_PRIMARY, + PRFCNT_SET_SECONDARY, + PRFCNT_SET_TERTIARY, + PRFCNT_SET_RESERVED = 255, +}; + +/** + * struct prfcnt_enum_block_counter - Performance counter block descriptor. + * @block_type: Type of performance counter block. + * @set: Which SET this represents: primary, secondary or tertiary. + * @num_instances: How many instances of this block type exist in the hardware. + * @num_values: How many entries in the values array there are for samples + * from this block. + * @pad: Padding bytes. + * @counter_mask: Bitmask that indicates the availability of counters in this + * block. + */ +struct prfcnt_enum_block_counter { + __u8 block_type; + __u8 set; + __u8 num_instances; + __u8 num_values; + __u8 pad[4]; + __u64 counter_mask[2]; +}; + +/** + * struct prfcnt_enum_request - Request descriptor. + * @request_item_type: Type of request. + * @pad: Padding bytes. + * @versions_mask: Bitmask of versions that support this request. + */ +struct prfcnt_enum_request { + __u16 request_item_type; + __u16 pad; + __u32 versions_mask; +}; + +/** + * struct prfcnt_enum_item - Performance counter enumeration item. + * @hdr: Header describing the type of item in the list. + * @block_counter: Performance counter block descriptor. + * @request: Request descriptor. + */ +struct prfcnt_enum_item { + struct prfcnt_item_header hdr; + union { + struct prfcnt_enum_block_counter block_counter; + struct prfcnt_enum_request request; + } u; +}; + +/** + * enum prfcnt_mode - Capture mode for counter sampling. + * @PRFCNT_MODE_MANUAL: Manual sampling mode. + * @PRFCNT_MODE_PERIODIC: Periodic sampling mode. + */ +enum prfcnt_mode { + PRFCNT_MODE_MANUAL, + PRFCNT_MODE_PERIODIC, + PRFCNT_MODE_RESERVED = 255, +}; + +/** + * struct prfcnt_request_mode - Mode request descriptor. + * @mode: Capture mode for the session, either manual or periodic. 
+ * @pad: Padding bytes. + * @period_us: Period in microseconds, for periodic mode. + */ +struct prfcnt_request_mode { + __u8 mode; + __u8 pad[7]; + union { + struct { + __u64 period_us; + } periodic; + } mode_config; +}; + +/** + * struct prfcnt_request_enable - Enable request descriptor. + * @block_type: Type of performance counter block. + * @set: Which SET to use: primary, secondary or tertiary. + * @pad: Padding bytes. + * @enable_mask: Bitmask that indicates which performance counters to enable. + * Unavailable counters will be ignored. + */ +struct prfcnt_request_enable { + __u8 block_type; + __u8 set; + __u8 pad[6]; + __u64 enable_mask[2]; +}; + +/** + * struct prfcnt_request_item - Performance counter request item. + * @hdr: Header describing the type of item in the list. + * @req_mode: Mode request descriptor. + * @req_enable: Enable request descriptor. + */ +struct prfcnt_request_item { + struct prfcnt_item_header hdr; + union { + struct prfcnt_request_mode req_mode; + struct prfcnt_request_enable req_enable; + } u; +}; + +/** + * enum prfcnt_request_type - Type of request descriptor. + * @PRFCNT_REQUEST_MODE: Specify the capture mode to be used for the session. + * @PRFCNT_REQUEST_ENABLE: Specify which performance counters to capture. + */ +enum prfcnt_request_type { + PRFCNT_REQUEST_MODE, + PRFCNT_REQUEST_ENABLE, +}; + +/** + * struct prfcnt_sample_metadata - Metadata for counter sample data. + * @timestamp_start: Earliest timestamp that values in this sample represent. + * @timestamp_end: Latest timestamp that values in this sample represent. + * @seq: Sequence number of this sample. Must match the value from + * GET_SAMPLE. + * @user_data: User data provided to HWC_CMD_START or HWC_CMD_SAMPLE_* + * @flags: Property flags. + */ +struct prfcnt_sample_metadata { + __u64 timestamp_start; + __u64 timestamp_end; + __u64 seq; + __u64 user_data; + __u32 flags; + __u32 pad; +}; + +/** + * struct prfcnt_clock_metadata - Metadata for clock cycles. + * @num_domains: Number of domains this metadata refers to. + * @cycles: Number of cycles elapsed in each counter domain between + * timestamp_start and timestamp_end. + */ +struct prfcnt_clock_metadata { + __u32 num_domains; + __u32 pad; + __u64 *cycles; +}; + +/* This block was powered on for at least some portion of the sample */ +#define BLOCK_STATE_ON (1 << 0) +/* This block was powered off for at least some portion of the sample */ +#define BLOCK_STATE_OFF (1 << 1) +/* This block was available to this VM for at least some portion of the sample */ +#define BLOCK_STATE_AVAILABLE (1 << 2) +/* This block was not available to this VM for at least some portion of the sample + * Note that no data is collected when the block is not available to the VM. + */ +#define BLOCK_STATE_UNAVAILABLE (1 << 3) +/* This block was operating in "normal" (non-protected) mode for at least some portion of the sample */ +#define BLOCK_STATE_NORMAL (1 << 4) +/* This block was operating in "protected" mode for at least some portion of the sample. + * Note that no data is collected when the block is in protected mode. + */ +#define BLOCK_STATE_PROTECTED (1 << 5) + +/** + * struct prfcnt_block_metadata - Metadata for counter block. + * @block_type: Type of performance counter block. + * @block_idx: Index of performance counter block. + * @set: Set of performance counter block. + * @block_state: Bits set indicate the states which the block is known + * to have operated in during this sample. 
+ * @values_offset: Offset from the start of the mmapped region, to the values + * for this block. The values themselves are an array of __u64. + */ +struct prfcnt_block_metadata { + __u8 block_type; + __u8 block_idx; + __u8 set; + __u8 pad_u8; + __u32 block_state; + __u32 values_offset; + __u32 pad_u32; +}; + +struct prfcnt_metadata { + struct prfcnt_item_header hdr; + union { + struct prfcnt_sample_metadata sample_md; + struct prfcnt_clock_metadata clock_md; + struct prfcnt_block_metadata block_md; + } u; +}; + #endif /* _UAPI_KBASE_HWCNT_READER_H_ */ diff --git a/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h b/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h index 29ff32a..8e1ed55 100644 --- a/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h +++ b/common/include/uapi/gpu/arm/midgard/mali_kbase_ioctl.h @@ -186,12 +186,15 @@ struct kbase_ioctl_hwcnt_enable { __u32 mmu_l2_bm; }; +/* This IOCTL is deprecated as of R33, and will be removed in R35. */ #define KBASE_IOCTL_HWCNT_ENABLE \ _IOW(KBASE_IOCTL_TYPE, 9, struct kbase_ioctl_hwcnt_enable) +/* This IOCTL is deprecated as of R33, and will be removed in R35. */ #define KBASE_IOCTL_HWCNT_DUMP \ _IO(KBASE_IOCTL_TYPE, 10) +/* This IOCTL is deprecated as of R33, and will be removed in R35. */ #define KBASE_IOCTL_HWCNT_CLEAR \ _IO(KBASE_IOCTL_TYPE, 11) @@ -686,6 +689,55 @@ struct kbase_ioctl_set_limited_core_count { #define KBASE_IOCTL_SET_LIMITED_CORE_COUNT \ _IOW(KBASE_IOCTL_TYPE, 55, struct kbase_ioctl_set_limited_core_count) +/** + * struct kbase_ioctl_kinstr_prfcnt_enum_info - Enum Performance counter + * information + * @info_item_size: Performance counter item size in bytes. + * @info_item_count: Performance counter item count in the info_list_ptr. + * @info_list_ptr: Performance counter item list pointer which points to a + * list with info_item_count of items. + * + * On success: returns info_item_size and info_item_count if info_list_ptr is + * NULL, returns performance counter information if info_list_ptr is not NULL. + * On error: returns a negative error code. + */ +struct kbase_ioctl_kinstr_prfcnt_enum_info { + __u32 info_item_size; + __u32 info_item_count; + __u64 info_list_ptr; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO \ + _IOWR(KBASE_IOCTL_TYPE, 56, struct kbase_ioctl_kinstr_prfcnt_enum_info) + +/** + * struct kbase_ioctl_hwcnt_reader_setup - Setup HWC dumper/reader + * @in: input parameters. + * @in.request_item_count: Number of requests in the requests array. + * @in.request_item_size: Size in bytes of each request in the requests array. + * @in.requests_ptr: Pointer to the requests array. + * @out: output parameters. + * @out.prfcnt_metadata_item_size: Size of each item in the metadata array for + * each sample. + * @out.prfcnt_mmap_size_bytes: Size in bytes that user-space should mmap + * for reading performance counter samples. + * + * A fd is returned from the ioctl if successful, or a negative value on error. 
+ */ +union kbase_ioctl_kinstr_prfcnt_setup { + struct { + __u32 request_item_count; + __u32 request_item_size; + __u64 requests_ptr; + } in; + struct { + __u32 prfcnt_metadata_item_size; + __u32 prfcnt_mmap_size_bytes; + } out; +}; + +#define KBASE_IOCTL_KINSTR_PRFCNT_SETUP \ + _IOWR(KBASE_IOCTL_TYPE, 57, union kbase_ioctl_kinstr_prfcnt_setup) /*************** * test ioctls * diff --git a/mali_kbase/Kbuild b/mali_kbase/Kbuild index c520597..e253f1c 100644 --- a/mali_kbase/Kbuild +++ b/mali_kbase/Kbuild @@ -48,6 +48,10 @@ ifeq ($(CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND),n) $(error CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND must be set in Kernel configuration) endif +ifeq ($(CONFIG_FW_LOADER), n) + $(error CONFIG_FW_LOADER must be set in Kernel configuration) +endif + ifeq ($(CONFIG_MALI_PRFCNT_SET_SELECT_VIA_DEBUG_FS), y) ifneq ($(CONFIG_DEBUG_FS), y) $(error CONFIG_MALI_PRFCNT_SET_SELECT_VIA_DEBUG_FS depends on CONFIG_DEBUG_FS to be set in Kernel configuration) @@ -67,7 +71,7 @@ endif # # Driver version string which is returned to userspace via an ioctl -MALI_RELEASE_NAME ?= '"r32p1-01eac0"' +MALI_RELEASE_NAME ?= '"r34p0-00dev1"' # Set up defaults if not defined by build system ifeq ($(CONFIG_MALI_DEBUG), y) MALI_UNIT_TEST = 1 @@ -91,6 +95,7 @@ else MALI_USE_CSF ?= 0 endif + ifneq ($(CONFIG_MALI_KUTF), n) MALI_KERNEL_TEST_API ?= 1 else @@ -156,9 +161,11 @@ mali_kbase-y := \ mali_kbase_gpuprops.o \ mali_kbase_pm.o \ mali_kbase_config.o \ + mali_kbase_kinstr_prfcnt.o \ mali_kbase_vinstr.o \ mali_kbase_hwcnt.o \ mali_kbase_hwcnt_gpu.o \ + mali_kbase_hwcnt_gpu_narrow.o \ mali_kbase_hwcnt_legacy.o \ mali_kbase_hwcnt_types.o \ mali_kbase_hwcnt_virtualizer.o \ @@ -180,7 +187,10 @@ mali_kbase-y := \ mali_kbase_regs_history_debugfs.o \ mali_kbase_dvfs_debugfs.o \ mali_power_gpu_frequency_trace.o \ - mali_kbase_trace_gpu_mem.o + mali_kbase_trace_gpu_mem.o \ + mali_kbase_pbha.o + +mali_kbase-$(CONFIG_DEBUG_FS) += mali_kbase_pbha_debugfs.o mali_kbase-$(CONFIG_MALI_CINSTR_GWT) += mali_kbase_gwt.o diff --git a/mali_kbase/Kconfig b/mali_kbase/Kconfig index 9f1a6e3..a563d35 100644 --- a/mali_kbase/Kconfig +++ b/mali_kbase/Kconfig @@ -24,6 +24,7 @@ menuconfig MALI_MIDGARD select DMA_SHARED_BUFFER select PM_DEVFREQ select DEVFREQ_THERMAL + select FW_LOADER default n help Enable this option to build support for a ARM Mali Midgard GPU. @@ -39,7 +40,7 @@ config MALI_PLATFORM_NAME default "devicetree" help Enter the name of the desired platform configuration directory to - include in the build. 'platform/$(MALI_PLATFORM_NAME)/Makefile' must + include in the build. 'platform/$(MALI_PLATFORM_NAME)/Kbuild' must exist. config MALI_REAL_HW @@ -365,7 +366,7 @@ config MALI_HW_ERRATA_1485982_USE_CLOCK_ALTERNATIVE endif config MALI_ARBITRATION - bool "Enable Virtualization reference code" + tristate "Enable Virtualization reference code" depends on MALI_MIDGARD default n help diff --git a/mali_kbase/Makefile b/mali_kbase/Makefile index 4384e80..099da33 100644 --- a/mali_kbase/Makefile +++ b/mali_kbase/Makefile @@ -55,7 +55,7 @@ ifeq ($(CONFIG_MALI_MIDGARD),m) CONFIG_MALI_DMA_BUF_LEGACY_COMPAT = n endif - ifeq ($(CONFIG_BSP_HAS_HYPERVISOR),y) + ifeq ($(CONFIG_XEN),y) ifneq ($(CONFIG_MALI_ARBITRATION), n) CONFIG_MALI_XEN ?= m endif diff --git a/mali_kbase/Mconfig b/mali_kbase/Mconfig index d71a113..1b66978 100644 --- a/mali_kbase/Mconfig +++ b/mali_kbase/Mconfig @@ -35,7 +35,7 @@ config MALI_PLATFORM_NAME default "devicetree" help Enter the name of the desired platform configuration directory to - include in the build. 
'platform/$(MALI_PLATFORM_NAME)/Makefile' must + include in the build. 'platform/$(MALI_PLATFORM_NAME)/Kbuild' must exist. When PLATFORM_CUSTOM is set, this needs to be set manually to diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_defs.h b/mali_kbase/arbiter/mali_kbase_arbiter_defs.h index 570a82a..65cfc7b 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_defs.h +++ b/mali_kbase/arbiter/mali_kbase_arbiter_defs.h @@ -20,7 +20,6 @@ */ /** - * @file * Mali structures define to support arbitration feature */ diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_interface.h b/mali_kbase/arbiter/mali_kbase_arbiter_interface.h index c0137f7..3c60878 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_interface.h +++ b/mali_kbase/arbiter/mali_kbase_arbiter_interface.h @@ -20,7 +20,6 @@ */ /** - * @file * Defines the Mali arbiter interface */ @@ -61,58 +60,47 @@ struct arbiter_if_dev; * the arbiter arbiter_if_vm_arb_ops callbacks below. * For example vm_arb_gpu_stopped() may be called as a side effect of * arb_vm_gpu_stop() being called here. + * + * @arb_vm_gpu_stop: Callback to ask VM to stop using GPU. + * dev: The arbif kernel module device. + * + * Informs KBase to stop using the GPU as soon as possible. + * Note: Once the driver is no longer using the GPU, a call + * to vm_arb_gpu_stopped is expected by the arbiter. + * @arb_vm_gpu_granted: Callback to indicate that GPU has been granted to VM. + * dev: The arbif kernel module device. + * + * Informs KBase that the GPU can now be used by the VM. + * @arb_vm_gpu_lost: Callback to indicate that VM has lost the GPU. + * dev: The arbif kernel module device. + * + * This is called if KBase takes too long to respond to the + * arbiter stop request. + * Once this is called, KBase will assume that access to the + * GPU has been lost and will fail all running jobs and + * reset its internal state. + * If successful, will respond with a vm_arb_gpu_stopped + * message. + * @arb_vm_max_config: Callback to send the max config info to the VM. + * dev: The arbif kernel module device. + * max_l2_slices: The maximum number of L2 slices. + * max_core_mask: The largest core mask. + * + * Informs KBase the maximum resources that can be + * allocated to the partition in use. + * @arb_vm_update_freq: Callback to notify that GPU clock frequency has been + * updated. + * dev: The arbif kernel module device. + * freq: GPU clock frequency value reported from arbiter + * + * Informs KBase that the GPU clock frequency has been updated. */ struct arbiter_if_arb_vm_ops { - /** - * arb_vm_gpu_stop() - Ask VM to stop using GPU - * @dev: The arbif kernel module device. - * - * Informs KBase to stop using the GPU as soon as possible. - * @Note: Once the driver is no longer using the GPU, a call to - * vm_arb_gpu_stopped is expected by the arbiter. - */ void (*arb_vm_gpu_stop)(struct device *dev); - - /** - * arb_vm_gpu_granted() - GPU has been granted to VM - * @dev: The arbif kernel module device. - * - * Informs KBase that the GPU can now be used by the VM. - */ void (*arb_vm_gpu_granted)(struct device *dev); - - /** - * arb_vm_gpu_lost() - VM has lost the GPU - * @dev: The arbif kernel module device. - * - * This is called if KBase takes too long to respond to the arbiter - * stop request. - * Once this is called, KBase will assume that access to the GPU - * has been lost and will fail all running jobs and reset its - * internal state. - * If successful, will respond with a vm_arb_gpu_stopped message. 
- */ void (*arb_vm_gpu_lost)(struct device *dev); - - /** - * arb_vm_max_config() - Send max config info to the VM - * @dev: The arbif kernel module device. - * @max_l2_slices: The maximum number of L2 slices. - * @max_core_mask: The largest core mask. - * - * Informs KBase the maximum resources that can be allocated to the - * partition in use. - */ void (*arb_vm_max_config)(struct device *dev, uint32_t max_l2_slices, uint32_t max_core_mask); - - /** - * arb_vm_update_freq() - GPU clock frequency has been updated - * @dev: The arbif kernel module device. - * @freq: GPU clock frequency value reported from arbiter - * - * Informs KBase that the GPU clock frequency has been updated. - */ void (*arb_vm_update_freq)(struct device *dev, uint32_t freq); }; @@ -124,60 +112,45 @@ struct arbiter_if_arb_vm_ops { * * Note that we must not make any synchronous calls back in to the VM * (via arbiter_if_arb_vm_ops above) in the context of these callbacks. + * + * @vm_arb_register_dev: Callback to register VM device driver callbacks. + * arbif_dev: The arbiter interface to register + * with for device callbacks + * dev: The device structure to supply in the callbacks. + * ops: The callbacks that the device driver supports + * (none are optional). + * + * Returns + * 0 - successful. + * -EINVAL - invalid argument. + * -EPROBE_DEFER - module dependencies are not yet + * available. + * @vm_arb_unregister_dev: Callback to unregister VM device driver callbacks. + * arbif_dev: The arbiter interface to unregistering + * from. + * @vm_arb_get_max_config: Callback to Request the max config from the Arbiter. + * arbif_dev: The arbiter interface to issue the + * request to. + * @vm_arb_gpu_request: Callback to ask the arbiter interface for GPU access. + * arbif_dev: The arbiter interface to issue the request + * to. + * @vm_arb_gpu_active: Callback to inform arbiter that driver has gone active. + * arbif_dev: The arbiter interface device to notify. + * @vm_arb_gpu_idle: Callback to inform the arbiter that driver has gone idle. + * arbif_dev: The arbiter interface device to notify. + * @vm_arb_gpu_stopped: Callback to inform arbiter that driver has stopped + * using the GPU + * arbif_dev: The arbiter interface device to notify. + * gpu_required: The GPU is still needed to do more work. */ struct arbiter_if_vm_arb_ops { - /** - * vm_arb_register_dev() - Register VM device driver callbacks. - * @arbif_dev: The arbiter interface we are registering device callbacks - * @dev: The device structure to supply in the callbacks. - * @ops: The callbacks that the device driver supports - * (none are optional). - * - * Return: - * * 0 - successful. - * * -EINVAL - invalid argument. - * * -EPROBE_DEFER - module dependencies are not yet available. - */ int (*vm_arb_register_dev)(struct arbiter_if_dev *arbif_dev, struct device *dev, struct arbiter_if_arb_vm_ops *ops); - - /** - * vm_arb_unregister_dev() - Unregister VM device driver callbacks. - * @arbif_dev: The arbiter interface we are unregistering from. - */ void (*vm_arb_unregister_dev)(struct arbiter_if_dev *arbif_dev); - - /** - * vm_arb_gpu_get_max_config() - Request the max config from the - * Arbiter. - * @arbif_dev: The arbiter interface we want to issue the request. - */ void (*vm_arb_get_max_config)(struct arbiter_if_dev *arbif_dev); - - /** - * vm_arb_gpu_request() - Ask the arbiter interface for GPU access. - * @arbif_dev: The arbiter interface we want to issue the request. 
- */ void (*vm_arb_gpu_request)(struct arbiter_if_dev *arbif_dev); - - /** - * vm_arb_gpu_active() - Inform arbiter that the driver has gone active - * @arbif_dev: The arbiter interface device. - */ void (*vm_arb_gpu_active)(struct arbiter_if_dev *arbif_dev); - - /** - * vm_arb_gpu_idle() - Inform the arbiter that the driver has gone idle - * @arbif_dev: The arbiter interface device. - */ void (*vm_arb_gpu_idle)(struct arbiter_if_dev *arbif_dev); - - /** - * vm_arb_gpu_stopped() - Inform the arbiter that the driver has stopped - * using the GPU - * @arbif_dev: The arbiter interface device. - * @gpu_required: The GPU is still needed to do more work. - */ void (*vm_arb_gpu_stopped)(struct arbiter_if_dev *arbif_dev, u8 gpu_required); }; diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_pm.c b/mali_kbase/arbiter/mali_kbase_arbiter_pm.c index 5c75686..62ff4fd 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_pm.c +++ b/mali_kbase/arbiter/mali_kbase_arbiter_pm.c @@ -20,15 +20,12 @@ */ /** - * @file * Mali arbiter power manager state machine and APIs */ #include <mali_kbase.h> #include <mali_kbase_pm.h> -#include <mali_kbase_hwaccess_jm.h> #include <backend/gpu/mali_kbase_irq_internal.h> -#include <mali_kbase_hwcnt_context.h> #include <backend/gpu/mali_kbase_pm_internal.h> #include <tl/mali_kbase_tracepoints.h> #include <mali_kbase_gpuprops.h> @@ -319,6 +316,7 @@ int kbase_arbiter_pm_early_init(struct kbase_device *kbdev) if (kbdev->arb.arb_if) { kbase_arbif_gpu_request(kbdev); dev_dbg(kbdev->dev, "Waiting for initial GPU assignment...\n"); + err = wait_event_timeout(arb_vm_state->vm_state_wait, arb_vm_state->vm_state == KBASE_VM_STATE_INITIALIZING_WITH_GPU, @@ -328,8 +326,9 @@ int kbase_arbiter_pm_early_init(struct kbase_device *kbdev) dev_dbg(kbdev->dev, "Kbase probe Deferred after waiting %d ms to receive GPU_GRANT\n", gpu_req_timeout); - err = -EPROBE_DEFER; - goto arbif_eprobe_defer; + + err = -ENODEV; + goto arbif_timeout; } dev_dbg(kbdev->dev, @@ -337,9 +336,10 @@ int kbase_arbiter_pm_early_init(struct kbase_device *kbdev) } return 0; -arbif_eprobe_defer: +arbif_timeout: kbase_arbiter_pm_early_term(kbdev); return err; + arbif_init_fail: destroy_workqueue(arb_vm_state->vm_arb_wq); kfree(arb_vm_state); @@ -619,6 +619,18 @@ static void kbase_arbiter_pm_vm_gpu_stop(struct kbase_device *kbdev) case KBASE_VM_STATE_SUSPEND_PENDING: /* Suspend finishes with a stop so nothing else to do */ break; + case KBASE_VM_STATE_INITIALIZING: + case KBASE_VM_STATE_STOPPED_GPU_REQUESTED: + /* + * Case stop() is received when in a GPU REQUESTED state, it + * means that the granted() was missed so the GPU needs to be + * requested again. + */ + dev_dbg(kbdev->dev, + "GPU stop while already stopped with GPU requested"); + kbase_arbif_gpu_stopped(kbdev, true); + start_request_timer(kbdev); + break; default: dev_warn(kbdev->dev, "GPU_STOP when not expected - state %s\n", kbase_arbiter_pm_vm_state_str(arb_vm_state->vm_state)); @@ -656,9 +668,20 @@ static void kbase_gpu_lost(struct kbase_device *kbdev) break; case KBASE_VM_STATE_SUSPENDED: case KBASE_VM_STATE_STOPPED: - case KBASE_VM_STATE_STOPPED_GPU_REQUESTED: dev_dbg(kbdev->dev, "GPU lost while already stopped"); break; + case KBASE_VM_STATE_INITIALIZING: + case KBASE_VM_STATE_STOPPED_GPU_REQUESTED: + /* + * Case lost() is received when in a GPU REQUESTED state, it + * means that the granted() and stop() were missed so the GPU + * needs to be requested again. Very unlikely to happen. 
+ */ + dev_dbg(kbdev->dev, + "GPU lost while already stopped with GPU requested"); + kbase_arbif_gpu_request(kbdev); + start_request_timer(kbdev); + break; case KBASE_VM_STATE_SUSPEND_WAIT_FOR_GRANT: dev_dbg(kbdev->dev, "GPU lost while waiting to suspend"); kbase_arbiter_pm_vm_set_state(kbdev, KBASE_VM_STATE_SUSPENDED); @@ -1020,8 +1043,8 @@ int kbase_arbiter_pm_ctx_active_handle_suspend(struct kbase_device *kbdev, /** * kbase_arbiter_pm_update_gpu_freq() - Updates GPU clock frequency received * from arbiter. - * @arb_freq - Pointer to struchture holding GPU clock frequenecy data - * @freq - New frequency value in KHz + * @arb_freq: Pointer to struchture holding GPU clock frequenecy data + * @freq: New frequency value in KHz */ void kbase_arbiter_pm_update_gpu_freq(struct kbase_arbiter_freq *arb_freq, uint32_t freq) @@ -1045,8 +1068,8 @@ void kbase_arbiter_pm_update_gpu_freq(struct kbase_arbiter_freq *arb_freq, /** * enumerate_arb_gpu_clk() - Enumerate a GPU clock on the given index - * @kbdev - kbase_device pointer - * @index - GPU clock index + * @kbdev: kbase_device pointer + * @index: GPU clock index * * Returns pointer to structure holding GPU clock frequency data reported from * arbiter, only index 0 is valid. @@ -1061,8 +1084,8 @@ static void *enumerate_arb_gpu_clk(struct kbase_device *kbdev, /** * get_arb_gpu_clk_rate() - Get the current rate of GPU clock frequency value - * @kbdev - kbase_device pointer - * @index - GPU clock index + * @kbdev: kbase_device pointer + * @index: GPU clock index * * Returns the GPU clock frequency value saved when gpu is granted from arbiter */ @@ -1082,9 +1105,9 @@ static unsigned long get_arb_gpu_clk_rate(struct kbase_device *kbdev, /** * arb_gpu_clk_notifier_register() - Register a clock rate change notifier. - * @kbdev - kbase_device pointer - * @gpu_clk_handle - Handle unique to the enumerated GPU clock - * @nb - notifier block containing the callback function pointer + * @kbdev: kbase_device pointer + * @gpu_clk_handle: Handle unique to the enumerated GPU clock + * @nb: notifier block containing the callback function pointer * * Returns 0 on success, negative error code otherwise. * @@ -1108,9 +1131,9 @@ static int arb_gpu_clk_notifier_register(struct kbase_device *kbdev, /** * gpu_clk_notifier_unregister() - Unregister clock rate change notifier - * @kbdev - kbase_device pointer - * @gpu_clk_handle - Handle unique to the enumerated GPU clock - * @nb - notifier block containing the callback function pointer + * @kbdev: kbase_device pointer + * @gpu_clk_handle: Handle unique to the enumerated GPU clock + * @nb: notifier block containing the callback function pointer * * This function pointer is used to unregister a callback function that * was previously registered to get notified of a frequency change of the diff --git a/mali_kbase/arbiter/mali_kbase_arbiter_pm.h b/mali_kbase/arbiter/mali_kbase_arbiter_pm.h index 1f570bb..091b431 100644 --- a/mali_kbase/arbiter/mali_kbase_arbiter_pm.h +++ b/mali_kbase/arbiter/mali_kbase_arbiter_pm.h @@ -20,7 +20,6 @@ */ /** - * @file * Mali arbiter power manager state machine and APIs */ @@ -108,6 +107,7 @@ int kbase_arbiter_pm_install_interrupts(struct kbase_device *kbdev); /** * kbase_arbiter_pm_vm_event() - Dispatch VM event to the state machine * @kbdev: The kbase device structure for the device (must be a valid pointer) + * @event: The event to dispatch * * The state machine function. 
Receives events and transitions states * according the event received and the current state diff --git a/mali_kbase/arbitration/Kconfig b/mali_kbase/arbitration/Kconfig index 95125f9..b4d6202 100644 --- a/mali_kbase/arbitration/Kconfig +++ b/mali_kbase/arbitration/Kconfig @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note OR MIT # # (C) COPYRIGHT 2012-2021 ARM Limited. All rights reserved. # @@ -19,7 +19,7 @@ # config MALI_XEN - bool "Enable Xen Interface reference code" + tristate "Enable Xen Interface reference code" depends on MALI_ARBITRATION && XEN default n help @@ -27,13 +27,5 @@ config MALI_XEN virtualization setup for Mali If unsure, say N. -config MALI_KUTF_ARBITRATION_TEST - bool "Enable Arbitration Test reference code" - depends on MALI_KUTF && MALI_ARBITRATION - default n - help - Enables the build of test modules used in the reference - virtualization setup for Mali - If unsure, say N. source "drivers/gpu/arm/midgard/arbitration/ptm/Kconfig" diff --git a/mali_kbase/arbitration/ptm/Kconfig b/mali_kbase/arbitration/ptm/Kconfig index e11e674..074ebd5 100644 --- a/mali_kbase/arbitration/ptm/Kconfig +++ b/mali_kbase/arbitration/ptm/Kconfig @@ -19,7 +19,7 @@ # config MALI_PARTITION_MANAGER - bool "Enable compilation of partition manager modules" + tristate "Enable compilation of partition manager modules" depends on MALI_ARBITRATION default n help diff --git a/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.c b/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.c index e542ccf..9587c70 100644 --- a/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.c @@ -22,12 +22,22 @@ #include "backend/gpu/mali_kbase_cache_policy_backend.h" #include <device/mali_kbase_device.h> + void kbase_cache_set_coherency_mode(struct kbase_device *kbdev, u32 mode) { kbdev->current_gpu_coherency_mode = mode; - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_COHERENCY_REG)) kbase_reg_write(kbdev, COHERENCY_ENABLE, mode); } +u32 kbase_cache_get_coherency_features(struct kbase_device *kbdev) +{ + u32 coherency_features; + + coherency_features = kbase_reg_read( + kbdev, GPU_CONTROL_REG(COHERENCY_FEATURES)); + + return coherency_features; +} + diff --git a/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h b/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h index 278125a..13c79d6 100644 --- a/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h +++ b/mali_kbase/backend/gpu/mali_kbase_cache_policy_backend.h @@ -26,12 +26,21 @@ #include <uapi/gpu/arm/midgard/mali_base_kernel.h> /** - * kbase_cache_set_coherency_mode() - Sets the system coherency mode - * in the GPU. - * @kbdev: Device pointer - * @mode: Coherency mode. COHERENCY_ACE/ACE_LITE - */ + * kbase_cache_set_coherency_mode() - Sets the system coherency mode + * in the GPU. + * @kbdev: Device pointer + * @mode: Coherency mode. COHERENCY_ACE/ACE_LITE + */ void kbase_cache_set_coherency_mode(struct kbase_device *kbdev, u32 mode); -#endif /* _KBASE_CACHE_POLICY_H_ */ +/** + * kbase_cache_get_coherency_features() - Get the coherency features + * in the GPU. 
+ * @kbdev: Device pointer + * + * Return: Register value to be returned + */ +u32 kbase_cache_get_coherency_features(struct kbase_device *kbdev); + +#endif /* _KBASE_CACHE_POLICY_BACKEND_H_ */ diff --git a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c index 6ad0f58..d6b9750 100644 --- a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c +++ b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.c @@ -26,6 +26,7 @@ #include <mali_kbase.h> #include <mali_kbase_config_defaults.h> #include <linux/clk.h> +#include <linux/pm_opp.h> #include <asm/div64.h> #include "backend/gpu/mali_kbase_clk_rate_trace_mgr.h" @@ -46,7 +47,7 @@ * Return: Pointer to clk trace ops if supported or NULL. */ static struct kbase_clk_rate_trace_op_conf * -get_clk_rate_trace_callbacks(struct kbase_device *kbdev __maybe_unused) +get_clk_rate_trace_callbacks(__maybe_unused struct kbase_device *kbdev) { /* base case */ struct kbase_clk_rate_trace_op_conf *callbacks = @@ -71,6 +72,49 @@ get_clk_rate_trace_callbacks(struct kbase_device *kbdev __maybe_unused) return callbacks; } +int kbase_lowest_gpu_freq_init(struct kbase_device *kbdev) +{ + /* Uses default reference frequency defined in below macro */ + u64 lowest_freq_khz = DEFAULT_REF_TIMEOUT_FREQ_KHZ; + + /* Only check lowest frequency in cases when OPPs are used and + * present in the device tree. + */ +#ifdef CONFIG_PM_OPP + struct dev_pm_opp *opp_ptr; + unsigned long found_freq = 0; + + /* find lowest frequency OPP */ + opp_ptr = dev_pm_opp_find_freq_ceil(kbdev->dev, &found_freq); + if (IS_ERR(opp_ptr)) { + dev_err(kbdev->dev, + "No OPPs found in device tree! Scaling timeouts using %llu kHz", + (unsigned long long)lowest_freq_khz); + } else { +#if KERNEL_VERSION(4, 11, 0) <= LINUX_VERSION_CODE + dev_pm_opp_put(opp_ptr); /* decrease OPP refcount */ +#endif + /* convert found frequency to KHz */ + found_freq /= 1000; + + /* If lowest frequency in OPP table is still higher + * than the reference, then keep the reference frequency + * as the one to use for scaling . + */ + if (found_freq < lowest_freq_khz) + lowest_freq_khz = found_freq; + } +#else + dev_err(kbdev->dev, + "No operating-points-v2 node or operating-points property in DT"); +#endif + + kbdev->lowest_gpu_freq_khz = lowest_freq_khz; + dev_dbg(kbdev->dev, "Lowest frequency identified is %llu kHz", + kbdev->lowest_gpu_freq_khz); + return 0; +} + static int gpu_clk_rate_change_notifier(struct notifier_block *nb, unsigned long event, void *data) { diff --git a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h index f7ec9d1..df30b63 100644 --- a/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h +++ b/mali_kbase/backend/gpu/mali_kbase_clk_rate_trace_mgr.h @@ -61,6 +61,21 @@ struct kbase_clk_data { int kbase_clk_rate_trace_manager_init(struct kbase_device *kbdev); /** + * kbase_init_lowest_gpu_freq() - Find the lowest frequency that the GPU can + * run as using the device tree, and save this + * within kbdev. + * + * This function could be called from kbase_clk_rate_trace_manager_init, + * but is left separate as it can be called as soon as + * dev_pm_opp_of_add_table() has been called to initialize the OPP table. + * + * @kbdev: Pointer to kbase device. + * + * Return: 0 in any case. + */ +int kbase_lowest_gpu_freq_init(struct kbase_device *kbdev); + +/** * kbase_clk_rate_trace_manager_term - Terminate GPU clock rate trace manager. 
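Aside: per the kernel-doc just added for kbase_lowest_gpu_freq_init(), the helper only needs the OPP table to be populated, so a probe path could call it straight after dev_pm_opp_of_add_table(). The snippet below is a hedged usage sketch, not kbase code: the probe helper name is hypothetical and error unwinding is abbreviated.

#include <mali_kbase.h>
#include <linux/pm_opp.h>

static int example_probe_freq_setup(struct kbase_device *kbdev)
{
	int err;

	/* Parse the operating-points / operating-points-v2 node from DT. */
	err = dev_pm_opp_of_add_table(kbdev->dev);
	if (err)
		return err;

	/* OPP table is now available, so the lowest frequency can be derived. */
	return kbase_lowest_gpu_freq_init(kbdev);
}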
* * @kbdev: Device pointer diff --git a/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c b/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c index 11088db..7b04286 100644 --- a/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_gpuprops_backend.c @@ -26,6 +26,7 @@ #include <mali_kbase.h> #include <device/mali_kbase_device.h> #include <backend/gpu/mali_kbase_pm_internal.h> +#include <backend/gpu/mali_kbase_cache_policy_backend.h> #include <mali_kbase_hwaccess_gpuprops.h> int kbase_backend_gpuprops_get(struct kbase_device *kbdev, @@ -146,7 +147,7 @@ int kbase_backend_gpuprops_get_curr_config(struct kbase_device *kbdev, curr_config_regdump->l2_present_hi = kbase_reg_read(kbdev, GPU_CONTROL_REG(L2_PRESENT_HI)); - if (WARN_ON(kbase_is_gpu_removed(kbdev))) + if (kbase_is_gpu_removed(kbdev)) return -EIO; return 0; @@ -156,30 +157,22 @@ int kbase_backend_gpuprops_get_curr_config(struct kbase_device *kbdev, int kbase_backend_gpuprops_get_features(struct kbase_device *kbdev, struct kbase_gpuprops_regdump *regdump) { - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_COHERENCY_REG)) { - u32 coherency_features; + u32 coherency_features; + int error = 0; - /* Ensure we can access the GPU registers */ - kbase_pm_register_access_enable(kbdev); + /* Ensure we can access the GPU registers */ + kbase_pm_register_access_enable(kbdev); - coherency_features = kbase_reg_read(kbdev, - GPU_CONTROL_REG(COHERENCY_FEATURES)); + coherency_features = kbase_cache_get_coherency_features(kbdev); - if (kbase_is_gpu_removed(kbdev)) - return -EIO; + if (kbase_is_gpu_removed(kbdev)) + error = -EIO; - regdump->coherency_features = coherency_features; + regdump->coherency_features = coherency_features; - /* We're done accessing the GPU registers for now. 
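Aside: the reworked kbase_backend_gpuprops_get_features() above captures the error instead of returning early, so the register-access disable always pairs with the earlier enable even when the Arbiter has removed the GPU mid-read. A minimal sketch of that bracketing pattern, using the real kbase helper names but an illustrative wrapper function:

#include <mali_kbase.h>
#include <backend/gpu/mali_kbase_cache_policy_backend.h>

static int read_coherency_features_safely(struct kbase_device *kbdev, u32 *out)
{
	int err = 0;

	kbase_pm_register_access_enable(kbdev);

	*out = kbase_cache_get_coherency_features(kbdev);
	if (kbase_is_gpu_removed(kbdev))
		err = -EIO;            /* GPU pulled by the Arbiter mid-read */

	/* Always executed, regardless of the error above. */
	kbase_pm_register_access_disable(kbdev);
	return err;
}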
*/ - kbase_pm_register_access_disable(kbdev); - } else { - /* Pre COHERENCY_FEATURES we only supported ACE_LITE */ - regdump->coherency_features = - COHERENCY_FEATURE_BIT(COHERENCY_NONE) | - COHERENCY_FEATURE_BIT(COHERENCY_ACE_LITE); - } + kbase_pm_register_access_disable(kbdev); - return 0; + return error; } int kbase_backend_gpuprops_get_l2_features(struct kbase_device *kbdev, @@ -190,13 +183,24 @@ int kbase_backend_gpuprops_get_l2_features(struct kbase_device *kbdev, GPU_CONTROL_REG(L2_FEATURES)); u32 l2_config = kbase_reg_read(kbdev, GPU_CONTROL_REG(L2_CONFIG)); + u32 asn_hash[ASN_HASH_COUNT] = { + 0, + }; + int i; + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_ASN_HASH)) { + for (i = 0; i < ASN_HASH_COUNT; i++) + asn_hash[i] = kbase_reg_read( + kbdev, GPU_CONTROL_REG(ASN_HASH(i))); + } if (kbase_is_gpu_removed(kbdev)) return -EIO; regdump->l2_features = l2_features; regdump->l2_config = l2_config; + for (i = 0; i < ASN_HASH_COUNT; i++) + regdump->l2_asn_hash[i] = asn_hash[i]; } return 0; diff --git a/mali_kbase/backend/gpu/mali_kbase_instr_backend.c b/mali_kbase/backend/gpu/mali_kbase_instr_backend.c index d7edf30..90cc537 100644 --- a/mali_kbase/backend/gpu/mali_kbase_instr_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_instr_backend.c @@ -53,6 +53,12 @@ int kbase_instr_hwcnt_enable_internal(struct kbase_device *kbdev, goto out_err; } + if (kbase_is_gpu_removed(kbdev)) { + /* GPU has been removed by Arbiter */ + spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); + goto out_err; + } + /* Enable interrupt */ irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), irq_mask | @@ -152,6 +158,14 @@ int kbase_instr_hwcnt_disable_internal(struct kbase_context *kctx) kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_DISABLED; kbdev->hwcnt.backend.triggered = 0; + if (kbase_is_gpu_removed(kbdev)) { + /* GPU has been removed by Arbiter */ + spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); + err = 0; + goto out; + } + /* Disable interrupt */ irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), @@ -195,6 +209,11 @@ int kbase_instr_hwcnt_request_dump(struct kbase_context *kctx) goto unlock; } + if (kbase_is_gpu_removed(kbdev)) { + /* GPU has been removed by Arbiter */ + goto unlock; + } + kbdev->hwcnt.backend.triggered = 0; /* Mark that we're dumping - the PF handler can signal that we faulted @@ -310,6 +329,11 @@ int kbase_instr_hwcnt_clear(struct kbase_context *kctx) KBASE_INSTR_STATE_IDLE) goto out; + if (kbase_is_gpu_removed(kbdev)) { + /* GPU has been removed by Arbiter */ + goto out; + } + /* Clear the counters */ KBASE_KTRACE_ADD(kbdev, CORE_GPU_PRFCNT_CLEAR, NULL, 0); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_hw.c b/mali_kbase/backend/gpu/mali_kbase_jm_hw.c index ae0377f..001efd9 100644 --- a/mali_kbase/backend/gpu/mali_kbase_jm_hw.c +++ b/mali_kbase/backend/gpu/mali_kbase_jm_hw.c @@ -48,18 +48,13 @@ static u64 kbase_job_write_affinity(struct kbase_device *kbdev, int js, const u64 limited_core_mask) { u64 affinity; + bool skip_affinity_check = false; if ((core_req & (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T)) == BASE_JD_REQ_T) { - /* Tiler-only atom */ - /* If the hardware supports XAFFINITY then we'll only enable - * the tiler (which is the default so this is a no-op), - * otherwise enable shader core 0. 
- */ - if (!kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_XAFFINITY)) - affinity = 1; - else - affinity = 0; + /* Tiler-only atom, affinity value can be programed as 0 */ + affinity = 0; + skip_affinity_check = true; } else if ((core_req & (BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP))) { unsigned int num_core_groups = kbdev->gpu_props.num_core_groups; @@ -89,7 +84,7 @@ static u64 kbase_job_write_affinity(struct kbase_device *kbdev, affinity = kbasep_apply_limited_core_mask(kbdev, affinity, limited_core_mask); } - if (unlikely(!affinity)) { + if (unlikely(!affinity && !skip_affinity_check)) { #ifdef CONFIG_MALI_DEBUG u64 shaders_ready = kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_SHADER); @@ -251,18 +246,13 @@ void kbase_job_hw_submit(struct kbase_device *kbdev, (katom->core_req & BASE_JD_REQ_END_RENDERPASS)) cfg |= JS_CONFIG_DISABLE_DESCRIPTOR_WR_BK; - if (kbase_hw_has_feature(kbdev, - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION)) { - if (!kbdev->hwaccess.backend.slot_rb[js].job_chain_flag) { - cfg |= JS_CONFIG_JOB_CHAIN_FLAG; - katom->atom_flags |= KBASE_KATOM_FLAGS_JOBCHAIN; - kbdev->hwaccess.backend.slot_rb[js].job_chain_flag = - true; - } else { - katom->atom_flags &= ~KBASE_KATOM_FLAGS_JOBCHAIN; - kbdev->hwaccess.backend.slot_rb[js].job_chain_flag = - false; - } + if (!kbdev->hwaccess.backend.slot_rb[js].job_chain_flag) { + cfg |= JS_CONFIG_JOB_CHAIN_FLAG; + katom->atom_flags |= KBASE_KATOM_FLAGS_JOBCHAIN; + kbdev->hwaccess.backend.slot_rb[js].job_chain_flag = true; + } else { + katom->atom_flags &= ~KBASE_KATOM_FLAGS_JOBCHAIN; + kbdev->hwaccess.backend.slot_rb[js].job_chain_flag = false; } kbase_reg_write(kbdev, JOB_SLOT_REG(js, JS_CONFIG_NEXT), cfg); @@ -621,25 +611,17 @@ void kbasep_job_slot_soft_or_hard_stop_do_action(struct kbase_device *kbdev, /* Mark the point where we issue the soft-stop command */ KBASE_TLSTREAM_TL_EVENT_ATOM_SOFTSTOP_ISSUE(kbdev, target_katom); - if (kbase_hw_has_feature( - kbdev, - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION)) { - action = (target_katom->atom_flags & - KBASE_KATOM_FLAGS_JOBCHAIN) ? - JS_COMMAND_SOFT_STOP_1 : - JS_COMMAND_SOFT_STOP_0; - } + action = (target_katom->atom_flags & + KBASE_KATOM_FLAGS_JOBCHAIN) ? + JS_COMMAND_SOFT_STOP_1 : + JS_COMMAND_SOFT_STOP_0; } else if (action == JS_COMMAND_HARD_STOP) { target_katom->atom_flags |= KBASE_KATOM_FLAG_BEEN_HARD_STOPPED; - if (kbase_hw_has_feature( - kbdev, - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION)) { - action = (target_katom->atom_flags & - KBASE_KATOM_FLAGS_JOBCHAIN) ? - JS_COMMAND_HARD_STOP_1 : - JS_COMMAND_HARD_STOP_0; - } + action = (target_katom->atom_flags & + KBASE_KATOM_FLAGS_JOBCHAIN) ? + JS_COMMAND_HARD_STOP_1 : + JS_COMMAND_HARD_STOP_0; } kbase_reg_write(kbdev, JOB_SLOT_REG(js, JS_COMMAND), action); @@ -725,40 +707,11 @@ void kbase_backend_jm_kill_running_jobs_from_kctx(struct kbase_context *kctx) kbase_job_slot_hardstop(kctx, i, NULL); } -/** - * kbase_is_existing_atom_submitted_later_than_ready - * @ready: sequence number of the ready atom - * @existing: sequence number of the existing atom - * - * Returns true if the existing atom has been submitted later than the - * ready atom. It is used to understand if an atom that is ready has been - * submitted earlier than the currently running atom, so that the currently - * running atom should be preempted to allow the ready atom to run. - */ -static inline bool kbase_is_existing_atom_submitted_later_than_ready(u64 ready, u64 existing) -{ - /* No seq_nr set? 
*/ - if (!ready || !existing) - return false; - - /* Efficiently handle the unlikely case of wrapping. - * The following code assumes that the delta between the sequence number - * of the two atoms is less than INT64_MAX. - * In the extremely unlikely case where the delta is higher, the comparison - * defaults for no preemption. - * The code also assumes that the conversion from unsigned to signed types - * works because the signed integers are 2's complement. - */ - return (s64)(ready - existing) < 0; -} - void kbase_job_slot_ctx_priority_check_locked(struct kbase_context *kctx, struct kbase_jd_atom *target_katom) { struct kbase_device *kbdev; - int js = target_katom->slot_nr; - int priority = target_katom->sched_priority; - int seq_nr = target_katom->seq_nr; + int target_js = target_katom->slot_nr; int i; bool stop_sent = false; @@ -768,26 +721,21 @@ void kbase_job_slot_ctx_priority_check_locked(struct kbase_context *kctx, lockdep_assert_held(&kbdev->hwaccess_lock); - for (i = 0; i < kbase_backend_nr_atoms_on_slot(kbdev, js); i++) { - struct kbase_jd_atom *katom; - - katom = kbase_gpu_inspect(kbdev, js, i); - if (!katom) - continue; + for (i = 0; i < kbase_backend_nr_atoms_on_slot(kbdev, target_js); i++) { + struct kbase_jd_atom *slot_katom; - if ((kbdev->js_ctx_scheduling_mode == - KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE) && - (katom->kctx != kctx)) + slot_katom = kbase_gpu_inspect(kbdev, target_js, i); + if (!slot_katom) continue; - if ((katom->sched_priority > priority) || - (katom->kctx == kctx && kbase_is_existing_atom_submitted_later_than_ready(seq_nr, katom->seq_nr))) { + if (kbase_js_atom_runs_before(kbdev, target_katom, slot_katom, + KBASE_ATOM_ORDERING_FLAG_SEQNR)) { if (!stop_sent) KBASE_TLSTREAM_TL_ATTRIB_ATOM_PRIORITIZED( kbdev, target_katom); - kbase_job_slot_softstop(kbdev, js, katom); + kbase_job_slot_softstop(kbdev, target_js, slot_katom); stop_sent = true; } } diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c index b475d79..1906286 100644 --- a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c +++ b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c @@ -387,6 +387,9 @@ static void kbase_gpu_mark_atom_for_return(struct kbase_device *kbdev, { lockdep_assert_held(&kbdev->hwaccess_lock); + KBASE_KTRACE_ADD_JM_SLOT_INFO(kbdev, JM_MARK_FOR_RETURN_TO_JS, + katom->kctx, katom, katom->jc, + katom->slot_nr, katom->event_code); kbase_gpu_release_atom(kbdev, katom, NULL); katom->gpu_rb_state = KBASE_ATOM_GPU_RB_RETURN_TO_JS; } @@ -564,7 +567,7 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, kbdev->protected_mode_transition = true; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_ENTER_PROTECTED_HWCNT: /* See if we can get away with disabling hwcnt atomically */ kbdev->protected_mode_hwcnt_desired = false; @@ -607,7 +610,7 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, kbase_pm_update_cores_state_nolock(kbdev); /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_ENTER_PROTECTED_IDLE_L2: /* Avoid unnecessary waiting on non-ACE platforms. 
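Aside: the kbase_is_existing_atom_submitted_later_than_ready() helper removed from mali_kbase_jm_hw.c above (its seq_nr ordering now goes through kbase_js_atom_runs_before() with KBASE_ATOM_ORDERING_FLAG_SEQNR) relies on the standard signed-difference trick for comparing wrapping sequence numbers. A stand-alone userspace demonstration of that comparison:

#include <stdint.h>
#include <stdio.h>

/* True if 'existing' was submitted later than 'ready'; 0 means "no seq_nr set". */
static int submitted_later(uint64_t ready, uint64_t existing)
{
	if (!ready || !existing)
		return 0;
	/* Valid as long as the two values are less than 2^63 apart. */
	return (int64_t)(ready - existing) < 0;
}

int main(void)
{
	/* Ordinary case: existing (seq 10) was submitted after ready (seq 5). */
	printf("%d\n", submitted_later(5, 10));               /* prints 1 */
	/* Wrap-around case: ready wrapped to 2, existing is near UINT64_MAX. */
	printf("%d\n", submitted_later(2, UINT64_MAX - 3));   /* prints 0 */
	return 0;
}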
*/ if (kbdev->system_coherency == COHERENCY_ACE) { @@ -638,7 +641,7 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, KBASE_ATOM_ENTER_PROTECTED_SET_COHERENCY; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_ENTER_PROTECTED_SET_COHERENCY: /* * When entering into protected mode, we must ensure that the @@ -671,7 +674,7 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, return -EAGAIN; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_ENTER_PROTECTED_FINISHED: if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_TGOX_R1_1234)) { /* @@ -742,7 +745,7 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev, kbase_pm_update_cores_state_nolock(kbdev); /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_EXIT_PROTECTED_IDLE_L2: if (kbdev->pm.backend.l2_state != KBASE_L2_OFF) { /* @@ -755,7 +758,7 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev, KBASE_ATOM_EXIT_PROTECTED_RESET; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_EXIT_PROTECTED_RESET: /* Issue the reset to the GPU */ err = kbase_gpu_protected_mode_reset(kbdev); @@ -797,7 +800,7 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev, KBASE_ATOM_EXIT_PROTECTED_RESET_WAIT; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_EXIT_PROTECTED_RESET_WAIT: /* A GPU reset is issued when exiting protected mode. Once the * reset is done all atoms' state will also be reset. For this @@ -854,7 +857,7 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) KBASE_ATOM_GPU_RB_WAITING_PROTECTED_MODE_PREV; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_GPU_RB_WAITING_PROTECTED_MODE_PREV: if (kbase_gpu_check_secure_atoms(kbdev, !kbase_jd_katom_is_protected( @@ -874,7 +877,7 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) KBASE_ATOM_GPU_RB_WAITING_PROTECTED_MODE_TRANSITION; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_GPU_RB_WAITING_PROTECTED_MODE_TRANSITION: /* @@ -909,7 +912,7 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) KBASE_ATOM_GPU_RB_WAITING_FOR_CORE_AVAILABLE; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_GPU_RB_WAITING_FOR_CORE_AVAILABLE: if (katom[idx]->will_fail_event_code) { kbase_gpu_mark_atom_for_return(kbdev, @@ -936,6 +939,11 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) if (katom[idx]->event_code == BASE_JD_EVENT_PM_EVENT) { + KBASE_KTRACE_ADD_JM_SLOT_INFO( + kbdev, JM_MARK_FOR_RETURN_TO_JS, + katom[idx]->kctx, katom[idx], + katom[idx]->jc, js, + katom[idx]->event_code); katom[idx]->gpu_rb_state = KBASE_ATOM_GPU_RB_RETURN_TO_JS; break; @@ -948,7 +956,7 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) KBASE_ATOM_GPU_RB_READY; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_GPU_RB_READY: if (idx == 1) { @@ -994,7 +1002,7 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) KBASE_ATOM_GPU_RB_SUBMITTED; /* ***TRANSITION TO HIGHER STATE*** */ - /* fallthrough */ + fallthrough; case KBASE_ATOM_GPU_RB_SUBMITTED: /* Inform power management at start/finish of @@ -1037,9 +1045,55 @@ void kbase_backend_run_atom(struct kbase_device *kbdev, kbase_backend_slot_update(kbdev); } -#define HAS_DEP(katom) (katom->pre_dep || 
katom->atom_flags & \ - (KBASE_KATOM_FLAG_X_DEP_BLOCKED | KBASE_KATOM_FLAG_FAIL_BLOCKER)) +/** + * kbase_rb_atom_might_depend - determine if one atom in the slot ringbuffer + * might depend on another from the same kctx + * @katom_a: dependee atom + * @katom_b: atom to query + * + * This can be used on atoms that belong to different slot ringbuffers + * + * Return: true if @katom_b might depend on @katom_a, false if it cannot depend. + */ +static inline bool +kbase_rb_atom_might_depend(const struct kbase_jd_atom *katom_a, + const struct kbase_jd_atom *katom_b) +{ + if (katom_a->kctx != katom_b->kctx) + return false; + return (katom_b->pre_dep || + (katom_b->atom_flags & (KBASE_KATOM_FLAG_X_DEP_BLOCKED | + KBASE_KATOM_FLAG_FAIL_BLOCKER))); +} +/** + * kbase_gpu_irq_evict - evict a slot's JSn_HEAD_NEXT atom from the HW if it is + * related to a failed JSn_HEAD atom + * @kbdev kbase device + * @js job slot to check + * @completion_code completion code of the failed atom + * + * Note: 'STOPPED' atoms are considered 'failed', as they are in the HW, but + * unlike other failure codes we _can_ re-run them. + * + * This forms step 1 in a 2-step process of removing any related atoms from a + * slot's JSn_HEAD_NEXT (ringbuffer index 1), should there have + * been a 'failure' on an atom in JSn_HEAD (ringbuffer index 0). + * + * This step only removes the atoms from the HW, and marks them as + * (potentially) ready to run again. + * + * Step 2 is on marking the JSn_HEAD atom as complete + * (kbase_gpu_complete_hw()), to dequeue said atoms and return them to the JS + * as appropriate, or re-submit them. + * + * Hence, this function must evict at a minimum the atoms related to the atom + * in JSn_HEAD that kbase_gpu_complete_hw() will also dequeue. It is acceptable + * if this function evicts more atoms than kbase_gpu_complete_hw() dequeues, as + * the next kbase_backend_slot_update() will resubmit any remaining. + * + * Return: true if an atom was evicted, false otherwise. 
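Aside: the new kernel-doc above describes a two-step clean-up: step 1 (kbase_gpu_irq_evict) pulls the speculatively submitted JSn_HEAD_NEXT atom back off the hardware, step 2 (kbase_gpu_complete_hw) dequeues it and returns it to the job scheduler when the failed JSn_HEAD atom completes. The toy model below illustrates only that shape with a two-entry ringbuffer; it is not kbase code.

#include <stdbool.h>

enum rb_state { RB_READY, RB_SUBMITTED };

struct slot_rb {
	struct { bool used; enum rb_state state; } entry[2]; /* [0]=HEAD, [1]=HEAD_NEXT */
};

/* Step 1: on a failure IRQ, evict the speculatively submitted NEXT entry. */
static bool irq_evict(struct slot_rb *rb)
{
	if (rb->entry[1].used && rb->entry[1].state == RB_SUBMITTED) {
		rb->entry[1].state = RB_READY;   /* no longer on the HW, can re-run */
		return true;
	}
	return false;
}

/* Step 2: when HEAD completes with a failure, dequeue both entries so the
 * scheduler can return or resubmit them in the correct order.
 */
static void complete_head_failed(struct slot_rb *rb)
{
	rb->entry[0].used = false;
	if (rb->entry[1].used && rb->entry[1].state == RB_READY)
		rb->entry[1].used = false;       /* was evicted in step 1 */
}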
+ */ bool kbase_gpu_irq_evict(struct kbase_device *kbdev, int js, u32 completion_code) { @@ -1051,14 +1105,12 @@ bool kbase_gpu_irq_evict(struct kbase_device *kbdev, int js, katom = kbase_gpu_inspect(kbdev, js, 0); next_katom = kbase_gpu_inspect(kbdev, js, 1); - if (next_katom && katom->kctx == next_katom->kctx && - next_katom->gpu_rb_state == KBASE_ATOM_GPU_RB_SUBMITTED && - (HAS_DEP(next_katom) || next_katom->sched_priority == - katom->sched_priority) && - (kbase_reg_read(kbdev, JOB_SLOT_REG(js, JS_HEAD_NEXT_LO)) - != 0 || - kbase_reg_read(kbdev, JOB_SLOT_REG(js, JS_HEAD_NEXT_HI)) - != 0)) { + if (next_katom && + next_katom->gpu_rb_state == KBASE_ATOM_GPU_RB_SUBMITTED && + (kbase_rb_atom_might_depend(katom, next_katom) || + kbase_js_atom_runs_before(kbdev, katom, next_katom, 0u)) && + (kbase_reg_read(kbdev, JOB_SLOT_REG(js, JS_HEAD_NEXT_LO)) != 0 || + kbase_reg_read(kbdev, JOB_SLOT_REG(js, JS_HEAD_NEXT_HI)) != 0)) { kbase_reg_write(kbdev, JOB_SLOT_REG(js, JS_COMMAND_NEXT), JS_COMMAND_NOP); next_katom->gpu_rb_state = KBASE_ATOM_GPU_RB_READY; @@ -1083,6 +1135,30 @@ bool kbase_gpu_irq_evict(struct kbase_device *kbdev, int js, return false; } +/** + * kbase_gpu_complete_hw - complete the atom in a slot's JSn_HEAD + * @kbdev kbase device + * @js job slot to check + * @completion_code completion code of the completed atom + * @job_tail value read from JSn_TAIL, for STOPPED atoms + * @end_timestamp pointer to approximate ktime value when the katom completed + * + * Among other operations, this also executes step 2 of a 2-step process of + * removing any related atoms from a slot's JSn_HEAD_NEXT (ringbuffer index 1), + * should there have been a 'failure' on an atom in JSn_HEAD (ringbuffer index + * 0). The first step is done in kbase_gpu_irq_evict(). + * + * Note: 'STOPPED' atoms are considered 'failed', as they are in the HW, but + * unlike other failure codes we _can_ re-run them. + * + * When the JSn_HEAD atom is considered to be 'failed', then this will dequeue + * and return to the JS some (usually all) of the atoms evicted from the HW + * during the kbase_gpu_irq_evict() for that JSn_HEAD atom. If it dequeues an + * atom, that atom must not have been running or must already be evicted, as + * otherwise we would be in the incorrect state of having an atom both running + * on the HW and returned to the JS. + */ + void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, u32 completion_code, u64 job_tail, @@ -1133,9 +1209,8 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, * registers by kbase_gpu_soft_hard_stop_slot(), to ensure that * the atoms on this slot are returned in the correct order. 
*/ - if (next_katom && katom->kctx == next_katom->kctx && - next_katom->sched_priority == - katom->sched_priority) { + if (next_katom && + kbase_js_atom_runs_before(kbdev, katom, next_katom, 0u)) { WARN_ON(next_katom->gpu_rb_state == KBASE_ATOM_GPU_RB_SUBMITTED); kbase_gpu_dequeue_atom(kbdev, js, end_timestamp); @@ -1145,12 +1220,14 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, struct kbasep_js_device_data *js_devdata = &kbdev->js_data; int i; - if (!kbase_ctx_flag(katom->kctx, KCTX_DYING)) + if (!kbase_ctx_flag(katom->kctx, KCTX_DYING)) { dev_warn(kbdev->dev, "error detected from slot %d, job status 0x%08x (%s)", js, completion_code, kbase_gpu_exception_name( completion_code)); + } + #if KBASE_KTRACE_DUMP_ON_JOB_SLOT_ERROR != 0 KBASE_KTRACE_DUMP(kbdev); #endif @@ -1168,18 +1245,17 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, struct kbase_jd_atom *katom_idx1 = kbase_gpu_inspect(kbdev, i, 1); - if (katom_idx0 && katom_idx0->kctx == katom->kctx && - HAS_DEP(katom_idx0) && - katom_idx0->gpu_rb_state != - KBASE_ATOM_GPU_RB_SUBMITTED) { + if (katom_idx0 && + kbase_rb_atom_might_depend(katom, katom_idx0) && + katom_idx0->gpu_rb_state != + KBASE_ATOM_GPU_RB_SUBMITTED) { /* Dequeue katom_idx0 from ringbuffer */ kbase_gpu_dequeue_atom(kbdev, i, end_timestamp); - if (katom_idx1 && - katom_idx1->kctx == katom->kctx - && HAS_DEP(katom_idx1) && - katom_idx0->gpu_rb_state != - KBASE_ATOM_GPU_RB_SUBMITTED) { + if (katom_idx1 && kbase_rb_atom_might_depend( + katom, katom_idx1) && + katom_idx0->gpu_rb_state != + KBASE_ATOM_GPU_RB_SUBMITTED) { /* Dequeue katom_idx1 from ringbuffer */ kbase_gpu_dequeue_atom(kbdev, i, end_timestamp); @@ -1192,11 +1268,10 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, katom_idx0->event_code = BASE_JD_EVENT_STOPPED; kbase_jm_return_atom_to_js(kbdev, katom_idx0); - } else if (katom_idx1 && - katom_idx1->kctx == katom->kctx && - HAS_DEP(katom_idx1) && - katom_idx1->gpu_rb_state != - KBASE_ATOM_GPU_RB_SUBMITTED) { + } else if (katom_idx1 && kbase_rb_atom_might_depend( + katom, katom_idx1) && + katom_idx1->gpu_rb_state != + KBASE_ATOM_GPU_RB_SUBMITTED) { /* Can not dequeue this atom yet - will be * dequeued when atom at idx0 completes */ @@ -1369,17 +1444,63 @@ void kbase_backend_reset(struct kbase_device *kbdev, ktime_t *end_timestamp) kbase_pm_protected_override_disable(kbdev); } +/** + * should_stop_next_atom - given a soft/hard stop action, determine if the next + * atom on a slot should be stopped + * @kbdev: kbase devices + * @head_katom: atom currently in the JSn_HEAD + * @next_katom: atom currently in the JSn_HEAD_NEXT + * @action: JS_COMMAND_<...> action for soft/hard-stop + * + * This is used in cases where @head_katom is the target of the soft/hard-stop. + * It only makes sense to call this when @head_katom and @next_katom are from + * the same slot. 
+ * + * Return: true if @next_katom should also be stopped with the given action, + * false otherwise + */ +static bool should_stop_next_atom(struct kbase_device *kbdev, + const struct kbase_jd_atom *head_katom, + const struct kbase_jd_atom *next_katom, + u32 action) +{ + bool ret = false; + u32 hw_action = action & JS_COMMAND_MASK; + + switch (hw_action) { + case JS_COMMAND_SOFT_STOP: + ret = kbase_js_atom_runs_before(kbdev, head_katom, next_katom, + 0u); + break; + case JS_COMMAND_HARD_STOP: + /* Unlike soft-stop, a hard-stop targeting a particular atom + * should not cause atoms from unrelated contexts to be + * removed + */ + ret = (head_katom->kctx == next_katom->kctx); + break; + default: + /* Other stop actions are possible, but the driver should not + * be generating them at this point in the call chain + */ + WARN(1, "Unexpected stop action: 0x%.8x", hw_action); + break; + } + return ret; +} + static inline void kbase_gpu_stop_atom(struct kbase_device *kbdev, int js, struct kbase_jd_atom *katom, u32 action) { + struct kbase_context *kctx = katom->kctx; u32 hw_action = action & JS_COMMAND_MASK; kbase_job_check_enter_disjoint(kbdev, action, katom->core_req, katom); kbasep_job_slot_soft_or_hard_stop_do_action(kbdev, js, hw_action, katom->core_req, katom); - katom->kctx->blocked_js[js][katom->sched_priority] = true; + kbase_jsctx_slot_prio_blocked_set(kctx, js, katom->sched_priority); } static inline void kbase_gpu_remove_atom(struct kbase_device *kbdev, @@ -1387,11 +1508,14 @@ static inline void kbase_gpu_remove_atom(struct kbase_device *kbdev, u32 action, bool disjoint) { + struct kbase_context *kctx = katom->kctx; + lockdep_assert_held(&kbdev->hwaccess_lock); katom->event_code = BASE_JD_EVENT_REMOVED_FROM_NEXT; kbase_gpu_mark_atom_for_return(kbdev, katom); - katom->kctx->blocked_js[katom->slot_nr][katom->sched_priority] = true; + kbase_jsctx_slot_prio_blocked_set(kctx, katom->slot_nr, + katom->sched_priority); if (disjoint) kbase_job_check_enter_disjoint(kbdev, action, katom->core_req, @@ -1419,7 +1543,9 @@ bool kbase_backend_soft_hard_stop_slot(struct kbase_device *kbdev, u32 action) { struct kbase_jd_atom *katom_idx0; + struct kbase_context *kctx_idx0 = NULL; struct kbase_jd_atom *katom_idx1; + struct kbase_context *kctx_idx1 = NULL; bool katom_idx0_valid, katom_idx1_valid; @@ -1433,30 +1559,32 @@ bool kbase_backend_soft_hard_stop_slot(struct kbase_device *kbdev, katom_idx0 = kbase_gpu_inspect(kbdev, js, 0); katom_idx1 = kbase_gpu_inspect(kbdev, js, 1); - if (katom_idx0) + if (katom_idx0) { + kctx_idx0 = katom_idx0->kctx; prio_idx0 = katom_idx0->sched_priority; - if (katom_idx1) + } + if (katom_idx1) { + kctx_idx1 = katom_idx1->kctx; prio_idx1 = katom_idx1->sched_priority; + } if (katom) { katom_idx0_valid = (katom_idx0 == katom); - /* If idx0 is to be removed and idx1 is on the same context, - * then idx1 must also be removed otherwise the atoms might be - * returned out of order - */ if (katom_idx1) - katom_idx1_valid = (katom_idx1 == katom) || - (katom_idx0_valid && - (katom_idx0->kctx == - katom_idx1->kctx)); + katom_idx1_valid = (katom_idx1 == katom); else katom_idx1_valid = false; } else { - katom_idx0_valid = - (katom_idx0 && (!kctx || katom_idx0->kctx == kctx)); - katom_idx1_valid = - (katom_idx1 && (!kctx || katom_idx1->kctx == kctx)); + katom_idx0_valid = (katom_idx0 && (!kctx || kctx_idx0 == kctx)); + katom_idx1_valid = (katom_idx1 && (!kctx || kctx_idx1 == kctx)); } + /* If there's an atom in JSn_HEAD_NEXT that we haven't already decided + * to stop, but we're 
stopping the JSn_HEAD atom, see if they are + * related/ordered in some way that would require the same stop action + */ + if (!katom_idx1_valid && katom_idx0_valid && katom_idx1) + katom_idx1_valid = should_stop_next_atom(kbdev, katom_idx0, + katom_idx1, action); if (katom_idx0_valid) stop_x_dep_idx0 = should_stop_x_dep_slot(katom_idx0); @@ -1472,14 +1600,15 @@ bool kbase_backend_soft_hard_stop_slot(struct kbase_device *kbdev, katom_idx1->event_code = BASE_JD_EVENT_REMOVED_FROM_NEXT; kbase_jm_return_atom_to_js(kbdev, katom_idx1); - katom_idx1->kctx->blocked_js[js][prio_idx1] = - true; + kbase_jsctx_slot_prio_blocked_set(kctx_idx1, js, + prio_idx1); } katom_idx0->event_code = BASE_JD_EVENT_REMOVED_FROM_NEXT; kbase_jm_return_atom_to_js(kbdev, katom_idx0); - katom_idx0->kctx->blocked_js[js][prio_idx0] = true; + kbase_jsctx_slot_prio_blocked_set(kctx_idx0, js, + prio_idx0); } else { /* katom_idx0 is on GPU */ if (katom_idx1_valid && katom_idx1->gpu_rb_state == diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c index cc791df..5df7f67 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c @@ -32,6 +32,9 @@ #include <mali_kbase_hwaccess_jm.h> #include <backend/gpu/mali_kbase_js_internal.h> #include <backend/gpu/mali_kbase_jm_internal.h> +#else +#include <linux/pm_runtime.h> +#include <mali_kbase_reset_gpu.h> #endif /* !MALI_USE_CSF */ #include <mali_kbase_hwcnt_context.h> #include <backend/gpu/mali_kbase_pm_internal.h> @@ -69,6 +72,10 @@ int kbase_pm_runtime_init(struct kbase_device *kbdev) callbacks->power_runtime_idle_callback; kbdev->pm.backend.callback_soft_reset = callbacks->soft_reset_callback; + kbdev->pm.backend.callback_power_runtime_gpu_idle = + callbacks->power_runtime_gpu_idle_callback; + kbdev->pm.backend.callback_power_runtime_gpu_active = + callbacks->power_runtime_gpu_active_callback; if (callbacks->power_runtime_init_callback) return callbacks->power_runtime_init_callback(kbdev); @@ -86,6 +93,8 @@ int kbase_pm_runtime_init(struct kbase_device *kbdev) kbdev->pm.backend.callback_power_runtime_off = NULL; kbdev->pm.backend.callback_power_runtime_idle = NULL; kbdev->pm.backend.callback_soft_reset = NULL; + kbdev->pm.backend.callback_power_runtime_gpu_idle = NULL; + kbdev->pm.backend.callback_power_runtime_gpu_active = NULL; return 0; } @@ -120,10 +129,10 @@ void kbase_pm_register_access_disable(struct kbase_device *kbdev) callbacks = (struct kbase_pm_callback_conf *)POWER_MANAGEMENT_CALLBACKS; + kbdev->pm.backend.gpu_powered = false; + if (callbacks) callbacks->power_off_callback(kbdev); - - kbdev->pm.backend.gpu_powered = false; } int kbase_hwaccess_pm_init(struct kbase_device *kbdev) @@ -193,6 +202,7 @@ int kbase_hwaccess_pm_init(struct kbase_device *kbdev) kbase_pm_hwcnt_disable_worker); kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx); + if (IS_ENABLED(CONFIG_MALI_HW_ERRATA_1485982_NOT_AFFECTED)) { kbdev->pm.backend.l2_always_on = false; kbdev->pm.backend.gpu_clock_slow_down_wa = false; @@ -263,6 +273,76 @@ void kbase_pm_do_poweron(struct kbase_device *kbdev, bool is_resume) */ } +static void pm_handle_power_off(struct kbase_device *kbdev) +{ + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; +#if MALI_USE_CSF + enum kbase_mcu_state mcu_state; +#endif + unsigned long flags; + + lockdep_assert_held(&kbdev->pm.lock); + + if (backend->poweron_required) + return; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); +#if MALI_USE_CSF && 
defined(KBASE_PM_RUNTIME) + if (kbdev->pm.backend.gpu_wakeup_override ) { + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + return; + } +#endif + WARN_ON(backend->shaders_state != + KBASE_SHADERS_OFF_CORESTACK_OFF || + backend->l2_state != KBASE_L2_OFF); +#if MALI_USE_CSF + mcu_state = backend->mcu_state; + WARN_ON(!kbase_pm_is_mcu_inactive(kbdev, mcu_state)); +#endif + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + if (backend->callback_power_runtime_gpu_idle) { + WARN_ON(backend->gpu_idled); + backend->callback_power_runtime_gpu_idle(kbdev); + backend->gpu_idled = true; + return; + } +#endif + + /* Disable interrupts and turn the clock off */ + if (!kbase_pm_clock_off(kbdev)) { + /* + * Page/bus faults are pending, must drop locks to + * process. Interrupts are disabled so no more faults + * should be generated at this point. + */ + kbase_pm_unlock(kbdev); + kbase_flush_mmu_wqs(kbdev); + kbase_pm_lock(kbdev); + +#ifdef CONFIG_MALI_ARBITER_SUPPORT + /* poweron_required may have changed while pm lock + * was released. + */ + if (kbase_pm_is_gpu_lost(kbdev)) + backend->poweron_required = false; +#endif + + /* Turn off clock now that fault have been handled. We + * dropped locks so poweron_required may have changed - + * power back on if this is the case (effectively only + * re-enabling of the interrupts would be done in this + * case, as the clocks to GPU were not withdrawn yet). + */ + if (backend->poweron_required) + kbase_pm_clock_on(kbdev, false); + else + WARN_ON(!kbase_pm_clock_off(kbdev)); + } +} + static void kbase_pm_gpu_poweroff_wait_wq(struct work_struct *data) { struct kbase_device *kbdev = container_of(data, struct kbase_device, @@ -271,6 +351,8 @@ static void kbase_pm_gpu_poweroff_wait_wq(struct work_struct *data) struct kbase_pm_backend_data *backend = &pm->backend; unsigned long flags; + KBASE_KTRACE_ADD(kbdev, PM_POWEROFF_WAIT_WQ, NULL, 0); + #if !MALI_USE_CSF /* Wait for power transitions to complete. We do this with no locks held * so that we don't deadlock with any pending workqueues. @@ -285,46 +367,7 @@ static void kbase_pm_gpu_poweroff_wait_wq(struct work_struct *data) backend->poweron_required = false; #endif - if (!backend->poweron_required) { - unsigned long flags; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - WARN_ON(backend->shaders_state != - KBASE_SHADERS_OFF_CORESTACK_OFF || - backend->l2_state != KBASE_L2_OFF); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - /* Disable interrupts and turn the clock off */ - if (!kbase_pm_clock_off(kbdev)) { - /* - * Page/bus faults are pending, must drop locks to - * process. Interrupts are disabled so no more faults - * should be generated at this point. - */ - kbase_pm_unlock(kbdev); - kbase_flush_mmu_wqs(kbdev); - kbase_pm_lock(kbdev); - -#ifdef CONFIG_MALI_ARBITER_SUPPORT - /* poweron_required may have changed while pm lock - * was released. - */ - if (kbase_pm_is_gpu_lost(kbdev)) - backend->poweron_required = false; -#endif - - /* Turn off clock now that fault have been handled. We - * dropped locks so poweron_required may have changed - - * power back on if this is the case (effectively only - * re-enabling of the interrupts would be done in this - * case, as the clocks to GPU were not withdrawn yet). 
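Aside: the pm_handle_power_off() path above uses a classic drop-lock / flush / re-acquire / re-check pattern when the clock cannot be turned off because faults are pending. The sketch below models only that shape with a pthread mutex standing in for the kbase pm lock and a stub standing in for kbase_flush_mmu_wqs(); it is illustrative, not the driver code.

#include <pthread.h>
#include <stdbool.h>

struct dev {
	pthread_mutex_t lock;
	bool poweron_required;   /* may change while the lock is dropped */
};

static bool try_clock_off(struct dev *d) { (void)d; return false; /* faults pending in this toy */ }
static void clock_on(struct dev *d)      { (void)d; }
static void flush_fault_work(struct dev *d) { (void)d; }

static void handle_power_off(struct dev *d)   /* called with d->lock held */
{
	if (try_clock_off(d))
		return;

	/* Faults are pending: drop the lock so the fault workers can complete. */
	pthread_mutex_unlock(&d->lock);
	flush_fault_work(d);
	pthread_mutex_lock(&d->lock);

	/* Re-evaluate: the decision to power off may have been overtaken. */
	if (d->poweron_required)
		clock_on(d);
	else
		(void)try_clock_off(d);
}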
- */ - if (backend->poweron_required) - kbase_pm_clock_on(kbdev, false); - else - WARN_ON(!kbase_pm_clock_off(kbdev)); - } - } + pm_handle_power_off(kbdev); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); backend->poweroff_wait_in_progress = false; @@ -512,6 +555,74 @@ static void kbase_pm_hwcnt_disable_worker(struct work_struct *data) spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) +/** + * kbase_pm_do_poweroff_sync - Do the synchronous power down of GPU + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function is called at the time of system suspend or device unload + * to power down the GPU synchronously. This is needed as the power down of GPU + * would usually happen from the runtime suspend callback function (if gpu_active + * and gpu_idle callbacks are used) and runtime suspend operation is disabled + * when system suspend takes place. + * The function first waits for the @gpu_poweroff_wait_work to complete, which + * could have been enqueued after the last PM reference was released. + */ +static void kbase_pm_do_poweroff_sync(struct kbase_device *kbdev) +{ + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; + unsigned long flags; + + WARN_ON(kbdev->pm.active_count); + + kbase_pm_wait_for_poweroff_work_complete(kbdev); + + kbase_pm_lock(kbdev); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(backend->poweroff_wait_in_progress); + if (backend->gpu_powered) { + int ret; + + backend->mcu_desired = false; + backend->l2_desired = false; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + ret = kbase_pm_wait_for_desired_state(kbdev); + if (ret) { + dev_warn(kbdev->dev, "Wait failed on synchronous power off"); + kbase_pm_unlock(kbdev); + /* Wait for the completion of reset, triggered due to + * the previous failure. + */ + kbase_reset_gpu_wait(kbdev); + /* Wait again for the poweroff work which could have + * been enqueued by the GPU reset worker. + */ + kbase_pm_wait_for_poweroff_work_complete(kbdev); + kbase_pm_lock(kbdev); + } + + /* Due to the power policy, GPU could have been kept active + * throughout and so need to invoke the idle callback before + * the power down. 
+ */ + if (backend->callback_power_runtime_gpu_idle && + !backend->gpu_idled) { + backend->callback_power_runtime_gpu_idle(kbdev); + backend->gpu_idled = true; + } + + kbase_pm_clock_off(kbdev); + } else { + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } + + kbase_pm_unlock(kbdev); +} +#endif + void kbase_pm_do_poweroff(struct kbase_device *kbdev) { unsigned long flags; @@ -561,12 +672,31 @@ static bool is_poweroff_in_progress(struct kbase_device *kbdev) return ret; } -void kbase_pm_wait_for_poweroff_complete(struct kbase_device *kbdev) +void kbase_pm_wait_for_poweroff_work_complete(struct kbase_device *kbdev) { wait_event_killable(kbdev->pm.backend.poweroff_wait, is_poweroff_in_progress(kbdev)); } -KBASE_EXPORT_TEST_API(kbase_pm_wait_for_poweroff_complete); +KBASE_EXPORT_TEST_API(kbase_pm_wait_for_poweroff_work_complete); + +static bool is_gpu_powered_down(struct kbase_device *kbdev) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + ret = !kbdev->pm.backend.gpu_powered; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return ret; +} + +void kbase_pm_wait_for_gpu_power_down(struct kbase_device *kbdev) +{ + wait_event_killable(kbdev->pm.backend.poweroff_wait, + is_gpu_powered_down(kbdev)); +} +KBASE_EXPORT_TEST_API(kbase_pm_wait_for_gpu_power_down); int kbase_hwaccess_pm_powerup(struct kbase_device *kbdev, unsigned int flags) @@ -612,6 +742,15 @@ int kbase_hwaccess_pm_powerup(struct kbase_device *kbdev, * cores off */ kbdev->pm.active_count = 1; +#if MALI_USE_CSF && KBASE_PM_RUNTIME + if (kbdev->pm.backend.callback_power_runtime_gpu_active) { + /* Take the RPM reference count to match with the internal + * PM reference count + */ + kbdev->pm.backend.callback_power_runtime_gpu_active(kbdev); + WARN_ON(kbdev->pm.backend.gpu_idled); + } +#endif spin_lock_irqsave(&kbdev->pm.backend.gpu_cycle_counter_requests_lock, irq_flags); @@ -653,11 +792,15 @@ void kbase_hwaccess_pm_halt(struct kbase_device *kbdev) { KBASE_DEBUG_ASSERT(kbdev != NULL); +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + kbase_pm_do_poweroff_sync(kbdev); +#else mutex_lock(&kbdev->pm.lock); kbase_pm_do_poweroff(kbdev); mutex_unlock(&kbdev->pm.lock); - kbase_pm_wait_for_poweroff_complete(kbdev); + kbase_pm_wait_for_poweroff_work_complete(kbdev); +#endif } KBASE_EXPORT_TEST_API(kbase_hwaccess_pm_halt); @@ -761,6 +904,9 @@ void kbase_hwaccess_pm_gpu_idle(struct kbase_device *kbdev) void kbase_hwaccess_pm_suspend(struct kbase_device *kbdev) { +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + kbase_pm_do_poweroff_sync(kbdev); +#else /* Force power off the GPU and all cores (regardless of policy), only * after the PM active count reaches zero (otherwise, we risk turning it * off prematurely) @@ -775,7 +921,11 @@ void kbase_hwaccess_pm_suspend(struct kbase_device *kbdev) kbase_pm_unlock(kbdev); - kbase_pm_wait_for_poweroff_complete(kbdev); + kbase_pm_wait_for_poweroff_work_complete(kbdev); +#endif + + WARN_ON(kbdev->pm.backend.gpu_powered); + WARN_ON(atomic_read(&kbdev->faults_pending)); if (kbdev->pm.backend.callback_power_suspend) kbdev->pm.backend.callback_power_suspend(kbdev); @@ -844,9 +994,12 @@ void kbase_pm_handle_gpu_lost(struct kbase_device *kbdev) /* Cancel any pending HWC dumps */ spin_lock_irqsave(&kbdev->hwcnt.lock, flags); - kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_IDLE; - kbdev->hwcnt.backend.triggered = 1; - wake_up(&kbdev->hwcnt.backend.wait); + if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_DUMPING || + kbdev->hwcnt.backend.state == 
KBASE_INSTR_STATE_FAULT) { + kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_FAULT; + kbdev->hwcnt.backend.triggered = 1; + wake_up(&kbdev->hwcnt.backend.wait); + } spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); } mutex_unlock(&arb_vm_state->vm_state_lock); @@ -854,3 +1007,208 @@ void kbase_pm_handle_gpu_lost(struct kbase_device *kbdev) } #endif /* CONFIG_MALI_ARBITER_SUPPORT */ + +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) +int kbase_pm_force_mcu_wakeup_after_sleep(struct kbase_device *kbdev) +{ + unsigned long flags; + + lockdep_assert_held(&kbdev->pm.lock); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + /* Set the override flag to force the power up of L2 cache */ + kbdev->pm.backend.gpu_wakeup_override = true; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return kbase_pm_wait_for_desired_state(kbdev); +} + +static int pm_handle_mcu_sleep_on_runtime_suspend(struct kbase_device *kbdev) +{ + unsigned long flags; + int ret; + + lockdep_assert_held(&kbdev->csf.scheduler.lock); + lockdep_assert_held(&kbdev->pm.lock); + + /* In case of no active CSG on slot, powering up L2 could be skipped and + * proceed directly to suspend GPU. + * ToDo: firmware has to be reloaded after wake-up as no halt command + * has been sent when GPU was put to sleep mode. + */ + if (!kbase_csf_scheduler_get_nr_active_csgs(kbdev)) + dev_info( + kbdev->dev, + "No active CSGs. Can skip the power up of L2 and go for suspension directly"); + + ret = kbase_pm_force_mcu_wakeup_after_sleep(kbdev); + if (ret) { + dev_warn(kbdev->dev, "Wait for MCU wake up failed on runtime suspend"); + return ret; + } + + /* Check if a Doorbell mirror interrupt occurred meanwhile */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + if (kbdev->pm.backend.gpu_sleep_mode_active && + kbdev->pm.backend.exit_gpu_sleep_mode) { + dev_dbg(kbdev->dev, "DB mirror interrupt occurred during runtime suspend after L2 power up"); + kbdev->pm.backend.gpu_wakeup_override = false; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + return -EBUSY; + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + /* Need to release the kbdev->pm.lock to avoid lock ordering issue + * with kctx->reg.lock, which is taken if the sync wait condition is + * evaluated after the CSG suspend operation. + */ + kbase_pm_unlock(kbdev); + ret = kbase_csf_scheduler_handle_runtime_suspend(kbdev); + kbase_pm_lock(kbdev); + + /* Power down L2 cache */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbdev->pm.backend.gpu_wakeup_override = false; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + /* After re-acquiring the kbdev->pm.lock, check if the device + * became active (or active then idle) meanwhile. + */ + if (kbdev->pm.active_count || + kbdev->pm.backend.poweroff_wait_in_progress) { + dev_dbg(kbdev->dev, + "Device became active on runtime suspend after suspending Scheduler"); + ret = -EBUSY; + } + + if (ret) + return ret; + + ret = kbase_pm_wait_for_desired_state(kbdev); + if (ret) + dev_warn(kbdev->dev, "Wait for power down failed on runtime suspend"); + + return ret; +} + +int kbase_pm_handle_runtime_suspend(struct kbase_device *kbdev) +{ + enum kbase_mcu_state mcu_state; + bool exit_early = false; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + /* This check is needed for the case where Kbase had invoked the + * @power_off_callback directly. 
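Aside: kbase_pm_force_mcu_wakeup_after_sleep() above follows a "set the override under the spinlock, kick the state machine, then wait outside the lock" sequence. The sketch below restates that shape with a plain mutex/condvar in place of hwaccess_lock, kbase_pm_update_state() and kbase_pm_wait_for_desired_state(); it is an illustration, not the kbase implementation.

#include <pthread.h>
#include <stdbool.h>

struct pm {
	pthread_mutex_t lock;
	pthread_cond_t  reached;
	bool wakeup_override;    /* forces L2/MCU back on */
	bool in_desired_state;   /* set by the (not shown) state machine */
};

static int force_wakeup_and_wait(struct pm *pm)
{
	pthread_mutex_lock(&pm->lock);
	pm->wakeup_override = true;          /* state machine will now target ON */
	/* ... kick the state machine here ... */
	while (!pm->in_desired_state)        /* kbase waits via kbase_pm_wait_for_desired_state() */
		pthread_cond_wait(&pm->reached, &pm->lock);
	pthread_mutex_unlock(&pm->lock);
	return 0;
}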
+ */ + if (!kbdev->pm.backend.gpu_powered) { + dev_dbg(kbdev->dev, "GPU already powered down on runtime suspend"); + exit_early = true; + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (exit_early) + goto out; + + ret = kbase_reset_gpu_try_prevent(kbdev); + if (ret == -ENOMEM) { + dev_dbg(kbdev->dev, "Quit runtime suspend as GPU is in bad state"); + /* Finish the runtime suspend, no point in trying again as GPU is + * in irrecoverable bad state. + */ + goto out; + } else if (ret) { + dev_dbg(kbdev->dev, "Quit runtime suspend for failing to prevent gpu reset"); + ret = -EBUSY; + goto out; + } + + kbase_csf_scheduler_lock(kbdev); + kbase_pm_lock(kbdev); + + /* + * This is to handle the case where GPU device becomes active and idle + * very quickly whilst the runtime suspend callback is executing. + * This is useful for the following scenario :- + * - GPU goes idle and pm_callback_runtime_gpu_idle() is called. + * - Auto-suspend timer expires and kbase_device_runtime_suspend() + * is called. + * - GPU becomes active and pm_callback_runtime_gpu_active() calls + * pm_runtime_get(). + * - Shortly after that GPU becomes idle again. + * - kbase_pm_handle_runtime_suspend() gets called. + * - pm_callback_runtime_gpu_idle() is called. + * + * We do not want to power down the GPU immediately after it goes idle. + * So if we notice that GPU had become active when the runtime suspend + * had already kicked in, we abort the runtime suspend. + * By aborting the runtime suspend, we defer the power down of GPU. + * + * This check also helps prevent warnings regarding L2 and MCU states + * inside the pm_handle_power_off() function. The warning stems from + * the fact that pm.lock is released before invoking Scheduler function + * to suspend the CSGs. + */ + if (kbdev->pm.active_count || + kbdev->pm.backend.poweroff_wait_in_progress) { + dev_dbg(kbdev->dev, "Device became active on runtime suspend"); + ret = -EBUSY; + goto unlock; + } + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + if (kbdev->pm.backend.gpu_sleep_mode_active && + kbdev->pm.backend.exit_gpu_sleep_mode) { + dev_dbg(kbdev->dev, "DB mirror interrupt occurred during runtime suspend before L2 power up"); + ret = -EBUSY; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + goto unlock; + } + + mcu_state = kbdev->pm.backend.mcu_state; + WARN_ON(!kbase_pm_is_mcu_inactive(kbdev, mcu_state)); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (mcu_state == KBASE_MCU_IN_SLEEP) { + ret = pm_handle_mcu_sleep_on_runtime_suspend(kbdev); + if (ret) + goto unlock; + } + + /* Disable interrupts and turn off the GPU clocks */ + if (!kbase_pm_clock_off(kbdev)) { + dev_warn(kbdev->dev, "Failed to turn off GPU clocks on runtime suspend, MMU faults pending"); + + WARN_ON(!kbdev->poweroff_pending); + /* Previous call to kbase_pm_clock_off() would have disabled + * the interrupts and also synchronized with the interrupt + * handlers, so more fault work items can't be enqueued. + * + * Can't wait for the completion of MMU fault work items as + * there is a possibility of a deadlock since the fault work + * items would do the group termination which requires the + * Scheduler lock. 
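Aside: kbase_pm_handle_runtime_suspend() above repeatedly bails out with -EBUSY when the device raced back to active, deferring the power down to a later runtime-suspend attempt. The skeleton below shows that generic driver-side pattern; it is not the kbase code, and the device_is_busy() helper is a hypothetical placeholder for whatever "became active again" check a driver performs.

#include <linux/pm_runtime.h>

static bool device_is_busy(struct device *dev)
{
	(void)dev;
	return false;   /* placeholder policy for the sketch */
}

static int example_runtime_suspend(struct device *dev)
{
	if (device_is_busy(dev)) {
		/* -EBUSY tells the PM core the device is still in use, so it
		 * stays powered and suspend can be retried later.
		 */
		pm_runtime_mark_last_busy(dev);
		return -EBUSY;
	}

	/* ... save state and power the device down ... */
	return 0;
}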
+ */ + ret = -EBUSY; + goto unlock; + } + + wake_up(&kbdev->pm.backend.poweroff_wait); + WARN_ON(kbdev->pm.backend.gpu_powered); + dev_dbg(kbdev->dev, "GPU power down complete"); + +unlock: + kbase_pm_unlock(kbdev); + kbase_csf_scheduler_unlock(kbdev); + kbase_reset_gpu_allow(kbdev); +out: + if (ret) { + ret = -EBUSY; + pm_runtime_mark_last_busy(kbdev->dev); + } + + return ret; +} +#endif diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_defs.h b/mali_kbase/backend/gpu/mali_kbase_pm_defs.h index d9d3aa3..52877f5 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_defs.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_defs.h @@ -29,6 +29,10 @@ #include "mali_kbase_pm_always_on.h" #include "mali_kbase_pm_coarse_demand.h" +#if defined(CONFIG_PM_RUNTIME) || defined(CONFIG_PM) +#define KBASE_PM_RUNTIME 1 +#endif + /* Forward definition - see mali_kbase.h */ struct kbase_device; struct kbase_jd_atom; @@ -271,10 +275,18 @@ union kbase_pm_policy_data { * &struct kbase_pm_callback_conf * @callback_power_runtime_off: Callback when the GPU may be turned off. See * &struct kbase_pm_callback_conf - * @callback_power_runtime_idle: Optional callback when the GPU may be idle. See - * &struct kbase_pm_callback_conf + * @callback_power_runtime_idle: Optional callback invoked by runtime PM core + * when the GPU may be idle. See + * &struct kbase_pm_callback_conf * @callback_soft_reset: Optional callback to software reset the GPU. See * &struct kbase_pm_callback_conf + * @callback_power_runtime_gpu_idle: Callback invoked by Kbase when GPU has + * become idle. + * See &struct kbase_pm_callback_conf. + * @callback_power_runtime_gpu_active: Callback when GPU has become active and + * @callback_power_runtime_gpu_idle was + * called previously. + * See &struct kbase_pm_callback_conf. * @ca_cores_enabled: Cores that are currently available * @mcu_state: The current state of the micro-control unit, only applicable * to GPUs that have such a component @@ -312,6 +324,34 @@ union kbase_pm_policy_data { * @policy_change_lock: Used to serialize the policy change calls. In CSF case, * the change of policy may involve the scheduler to * suspend running CSGs and then reconfigure the MCU. + * @gpu_sleep_supported: Flag to indicate that if GPU sleep feature can be + * supported by the kernel driver or not. If this + * flag is not set, then HW state is directly saved + * when GPU idle notification is received. + * @gpu_sleep_mode_active: Flag to indicate that the GPU needs to be in sleep + * mode. It is set when the GPU idle notification is + * received and is cleared when HW state has been + * saved in the runtime suspend callback function or + * when the GPU power down is aborted if GPU became + * active whilst it was in sleep mode. The flag is + * guarded with hwaccess_lock spinlock. + * @exit_gpu_sleep_mode: Flag to indicate the GPU can now exit the sleep + * mode due to the submission of work from Userspace. + * The flag is guarded with hwaccess_lock spinlock. + * The @gpu_sleep_mode_active flag is not immediately + * reset when this flag is set, this is to ensure that + * MCU doesn't gets disabled undesirably without the + * suspend of CSGs. That could happen when + * scheduler_pm_active() and scheduler_pm_idle() gets + * called before the Scheduler gets reactivated. + * @gpu_idled: Flag to ensure that the gpu_idle & gpu_active callbacks are + * always called in pair. The flag is guarded with pm.lock mutex. + * @gpu_wakeup_override: Flag to force the power up of L2 cache & reactivation + * of MCU. 
This is set during the runtime suspend + * callback function, when GPU needs to exit the sleep + * mode for the saving the HW state before power down. + * @db_mirror_interrupt_enabled: Flag tracking if the Doorbell mirror interrupt + * is enabled or not. * @in_reset: True if a GPU is resetting and normal power manager operation is * suspended * @partial_shaderoff: True if we want to partial power off shader cores, @@ -398,6 +438,8 @@ struct kbase_pm_backend_data { void (*callback_power_runtime_off)(struct kbase_device *kbdev); int (*callback_power_runtime_idle)(struct kbase_device *kbdev); int (*callback_soft_reset)(struct kbase_device *kbdev); + void (*callback_power_runtime_gpu_idle)(struct kbase_device *kbdev); + void (*callback_power_runtime_gpu_active)(struct kbase_device *kbdev); u64 ca_cores_enabled; @@ -413,6 +455,15 @@ struct kbase_pm_backend_data { bool policy_change_clamp_state_to_off; unsigned int csf_pm_sched_flags; struct mutex policy_change_lock; + +#ifdef KBASE_PM_RUNTIME + bool gpu_sleep_supported; + bool gpu_sleep_mode_active; + bool exit_gpu_sleep_mode; + bool gpu_idled; + bool gpu_wakeup_override; + bool db_mirror_interrupt_enabled; +#endif #endif bool l2_desired; bool l2_always_on; @@ -420,11 +471,13 @@ struct kbase_pm_backend_data { bool in_reset; +#if !MALI_USE_CSF bool partial_shaderoff; bool protected_entry_transition_override; bool protected_transition_override; int protected_l2_override; +#endif bool hwcnt_desired; bool hwcnt_disabled; diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c index bcada93..d65c684 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c @@ -40,6 +40,7 @@ #include <mali_kbase_reset_gpu.h> #include <mali_kbase_ctx_sched.h> #include <mali_kbase_hwcnt_context.h> +#include <mali_kbase_pbha.h> #include <backend/gpu/mali_kbase_cache_policy_backend.h> #include <device/mali_kbase_device.h> #include <backend/gpu/mali_kbase_irq_internal.h> @@ -104,9 +105,15 @@ bool kbase_pm_is_mcu_desired(struct kbase_device *kbdev) if (unlikely(!kbdev->csf.firmware_inited)) return false; - if (kbdev->csf.scheduler.pm_active_count) + if (kbdev->csf.scheduler.pm_active_count && + kbdev->pm.backend.mcu_desired) return true; +#ifdef KBASE_PM_RUNTIME + if (kbdev->pm.backend.gpu_wakeup_override) + return true; +#endif + /* MCU is supposed to be ON, only when scheduler.pm_active_count is * non zero. But for always_on policy, the MCU needs to be kept on, * unless policy changing transition needs it off. 
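Editor's note on the runtime-PM additions above: the new backend fields cooperate with the GPU sleep support introduced elsewhere in this patch. gpu_sleep_mode_active is set when the GPU idle notification arrives, exit_gpu_sleep_mode is set when new work is submitted (or a reset starts), and gpu_wakeup_override briefly forces the L2/MCU back on so the HW state can be saved before a real power down. The sketch below of the idle-path decision is illustrative only; the helper name is hypothetical, and only the fields and functions named in the patch itself are real.

static void example_handle_gpu_idle_notification(struct kbase_device *kbdev)
{
	unsigned long flags;

	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
	if (kbase_pm_gpu_sleep_allowed(kbdev)) {
		/* GPU stays powered; the MCU state machine will move
		 * through ON_SLEEP_INITIATE and ON_PEND_SLEEP to IN_SLEEP
		 * once the hardware counters have been disabled.
		 */
		kbdev->pm.backend.gpu_sleep_mode_active = true;
		kbase_pm_update_state(kbdev);
	}
	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
	/* When sleep is not allowed, the pre-existing idle handling applies:
	 * the CSGs are suspended and the MCU is halted before power off.
	 */
}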
@@ -120,6 +127,7 @@ bool kbase_pm_is_mcu_desired(struct kbase_device *kbdev) bool kbase_pm_is_l2_desired(struct kbase_device *kbdev) { +#if !MALI_USE_CSF if (kbdev->pm.backend.protected_entry_transition_override) return false; @@ -130,15 +138,19 @@ bool kbase_pm_is_l2_desired(struct kbase_device *kbdev) if (kbdev->pm.backend.protected_transition_override && !kbdev->pm.backend.shaders_desired) return false; - -#if MALI_USE_CSF - if (kbdev->pm.backend.policy_change_clamp_state_to_off) +#else + if (unlikely(kbdev->pm.backend.policy_change_clamp_state_to_off)) return false; + + /* Power up the L2 cache only when MCU is desired */ + if (likely(kbdev->csf.firmware_inited)) + return kbase_pm_is_mcu_desired(kbdev); #endif return kbdev->pm.backend.l2_desired; } +#if !MALI_USE_CSF void kbase_pm_protected_override_enable(struct kbase_device *kbdev) { lockdep_assert_held(&kbdev->hwaccess_lock); @@ -204,6 +216,7 @@ void kbase_pm_protected_l2_override(struct kbase_device *kbdev, bool override) kbase_pm_update_state(kbdev); } +#endif /** * core_type_to_reg - Decode a core type and action to a register. @@ -259,9 +272,8 @@ static void mali_cci_flush_l2(struct kbase_device *kbdev) * to be called from. */ - kbase_reg_write(kbdev, - GPU_CONTROL_REG(GPU_COMMAND), - GPU_COMMAND_CLEAN_INV_CACHES); + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), + GPU_COMMAND_CACHE_CLN_INV_L2); raw = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_RAWSTAT)); @@ -610,6 +622,35 @@ static inline bool kbase_pm_handle_mcu_core_attr_update(struct kbase_device *kbd return (core_mask_update || timer_update); } +bool kbase_pm_is_mcu_inactive(struct kbase_device *kbdev, + enum kbase_mcu_state state) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + return ((state == KBASE_MCU_OFF) || (state == KBASE_MCU_IN_SLEEP)); +} + +#ifdef KBASE_PM_RUNTIME +/** + * kbase_pm_enable_mcu_db_notification - Enable the Doorbell notification on + * MCU side + * + * @kbdev: Pointer to the device. + * + * This function is called to re-enable the Doorbell notification on MCU side + * when MCU needs to beome active again. + */ +static void kbase_pm_enable_mcu_db_notification(struct kbase_device *kbdev) +{ + u32 val = kbase_reg_read(kbdev, GPU_CONTROL_REG(MCU_CONTROL)); + + lockdep_assert_held(&kbdev->hwaccess_lock); + + val &= ~MCU_CNTRL_DOORBELL_DISABLE_MASK; + kbase_reg_write(kbdev, GPU_CONTROL_REG(MCU_CONTROL), val); +} +#endif + static int kbase_pm_mcu_update_state(struct kbase_device *kbdev) { struct kbase_pm_backend_data *backend = &kbdev->pm.backend; @@ -618,12 +659,12 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->hwaccess_lock); /* - * Initial load of firmare should have been done to + * Initial load of firmware should have been done to * exercise the MCU state machine. 
*/ if (unlikely(!kbdev->csf.firmware_inited)) { WARN_ON(backend->mcu_state != KBASE_MCU_OFF); - return -EIO; + return 0; } do { @@ -770,8 +811,15 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev) if (!backend->hwcnt_disabled) kbase_pm_trigger_hwcnt_disable(kbdev); - if (backend->hwcnt_disabled) - backend->mcu_state = KBASE_MCU_ON_HALT; + + if (backend->hwcnt_disabled) { +#ifdef KBASE_PM_RUNTIME + if (backend->gpu_sleep_mode_active) + backend->mcu_state = KBASE_MCU_ON_SLEEP_INITIATE; + else +#endif + backend->mcu_state = KBASE_MCU_ON_HALT; + } break; case KBASE_MCU_ON_HALT: @@ -816,7 +864,32 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev) kbase_csf_firmware_disable_mcu_wait(kbdev); backend->mcu_state = KBASE_MCU_OFF; break; +#ifdef KBASE_PM_RUNTIME + case KBASE_MCU_ON_SLEEP_INITIATE: + if (!kbase_pm_is_mcu_desired(kbdev)) { + kbase_csf_firmware_trigger_mcu_sleep(kbdev); + backend->mcu_state = KBASE_MCU_ON_PEND_SLEEP; + } else + backend->mcu_state = KBASE_MCU_ON_HWCNT_ENABLE; + break; + + case KBASE_MCU_ON_PEND_SLEEP: + if (kbase_csf_firmware_is_mcu_in_sleep(kbdev)) { + backend->mcu_state = KBASE_MCU_IN_SLEEP; + kbase_pm_enable_db_mirror_interrupt(kbdev); + kbase_csf_scheduler_reval_idleness_post_sleep(kbdev); + } + break; + case KBASE_MCU_IN_SLEEP: + if (kbase_pm_is_mcu_desired(kbdev) && + backend->l2_state == KBASE_L2_ON) { + kbase_pm_enable_mcu_db_notification(kbdev); + kbase_pm_disable_db_mirror_interrupt(kbdev); + backend->mcu_state = KBASE_MCU_ON_HWCNT_ENABLE; + } + break; +#endif case KBASE_MCU_RESET_WAIT: /* Reset complete */ if (!backend->in_reset) @@ -889,8 +962,24 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) #endif backend->shaders_state = KBASE_SHADERS_OFF_CORESTACK_OFF; - backend->l2_state = KBASE_L2_OFF; - dev_dbg(kbdev->dev, "GPU lost has occurred - L2 off\n"); + backend->hwcnt_desired = false; + if (!backend->hwcnt_disabled) { + /* Don't progress until hw counters are disabled + * This may involve waiting for a worker to complete. + * The HW counters backend disable code checks for the + * GPU removed case and will error out without touching + * the hardware. This step is needed to keep the HW + * counters in a consistent state after a GPU lost. + */ + backend->l2_state = + KBASE_L2_ON_HWCNT_DISABLE; + kbase_pm_trigger_hwcnt_disable(kbdev); + } + + if (backend->hwcnt_disabled) { + backend->l2_state = KBASE_L2_OFF; + dev_dbg(kbdev->dev, "GPU lost has occurred - L2 off\n"); + } break; } @@ -911,6 +1000,7 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) * powering it on */ kbase_pm_l2_config_override(kbdev); + kbase_pbha_write_settings(kbdev); #if !MALI_USE_CSF /* L2 is required, power on. Powering on the * tiler will also power the first L2 cache. 
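Editor's summary of the MCU sleep transitions added in the hunk above, derived from the new case statements for readability (not part of the patch):

    ON_HWCNT_DISABLE  -> ON_SLEEP_INITIATE  (gpu_sleep_mode_active set, hw counters disabled)
    ON_SLEEP_INITIATE -> ON_PEND_SLEEP      (MCU no longer desired; sleep request sent to firmware)
    ON_PEND_SLEEP     -> IN_SLEEP           (firmware reports MCU asleep; doorbell mirror interrupt enabled)
    IN_SLEEP          -> ON_HWCNT_ENABLE    (MCU desired again and L2 is ON; doorbell notification re-enabled)

While the MCU is IN_SLEEP the L2 is also allowed to power down (see the kbase_pm_l2_update_state change below), which is what lets the runtime suspend path complete a full power off after the HW state has been saved.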
@@ -1027,7 +1117,8 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) break; #else /* Do not power off L2 until the MCU has been stopped */ - if (backend->mcu_state != KBASE_MCU_OFF) + if ((backend->mcu_state != KBASE_MCU_OFF) && + (backend->mcu_state != KBASE_MCU_IN_SLEEP)) break; #endif @@ -1608,7 +1699,7 @@ static int kbase_pm_shaders_update_state(struct kbase_device *kbdev) return 0; } -#endif +#endif /* !MALI_USE_CSF */ static bool kbase_pm_is_in_desired_state_nolock(struct kbase_device *kbdev) { @@ -1635,7 +1726,8 @@ static bool kbase_pm_is_in_desired_state_nolock(struct kbase_device *kbdev) kbdev->pm.backend.mcu_state != KBASE_MCU_ON) in_desired_state = false; else if (!kbase_pm_is_mcu_desired(kbdev) && - kbdev->pm.backend.mcu_state != KBASE_MCU_OFF) + (kbdev->pm.backend.mcu_state != KBASE_MCU_OFF) && + (kbdev->pm.backend.mcu_state != KBASE_MCU_IN_SLEEP)) in_desired_state = false; #endif @@ -1734,8 +1826,8 @@ void kbase_pm_update_state(struct kbase_device *kbdev) if (kbase_pm_mcu_update_state(kbdev)) return; - if (prev_mcu_state != KBASE_MCU_OFF && - kbdev->pm.backend.mcu_state == KBASE_MCU_OFF) { + if (!kbase_pm_is_mcu_inactive(kbdev, prev_mcu_state) && + kbase_pm_is_mcu_inactive(kbdev, kbdev->pm.backend.mcu_state)) { if (kbase_pm_l2_update_state(kbdev)) return; } @@ -1828,6 +1920,9 @@ void kbase_pm_reset_start_locked(struct kbase_device *kbdev) */ if (likely(kbdev->csf.firmware_inited)) { backend->mcu_state = KBASE_MCU_RESET_WAIT; +#ifdef KBASE_PM_RUNTIME + backend->exit_gpu_sleep_mode = true; +#endif kbdev->csf.firmware_reload_needed = true; } else { WARN_ON(backend->mcu_state != KBASE_MCU_OFF); @@ -1865,6 +1960,9 @@ void kbase_pm_reset_complete(struct kbase_device *kbdev) */ kbase_gpu_cache_clean_wait_complete(kbdev); backend->in_reset = false; +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + backend->gpu_wakeup_override = false; +#endif kbase_pm_update_state(kbdev); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); @@ -2098,6 +2196,7 @@ static void update_user_reg_page_mapping(struct kbase_device *kbdev) */ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) { + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; bool reset_required = is_resume; unsigned long flags; @@ -2115,7 +2214,13 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) } #endif - if (kbdev->pm.backend.gpu_powered) { + if (backend->gpu_powered) { +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + if (backend->gpu_idled) { + backend->callback_power_runtime_gpu_active(kbdev); + backend->gpu_idled = false; + } +#endif /* Already turned on */ if (kbdev->poweroff_pending) kbase_pm_enable_interrupts(kbdev); @@ -2128,15 +2233,15 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) KBASE_KTRACE_ADD(kbdev, PM_GPU_ON, NULL, 0u); - if (is_resume && kbdev->pm.backend.callback_power_resume) { - kbdev->pm.backend.callback_power_resume(kbdev); + if (is_resume && backend->callback_power_resume) { + backend->callback_power_resume(kbdev); return; - } else if (kbdev->pm.backend.callback_power_on) { - reset_required = kbdev->pm.backend.callback_power_on(kbdev); + } else if (backend->callback_power_on) { + reset_required = backend->callback_power_on(kbdev); } spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbdev->pm.backend.gpu_powered = true; + backend->gpu_powered = true; spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); #if MALI_USE_CSF @@ -2194,8 +2299,8 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) /* Turn on the L2 
caches */ spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbdev->pm.backend.gpu_ready = true; - kbdev->pm.backend.l2_desired = true; + backend->gpu_ready = true; + backend->l2_desired = true; #if MALI_USE_CSF if (reset_required) { /* GPU reset was done after the power on, so send the post @@ -2209,6 +2314,17 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) #endif kbase_pm_update_state(kbdev); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + /* GPU is now powered up. Invoke the GPU active callback as GPU idle + * callback would have been invoked before the power down. + */ + if (backend->gpu_idled) { + backend->callback_power_runtime_gpu_active(kbdev); + backend->gpu_idled = false; + } +#endif + } KBASE_EXPORT_TEST_API(kbase_pm_clock_on); @@ -2252,19 +2368,22 @@ bool kbase_pm_clock_off(struct kbase_device *kbdev) kbase_ipa_control_handle_gpu_power_off(kbdev); #endif - kbdev->pm.backend.gpu_ready = false; - - /* The GPU power may be turned off from this point */ - kbdev->pm.backend.gpu_powered = false; - + if (kbase_is_gpu_removed(kbdev) #ifdef CONFIG_MALI_ARBITER_SUPPORT - if (kbase_pm_is_gpu_lost(kbdev)) { + || kbase_pm_is_gpu_lost(kbdev)) { +#else + ) { +#endif /* Ensure we unblock any threads that are stuck waiting * for the GPU */ kbase_gpu_cache_clean_wait_complete(kbdev); } -#endif + + kbdev->pm.backend.gpu_ready = false; + + /* The GPU power may be turned off from this point */ + kbdev->pm.backend.gpu_powered = false; spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_internal.h b/mali_kbase/backend/gpu/mali_kbase_pm_internal.h index 70d009e..ef26c16 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_internal.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_internal.h @@ -137,6 +137,10 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume); * off. It should be modified during integration to perform the necessary * actions to turn the clock off (if this is possible in the integration). * + * If runtime PM is enabled and @power_runtime_gpu_idle_callback is used + * then this function would usually be invoked from the runtime suspend + * callback function. + * * @kbdev: The kbase device structure for the device (must be a valid * pointer) * @@ -242,7 +246,7 @@ int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev); * NOTE: This may not wait until the correct state is reached if there is a * power off in progress. To correctly wait for the desired state the caller * must ensure that this is not the case by, for example, calling - * kbase_pm_wait_for_poweroff_complete() + * kbase_pm_wait_for_poweroff_work_complete() * * @kbdev: The kbase device structure for the device (must be a valid pointer) * @@ -432,12 +436,25 @@ void kbase_pm_release_gpu_cycle_counter(struct kbase_device *kbdev); void kbase_pm_release_gpu_cycle_counter_nolock(struct kbase_device *kbdev); /** - * kbase_pm_wait_for_poweroff_complete - Wait for the poweroff workqueue to - * complete + * kbase_pm_wait_for_poweroff_work_complete - Wait for the poweroff workqueue to + * complete * * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function effectively just waits for the @gpu_poweroff_wait_work work + * item to complete, if it was enqueued. GPU may not have been powered down + * before this function returns. 
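+ *
+ * (Editor's cross-reference, not part of the patch: callers that must
+ * observe the GPU actually powered down should instead use the new
+ * kbase_pm_wait_for_gpu_power_down() declared below.)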
*/ -void kbase_pm_wait_for_poweroff_complete(struct kbase_device *kbdev); +void kbase_pm_wait_for_poweroff_work_complete(struct kbase_device *kbdev); + +/** + * kbase_pm_wait_for_gpu_power_down - Wait for the GPU power down to complete + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function waits for the actual gpu power down to complete. + */ +void kbase_pm_wait_for_gpu_power_down(struct kbase_device *kbdev); /** * kbase_pm_runtime_init - Initialize runtime-pm for Mali GPU platform device @@ -635,6 +652,7 @@ void kbase_pm_reset_start_locked(struct kbase_device *kbdev); */ void kbase_pm_reset_complete(struct kbase_device *kbdev); +#if !MALI_USE_CSF /** * kbase_pm_protected_override_enable - Enable the protected mode override * @kbdev: Device pointer @@ -707,6 +725,7 @@ int kbase_pm_protected_entry_override_enable(struct kbase_device *kbdev); * to enter protected mode. */ void kbase_pm_protected_entry_override_disable(struct kbase_device *kbdev); +#endif /* If true, the driver should explicitly control corestack power management, * instead of relying on the Power Domain Controller. @@ -737,6 +756,21 @@ bool kbase_pm_is_l2_desired(struct kbase_device *kbdev); bool kbase_pm_is_mcu_desired(struct kbase_device *kbdev); /** + * kbase_pm_is_mcu_inactive - Check if the MCU is inactive (i.e. either + * it is disabled or it is in sleep) + * + * @kbdev: kbase device + * @state: state of the MCU state machine. + * + * This function must be called with hwaccess_lock held. + * L2 cache can be turned off if this function returns true. + * + * Return: true if MCU is inactive + */ +bool kbase_pm_is_mcu_inactive(struct kbase_device *kbdev, + enum kbase_mcu_state state); + +/** * kbase_pm_idle_groups_sched_suspendable - Check whether the scheduler can be * suspended to low power state when all * the CSGs are idle @@ -818,4 +852,83 @@ static inline void kbase_pm_unlock(struct kbase_device *kbdev) #endif /* !MALI_USE_CSF */ } +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) +/** + * kbase_pm_gpu_sleep_allowed - Check if the GPU is allowed to be put in sleep + * + * @kbdev: Device pointer + * + * This function is called on GPU idle notification and if it returns false then + * GPU power down will be triggered by suspending the CSGs and halting the MCU. + * + * Return: true if the GPU is allowed to be in the sleep state. + */ +static inline bool kbase_pm_gpu_sleep_allowed(struct kbase_device *kbdev) +{ + /* If the autosuspend_delay has been set to 0 then it doesn't make + * sense to first put GPU to sleep state and then power it down, + * instead would be better to power it down right away. + * Also need to do the same when autosuspend_delay is set to a negative + * value, which implies that runtime pm is effectively disabled by the + * kernel. + * A high positive value of autosuspend_delay can be used to keep the + * GPU in sleep state for a long time. + */ + if (unlikely(!kbdev->dev->power.autosuspend_delay || + (kbdev->dev->power.autosuspend_delay < 0))) + return false; + + return kbdev->pm.backend.gpu_sleep_supported; +} + +/** + * kbase_pm_enable_db_mirror_interrupt - Enable the doorbell mirror interrupt to + * detect the User doorbell rings. + * + * @kbdev: Device pointer + * + * This function is called just before sending the sleep request to MCU firmware + * so that User doorbell rings can be detected whilst GPU remains in the sleep + * state. 
+ * + */ +static inline void kbase_pm_enable_db_mirror_interrupt(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (!kbdev->pm.backend.db_mirror_interrupt_enabled) { + u32 irq_mask = kbase_reg_read(kbdev, + GPU_CONTROL_REG(GPU_IRQ_MASK)); + + WARN_ON(irq_mask & DOORBELL_MIRROR); + + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), + irq_mask | DOORBELL_MIRROR); + kbdev->pm.backend.db_mirror_interrupt_enabled = true; + } +} + +/** + * kbase_pm_disable_db_mirror_interrupt - Disable the doorbell mirror interrupt. + * + * @kbdev: Device pointer + * + * This function is called when doorbell mirror interrupt is received or MCU + * needs to be reactivated by enabling the doorbell notification. + */ +static inline void kbase_pm_disable_db_mirror_interrupt(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (kbdev->pm.backend.db_mirror_interrupt_enabled) { + u32 irq_mask = kbase_reg_read(kbdev, + GPU_CONTROL_REG(GPU_IRQ_MASK)); + + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), + irq_mask & ~DOORBELL_MIRROR); + kbdev->pm.backend.db_mirror_interrupt_enabled = false; + } +} +#endif + #endif /* _KBASE_BACKEND_PM_INTERNAL_H_ */ diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h b/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h index 4e99928..96f196f 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_mcu_states.h @@ -42,6 +42,20 @@ * @POWER_DOWN: MCU halted operations, pending being disabled. * @PEND_OFF: MCU is being disabled, pending on powering off. * @RESET_WAIT: The GPU is resetting, MCU state is unknown. + * @HCTL_SHADERS_PEND_ON: Global configuration requests sent to the firmware + * have completed and shaders have been requested to + * power on. + * @HCTL_CORES_NOTIFY_PEND: Shader cores have powered up and firmware is being + * notified of the mask of enabled shader cores. + * @HCTL_MCU_ON_RECHECK: MCU is on and hwcnt disabling is triggered + * and checks are done to increase the number of + * enabled cores. + * @HCTL_SHADERS_READY_OFF: MCU has halted and cores need to be powered down + * @HCTL_SHADERS_PEND_OFF: Cores are transitioning to power down. + * @ON_SLEEP_INITIATE: MCU is on and hwcnt has been disabled and MCU + * is being put to sleep. + * @ON_PEND_SLEEP: MCU sleep is in progress. + * @IN_SLEEP: Sleep request is completed and MCU has halted. 
*/ KBASEP_MCU_STATE(OFF) KBASEP_MCU_STATE(PEND_ON_RELOAD) @@ -61,3 +75,7 @@ KBASEP_MCU_STATE(HCTL_CORES_NOTIFY_PEND) KBASEP_MCU_STATE(HCTL_MCU_ON_RECHECK) KBASEP_MCU_STATE(HCTL_SHADERS_READY_OFF) KBASEP_MCU_STATE(HCTL_SHADERS_PEND_OFF) +/* Additional MCU states to support GPU sleep feature */ +KBASEP_MCU_STATE(ON_SLEEP_INITIATE) +KBASEP_MCU_STATE(ON_PEND_SLEEP) +KBASEP_MCU_STATE(IN_SLEEP) diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_policy.c b/mali_kbase/backend/gpu/mali_kbase_pm_policy.c index cf61ef8..7b126a1 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_policy.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_policy.c @@ -183,7 +183,7 @@ void kbase_pm_update_dynamic_cores_onoff(struct kbase_device *kbdev) void kbase_pm_update_cores_state_nolock(struct kbase_device *kbdev) { - bool shaders_desired; + bool shaders_desired = false; lockdep_assert_held(&kbdev->hwaccess_lock); @@ -192,6 +192,7 @@ void kbase_pm_update_cores_state_nolock(struct kbase_device *kbdev) if (kbdev->pm.backend.poweroff_wait_in_progress) return; +#if !MALI_USE_CSF if (kbdev->pm.backend.protected_transition_override) /* We are trying to change in/out of protected mode - force all * cores off so that the L2 powers down @@ -199,15 +200,8 @@ void kbase_pm_update_cores_state_nolock(struct kbase_device *kbdev) shaders_desired = false; else shaders_desired = kbdev->pm.backend.pm_current_policy->shaders_needed(kbdev); - -#if MALI_USE_CSF - /* On CSF GPUs, Host driver isn't supposed to do the power management - * for shader cores. CSF firmware will power up the cores appropriately - * and so from Driver's standpoint 'shaders_desired' flag shall always - * remain 0. - */ - shaders_desired = false; #endif + if (kbdev->pm.backend.shaders_desired != shaders_desired) { KBASE_KTRACE_ADD(kbdev, PM_CORES_CHANGE_DESIRED, NULL, kbdev->pm.backend.shaders_desired); diff --git a/mali_kbase/backend/gpu/mali_kbase_time.c b/mali_kbase/backend/gpu/mali_kbase_time.c index d10e404..92a366b 100644 --- a/mali_kbase/backend/gpu/mali_kbase_time.c +++ b/mali_kbase/backend/gpu/mali_kbase_time.c @@ -23,6 +23,7 @@ #include <mali_kbase_hwaccess_time.h> #include <device/mali_kbase_device.h> #include <backend/gpu/mali_kbase_pm_internal.h> +#include <mali_kbase_config_defaults.h> void kbase_backend_get_gpu_time_norequest(struct kbase_device *kbdev, u64 *cycle_counter, @@ -31,18 +32,8 @@ void kbase_backend_get_gpu_time_norequest(struct kbase_device *kbdev, { u32 hi1, hi2; - if (cycle_counter) { - /* Read hi, lo, hi to ensure a coherent u64 */ - do { - hi1 = kbase_reg_read(kbdev, - GPU_CONTROL_REG(CYCLE_COUNT_HI)); - *cycle_counter = kbase_reg_read(kbdev, - GPU_CONTROL_REG(CYCLE_COUNT_LO)); - hi2 = kbase_reg_read(kbdev, - GPU_CONTROL_REG(CYCLE_COUNT_HI)); - } while (hi1 != hi2); - *cycle_counter |= (((u64) hi1) << 32); - } + if (cycle_counter) + *cycle_counter = kbase_backend_get_cycle_cnt(kbdev); if (system_time) { /* Read hi, lo, hi to ensure a coherent u64 */ @@ -107,3 +98,66 @@ void kbase_backend_get_gpu_time(struct kbase_device *kbdev, u64 *cycle_counter, kbase_pm_release_gpu_cycle_counter(kbdev); #endif } + +unsigned int kbase_get_timeout_ms(struct kbase_device *kbdev, + enum kbase_timeout_selector selector) +{ + /* Timeout calculation: + * dividing number of cycles by freq in KHz automatically gives value + * in milliseconds. nr_cycles will have to be multiplied by 1e3 to + * get result in microseconds, and 1e6 to get result in nanoseconds. 
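+ *
+ * Worked example (editor's illustration, not part of the original
+ * patch): with nr_cycles = 100000000 and freq_khz = 100000 (i.e. a
+ * 100 MHz lowest GPU frequency), div_u64(nr_cycles, freq_khz) = 1000,
+ * giving a 1000 ms timeout.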
+ */ + + u64 timeout, nr_cycles = 0; + u64 freq_khz = kbdev->lowest_gpu_freq_khz; + + WARN_ON(!freq_khz); + + switch (selector) { + /* use Firmware timeout if invalid selection */ + default: +#if !MALI_USE_CSF + WARN(1, "Invalid timeout selector used! Using default value"); + timeout = JM_DEFAULT_TIMEOUT_CYCLES; + CSTD_UNUSED(nr_cycles); +#else + WARN(1, + "Invalid timeout selector used! Using CSF Firmware timeout"); + fallthrough; + case CSF_FIRMWARE_TIMEOUT: + nr_cycles = CSF_FIRMWARE_TIMEOUT_CYCLES; + timeout = div_u64(nr_cycles, freq_khz); + /* cap CSF FW timeout to FIRMWARE_PING_INTERVAL_MS + * if calculated timeout exceeds it. This should be adapted to a + * direct timeout comparison once the FIRMWARE_PING_INTERVAL_MS + * option is added to this timeout function. A compile-time check + * such as BUILD_BUG_ON can also be done once the firmware ping + * interval in cycles becomes available as a macro. + */ + if (timeout > FIRMWARE_PING_INTERVAL_MS) { + dev_dbg(kbdev->dev, "Capped CSF_FIRMWARE_TIMEOUT %llu to %d", + timeout, FIRMWARE_PING_INTERVAL_MS); + timeout = FIRMWARE_PING_INTERVAL_MS; + } +#endif + break; + } + return (unsigned int)timeout; +} + +u64 kbase_backend_get_cycle_cnt(struct kbase_device *kbdev) +{ + u32 hi1, hi2, lo; + + /* Read hi, lo, hi to ensure a coherent u64 */ + do { + hi1 = kbase_reg_read(kbdev, + GPU_CONTROL_REG(CYCLE_COUNT_HI)); + lo = kbase_reg_read(kbdev, + GPU_CONTROL_REG(CYCLE_COUNT_LO)); + hi2 = kbase_reg_read(kbdev, + GPU_CONTROL_REG(CYCLE_COUNT_HI)); + } while (hi1 != hi2); + + return lo | (((u64) hi1) << 32); +} diff --git a/mali_kbase/build.bp b/mali_kbase/build.bp index 979e06f..030af9d 100644 --- a/mali_kbase/build.bp +++ b/mali_kbase/build.bp @@ -154,7 +154,9 @@ bob_defaults { // (catch-all for experimental CS code without separating it into // different features). 
"MALI_INCREMENTAL_RENDERING={{.incremental_rendering}}", - "GPU_TIMESTAMP_CORRECTION={{.gpu_timestamp_correction}}", + "MALI_GPU_TIMESTAMP_CORRECTION={{.gpu_timestamp_correction}}", + "MALI_BASE_CSF_PERFORMANCE_TESTS={{.base_csf_performance_tests}}", + "MALI_GPU_TIMESTAMP_INTERPOLATION={{.gpu_timestamp_interpolation}}", ], } diff --git a/mali_kbase/context/mali_kbase_context.c b/mali_kbase/context/mali_kbase_context.c index b2e7025..85f4c0a 100644 --- a/mali_kbase/context/mali_kbase_context.c +++ b/mali_kbase/context/mali_kbase_context.c @@ -283,7 +283,7 @@ int kbase_context_mmu_init(struct kbase_context *kctx) { return kbase_mmu_init( kctx->kbdev, &kctx->mmu, kctx, - base_context_mmu_group_id_get(kctx->create_flags)); + kbase_context_mmu_group_id_get(kctx->create_flags)); } void kbase_context_mmu_term(struct kbase_context *kctx) diff --git a/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c b/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c index a62cafa..ce6d546 100644 --- a/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c +++ b/mali_kbase/csf/ipa_control/mali_kbase_csf_ipa_control.c @@ -253,7 +253,7 @@ static inline void calc_prfcnt_delta(struct kbase_device *kbdev, if (!WARN_ON_ONCE(kbdev->csf.ipa_control.cur_gpu_rate == 0)) if (prfcnt->gpu_norm) - delta_value /= kbdev->csf.ipa_control.cur_gpu_rate; + delta_value = div_u64(delta_value, kbdev->csf.ipa_control.cur_gpu_rate); prfcnt->latest_raw_value = raw_value; @@ -300,17 +300,20 @@ kbase_ipa_control_rate_change_notify(struct kbase_clk_rate_listener *listener, /* Interrupts are already disabled and interrupt state is also saved */ spin_lock(&ipa_ctrl->lock); - for (i = 0; i < ipa_ctrl->num_active_sessions; i++) { - size_t j; + for (i = 0; i < KBASE_IPA_CONTROL_MAX_SESSIONS; i++) { struct kbase_ipa_control_session *session = &ipa_ctrl->sessions[i]; - for (j = 0; j < session->num_prfcnts; j++) { - struct kbase_ipa_control_prfcnt *prfcnt = - &session->prfcnts[j]; + if (session->active) { + size_t j; + + for (j = 0; j < session->num_prfcnts; j++) { + struct kbase_ipa_control_prfcnt *prfcnt = + &session->prfcnts[j]; - if (prfcnt->gpu_norm) - calc_prfcnt_delta(kbdev, prfcnt, true); - } + if (prfcnt->gpu_norm) + calc_prfcnt_delta(kbdev, prfcnt, true); + } + } } ipa_ctrl->cur_gpu_rate = clk_rate_hz; @@ -480,16 +483,21 @@ static int session_gpu_start(struct kbase_device *kbdev, */ if (!ret) { if (session) { + /* On starting a session, value read is required for + * IPA power model's calculation initialization. 
+ */ session_read_raw_values(kbdev, session); } else { size_t session_idx; for (session_idx = 0; - session_idx < ipa_ctrl->num_active_sessions; - session_idx++) - session_read_raw_values( - kbdev, - &ipa_ctrl->sessions[session_idx]); + session_idx < KBASE_IPA_CONTROL_MAX_SESSIONS; + session_idx++) { + struct kbase_ipa_control_session *session_to_check = &ipa_ctrl->sessions[session_idx]; + + if (session_to_check->active) + session_read_raw_values(kbdev, session_to_check); + } } } @@ -783,6 +791,12 @@ int kbase_ipa_control_query(struct kbase_device *kbdev, const void *client, ipa_ctrl = &kbdev->csf.ipa_control; session = (struct kbase_ipa_control_session *)client; + if (WARN_ON(!session->active)) { + dev_err(kbdev->dev, + "%s: attempt to query inactive session", __func__); + return -EINVAL; + } + if (WARN_ON(num_values < session->num_prfcnts)) { dev_err(kbdev->dev, "%s: not enough space (%zu) to return all counter values (%zu)", @@ -860,20 +874,23 @@ void kbase_ipa_control_handle_gpu_power_off(struct kbase_device *kbdev) ret); } - for (session_idx = 0; session_idx < ipa_ctrl->num_active_sessions; + for (session_idx = 0; session_idx < KBASE_IPA_CONTROL_MAX_SESSIONS; session_idx++) { + struct kbase_ipa_control_session *session = &ipa_ctrl->sessions[session_idx]; - size_t i; - for (i = 0; i < session->num_prfcnts; i++) { - struct kbase_ipa_control_prfcnt *prfcnt = - &session->prfcnts[i]; + if (session->active) { + size_t i; - calc_prfcnt_delta(kbdev, prfcnt, true); + for (i = 0; i < session->num_prfcnts; i++) { + struct kbase_ipa_control_prfcnt *prfcnt = + &session->prfcnts[i]; + + calc_prfcnt_delta(kbdev, prfcnt, true); + } } } - spin_unlock(&ipa_ctrl->lock); } @@ -975,13 +992,17 @@ void kbase_ipa_control_protm_exited(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->hwaccess_lock); - for (i = 0; i < ipa_ctrl->num_active_sessions; i++) { + for (i = 0; i < KBASE_IPA_CONTROL_MAX_SESSIONS; i++) { + struct kbase_ipa_control_session *session = &ipa_ctrl->sessions[i]; - u64 protm_time = time_now - MAX(session->last_query_time, - ipa_ctrl->protm_start); - session->protm_time += protm_time; + if (session->active) { + u64 protm_time = time_now - MAX(session->last_query_time, + ipa_ctrl->protm_start); + + session->protm_time += protm_time; + } } /* Acknowledge the protected_mode bit in the IPA_CONTROL STATUS diff --git a/mali_kbase/csf/mali_kbase_csf.c b/mali_kbase/csf/mali_kbase_csf.c index d49e343..142e5a8 100644 --- a/mali_kbase/csf/mali_kbase_csf.c +++ b/mali_kbase/csf/mali_kbase_csf.c @@ -32,6 +32,7 @@ #include <mmu/mali_kbase_mmu.h> #include "mali_kbase_csf_timeout.h" #include <csf/ipa_control/mali_kbase_csf_ipa_control.h> +#include <mali_kbase_hwaccess_time.h> #define CS_REQ_EXCEPTION_MASK (CS_REQ_FAULT_MASK | CS_REQ_FATAL_MASK) #define CS_ACK_EXCEPTION_MASK (CS_ACK_FAULT_MASK | CS_ACK_FATAL_MASK) @@ -140,7 +141,7 @@ static void gpu_munmap_user_io_pages(struct kbase_context *kctx, WARN_ON(reg->flags & KBASE_REG_FREE); mutex_lock(&kctx->kbdev->csf.reg_lock); - kbase_remove_va_region(reg); + kbase_remove_va_region(kctx->kbdev, reg); mutex_unlock(&kctx->kbdev->csf.reg_lock); } @@ -171,6 +172,11 @@ static int gpu_mmap_user_io_pages(struct kbase_device *kbdev, const size_t num_pages = 2; int ret; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + #if ((KERNEL_VERSION(4, 4, 147) >= LINUX_VERSION_CODE) || \ ((KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE) && \ (KERNEL_VERSION(4, 5, 0) <= LINUX_VERSION_CODE))) @@ -195,19 +201,18 @@ static int gpu_mmap_user_io_pages(struct kbase_device *kbdev, return ret; /* Map input page */ - ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, - reg->start_pfn, &phys[0], - 1, mem_flags, MCU_AS_NR, - KBASE_MEM_GROUP_CSF_IO); + ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, reg->start_pfn, + &phys[0], 1, mem_flags, MCU_AS_NR, + KBASE_MEM_GROUP_CSF_IO, mmu_sync_info); if (ret) goto bad_insert; /* Map output page, it needs rw access */ mem_flags |= KBASE_REG_GPU_WR; ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, - reg->start_pfn + 1, &phys[1], - 1, mem_flags, MCU_AS_NR, - KBASE_MEM_GROUP_CSF_IO); + reg->start_pfn + 1, &phys[1], 1, mem_flags, + MCU_AS_NR, KBASE_MEM_GROUP_CSF_IO, + mmu_sync_info); if (ret) goto bad_insert_output_page; @@ -218,7 +223,7 @@ bad_insert_output_page: reg->start_pfn, 1, MCU_AS_NR); bad_insert: mutex_lock(&kbdev->csf.reg_lock); - kbase_remove_va_region(reg); + kbase_remove_va_region(kbdev, reg); mutex_unlock(&kbdev->csf.reg_lock); return ret; @@ -475,7 +480,7 @@ static int csf_queue_register_internal(struct kbase_context *kctx, /* Only one pointer expected, otherwise coding error */ if ((reg == NULL && reg_ex == NULL) || (reg && reg_ex)) { - dev_err(kctx->kbdev->dev, + dev_dbg(kctx->kbdev->dev, "Error, one and only one param-ptr expected!"); return -EINVAL; } @@ -1053,6 +1058,11 @@ static int create_normal_suspend_buffer(struct kbase_context *const kctx, PFN_UP(kctx->kbdev->csf.global_iface.groups[0].suspend_size); int err = 0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + lockdep_assert_held(&kctx->csf.lock); /* Allocate and initialize Region Object */ @@ -1090,9 +1100,9 @@ static int create_normal_suspend_buffer(struct kbase_context *const kctx, /* Update MMU table */ err = kbase_mmu_insert_pages(kctx->kbdev, &kctx->kbdev->csf.mcu_mmu, - reg->start_pfn, &s_buf->phy[0], - nr_pages, mem_flags, - MCU_AS_NR, KBASE_MEM_GROUP_CSF_FW); + reg->start_pfn, &s_buf->phy[0], nr_pages, + mem_flags, MCU_AS_NR, + KBASE_MEM_GROUP_CSF_FW, mmu_sync_info); if (err) goto mmu_insert_failed; @@ -1102,7 +1112,7 @@ static int create_normal_suspend_buffer(struct kbase_context *const kctx, mmu_insert_failed: mutex_lock(&kctx->kbdev->csf.reg_lock); - WARN_ON(kbase_remove_va_region(reg)); + kbase_remove_va_region(kctx->kbdev, reg); mutex_unlock(&kctx->kbdev->csf.reg_lock); add_va_region_failed: @@ -1138,6 +1148,11 @@ static int create_protected_suspend_buffer(struct kbase_device *const kbdev, PFN_UP(kbdev->csf.global_iface.groups[0].suspend_size); int err = 0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + /* Allocate and initialize Region Object */ reg = kbase_alloc_free_region(&kbdev->csf.shared_reg_rbtree, 0, nr_pages, KBASE_REG_ZONE_MCU_SHARED); @@ -1170,10 +1185,9 @@ static int create_protected_suspend_buffer(struct kbase_device *const kbdev, goto add_va_region_failed; /* Update MMU table */ - err = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, - reg->start_pfn, phys, - nr_pages, mem_flags, MCU_AS_NR, - KBASE_MEM_GROUP_CSF_FW); + err = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, reg->start_pfn, + phys, nr_pages, mem_flags, MCU_AS_NR, + KBASE_MEM_GROUP_CSF_FW, mmu_sync_info); if (err) goto mmu_insert_failed; @@ -1183,7 +1197,7 @@ static int create_protected_suspend_buffer(struct kbase_device *const kbdev, mmu_insert_failed: mutex_lock(&kbdev->csf.reg_lock); - WARN_ON(kbase_remove_va_region(reg)); + kbase_remove_va_region(kbdev, reg); mutex_unlock(&kbdev->csf.reg_lock); add_va_region_failed: @@ -1244,16 +1258,9 @@ static int create_suspend_buffers(struct kbase_context *const kctx, */ static u32 generate_group_uid(void) { - /* use first KBase device to store max UID */ - struct kbase_device *kbdev = kbase_find_device(-1); - u32 uid = 1; - - if (kbdev) - uid = (u32) atomic_inc_return(&kbdev->group_max_uid_in_devices); - else - WARN(1, "NULL kbase device pointer in group UID generation"); + static atomic_t global_csg_uid = ATOMIC_INIT(0); - return uid; + return (u32)atomic_inc_return(&global_csg_uid); } /** @@ -1272,8 +1279,8 @@ static int create_queue_group(struct kbase_context *const kctx, int group_handle = find_free_group_handle(kctx); if (group_handle < 0) { - dev_err(kctx->kbdev->dev, - "All queue group handles are already in use\n"); + dev_dbg(kctx->kbdev->dev, + "All queue group handles are already in use"); } else { struct kbase_queue_group * const group = kmalloc(sizeof(struct kbase_queue_group), @@ -1349,16 +1356,16 @@ int kbase_csf_queue_group_create(struct kbase_context *const kctx, if ((create->in.tiler_max > tiler_count) || (create->in.fragment_max > fragment_count) || (create->in.compute_max > compute_count)) { - dev_err(kctx->kbdev->dev, - "Invalid maximum number of endpoints for a queue group\n"); + dev_dbg(kctx->kbdev->dev, + "Invalid maximum number of endpoints for a queue group"); err = -EINVAL; } else if (create->in.priority >= BASE_QUEUE_GROUP_PRIORITY_COUNT) { - dev_err(kctx->kbdev->dev, "Invalid queue group priority %u\n", + dev_dbg(kctx->kbdev->dev, "Invalid queue group priority %u", (unsigned int)create->in.priority); err = -EINVAL; } else if (!iface_has_enough_streams(kctx->kbdev, create->in.cs_min)) { - dev_err(kctx->kbdev->dev, - "No CSG has at least %d CSs\n", + dev_dbg(kctx->kbdev->dev, + "No CSG has at least %d CSs", create->in.cs_min); err = -EINVAL; } else { @@ -1403,7 +1410,7 @@ static void term_normal_suspend_buffer(struct kbase_context *const kctx, WARN_ON(s_buf->reg->flags & KBASE_REG_FREE); mutex_lock(&kctx->kbdev->csf.reg_lock); - WARN_ON(kbase_remove_va_region(s_buf->reg)); + kbase_remove_va_region(kctx->kbdev, s_buf->reg); mutex_unlock(&kctx->kbdev->csf.reg_lock); kbase_mem_pool_free_pages( @@ -1436,7 +1443,7 @@ static void term_protected_suspend_buffer(struct kbase_device *const kbdev, WARN_ON(s_buf->reg->flags & KBASE_REG_FREE); mutex_lock(&kbdev->csf.reg_lock); - WARN_ON(kbase_remove_va_region(s_buf->reg)); + kbase_remove_va_region(kbdev, s_buf->reg); mutex_unlock(&kbdev->csf.reg_lock); kbase_csf_protected_memory_free(kbdev, s_buf->pma, nr_pages); @@ 
-1994,6 +2001,26 @@ bool kbase_csf_error_pending(struct kbase_context *kctx) return event_pended; } +static void sync_update_notify_gpu(struct kbase_context *kctx) +{ + bool can_notify_gpu; + unsigned long flags; + + spin_lock_irqsave(&kctx->kbdev->hwaccess_lock, flags); + can_notify_gpu = kctx->kbdev->pm.backend.gpu_powered; +#ifdef KBASE_PM_RUNTIME + if (kctx->kbdev->pm.backend.gpu_sleep_mode_active) + can_notify_gpu = false; +#endif + + if (can_notify_gpu) { + kbase_csf_ring_doorbell(kctx->kbdev, CSF_KERNEL_DOORBELL_NR); + KBASE_KTRACE_ADD(kctx->kbdev, SYNC_UPDATE_EVENT_NOTIFY_GPU, kctx, 0u); + } + + spin_unlock_irqrestore(&kctx->kbdev->hwaccess_lock, flags); +} + void kbase_csf_event_signal(struct kbase_context *kctx, bool notify_gpu) { struct kbase_csf_event *event, *next_event; @@ -2014,13 +2041,8 @@ void kbase_csf_event_signal(struct kbase_context *kctx, bool notify_gpu) * synch object wait operations are re-evaluated on a write to any * CS_DOORBELL/GLB_DOORBELL register. */ - if (notify_gpu) { - spin_lock_irqsave(&kctx->kbdev->hwaccess_lock, flags); - if (kctx->kbdev->pm.backend.gpu_powered) - kbase_csf_ring_doorbell(kctx->kbdev, CSF_KERNEL_DOORBELL_NR); - KBASE_KTRACE_ADD(kctx->kbdev, SYNC_UPDATE_EVENT_NOTIFY_GPU, kctx, 0u); - spin_unlock_irqrestore(&kctx->kbdev->hwaccess_lock, flags); - } + if (notify_gpu) + sync_update_notify_gpu(kctx); /* Now invoke the callbacks registered on backend side. * Allow item removal inside the loop, if requested by the callback. @@ -2364,31 +2386,6 @@ static void protm_event_worker(struct work_struct *data) group, 0u); } -static void report_queue_fatal_error(struct kbase_queue *const queue, - u32 cs_fatal, u64 cs_fatal_info, - u8 group_handle) -{ - struct base_csf_notification error = - { .type = BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, - .payload = { - .csg_error = { - .handle = group_handle, - .error = { - .error_type = - BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, - .payload = { - .fatal_queue = { - .sideband = - cs_fatal_info, - .status = cs_fatal, - .csi_index = - queue->csi_index, - } } } } } }; - - add_error(queue->kctx, &queue->error, &error); - kbase_event_wakeup(queue->kctx); -} - /** * handle_fault_event - Handler for CS fault. 
* @@ -2429,10 +2426,34 @@ handle_fault_event(struct kbase_queue *const queue, kbase_gpu_exception_name(cs_fault_exception_type), cs_fault_exception_data, cs_fault_info_exception_data); - if (cs_fault_exception_type == - CS_FAULT_EXCEPTION_TYPE_RESOURCE_EVICTION_TIMEOUT) - report_queue_fatal_error(queue, GPU_EXCEPTION_TYPE_SW_FAULT_2, - 0, queue->group->handle); +} + +static void report_queue_fatal_error(struct kbase_queue *const queue, + u32 cs_fatal, u64 cs_fatal_info, + u8 group_handle) +{ + struct base_csf_notification error = { + .type = BASE_CSF_NOTIFICATION_GPU_QUEUE_GROUP_ERROR, + .payload = { + .csg_error = { + .handle = group_handle, + .error = { + .error_type = + BASE_GPU_QUEUE_GROUP_QUEUE_ERROR_FATAL, + .payload = { + .fatal_queue = { + .sideband = cs_fatal_info, + .status = cs_fatal, + .csi_index = queue->csi_index, + } + } + } + } + } + }; + + add_error(queue->kctx, &queue->error, &error); + kbase_event_wakeup(queue->kctx); } /** @@ -2531,6 +2552,7 @@ handle_fatal_event(struct kbase_queue *const queue, if (!queue_work(queue->kctx->csf.wq, &queue->fatal_event_work)) release_queue(queue); } + } /** @@ -2757,9 +2779,14 @@ static void process_csg_interrupts(struct kbase_device *const kbdev, group->handle, csg_nr); /* Check if the scheduling tick can be advanced */ - if (kbase_csf_scheduler_all_csgs_idle(kbdev) && - !scheduler->gpu_idle_fw_timer_enabled) { - kbase_csf_scheduler_advance_tick_nolock(kbdev); + if (kbase_csf_scheduler_all_csgs_idle(kbdev)) { + if (!scheduler->gpu_idle_fw_timer_enabled) + kbase_csf_scheduler_advance_tick_nolock(kbdev); + } else if (atomic_read(&scheduler->non_idle_offslot_grps)) { + /* If there are non-idle CSGs waiting for a slot, fire + * a tock for a replacement. + */ + mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0); } } @@ -2770,7 +2797,8 @@ static void process_csg_interrupts(struct kbase_device *const kbdev, KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_PROGRESS_TIMER_INTERRUPT, group, req ^ ack); dev_info(kbdev->dev, - "Timeout notification received for group %u of ctx %d_%d on slot %d\n", + "[%llu] Iterator PROGRESS_TIMER timeout notification received for group %u of ctx %d_%d on slot %d\n", + kbase_backend_get_cycle_cnt(kbdev), group->handle, group->kctx->tgid, group->kctx->id, csg_nr); handle_progress_timer_event(group); @@ -2868,6 +2896,79 @@ static void process_prfcnt_interrupts(struct kbase_device *kbdev, u32 glb_req, } } +/** + * check_protm_enter_req_complete - Check if PROTM_ENTER request completed + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * @glb_req: Global request register value. + * @glb_ack: Global acknowledge register value. + * + * This function checks if the PROTM_ENTER Global request had completed and + * appropriately sends notification about the protected mode entry to components + * like IPA, HWC, IPA_CONTROL. 
+ */ +static inline void check_protm_enter_req_complete(struct kbase_device *kbdev, + u32 glb_req, u32 glb_ack) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + kbase_csf_scheduler_spin_lock_assert_held(kbdev); + + if (likely(!kbdev->csf.scheduler.active_protm_grp)) + return; + + if (kbdev->protected_mode) + return; + + if ((glb_req & GLB_REQ_PROTM_ENTER_MASK) != + (glb_ack & GLB_REQ_PROTM_ENTER_MASK)) + return; + + dev_dbg(kbdev->dev, "Protected mode entry interrupt received"); + + kbdev->protected_mode = true; + kbase_ipa_protection_mode_switch_event(kbdev); + kbase_ipa_control_protm_entered(kbdev); + kbase_hwcnt_backend_csf_protm_entered(&kbdev->hwcnt_gpu_iface); +} + +/** + * process_protm_exit - Handle the protected mode exit interrupt + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * @glb_ack: Global acknowledge register value. + * + * This function handles the PROTM_EXIT interrupt and sends notification + * about the protected mode exit to components like HWC, IPA_CONTROL. + */ +static inline void process_protm_exit(struct kbase_device *kbdev, u32 glb_ack) +{ + const struct kbase_csf_global_iface *const global_iface = + &kbdev->csf.global_iface; + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + + lockdep_assert_held(&kbdev->hwaccess_lock); + kbase_csf_scheduler_spin_lock_assert_held(kbdev); + + dev_dbg(kbdev->dev, "Protected mode exit interrupt received"); + + kbase_csf_firmware_global_input_mask(global_iface, GLB_REQ, glb_ack, + GLB_REQ_PROTM_EXIT_MASK); + + if (likely(scheduler->active_protm_grp)) { + KBASE_KTRACE_ADD_CSF_GRP(kbdev, SCHEDULER_EXIT_PROTM, + scheduler->active_protm_grp, 0u); + scheduler->active_protm_grp = NULL; + } else { + dev_warn(kbdev->dev, "PROTM_EXIT interrupt after no pmode group"); + } + + if (!WARN_ON(!kbdev->protected_mode)) { + kbdev->protected_mode = false; + kbase_ipa_control_protm_exited(kbdev); + kbase_hwcnt_backend_csf_protm_exited(&kbdev->hwcnt_gpu_iface); + } +} + void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val) { unsigned long flags; @@ -2898,19 +2999,10 @@ void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val) global_iface, GLB_ACK); KBASE_KTRACE_ADD(kbdev, GLB_REQ_ACQ, NULL, glb_req ^ glb_ack); - if ((glb_req ^ glb_ack) & GLB_REQ_PROTM_EXIT_MASK) { - dev_dbg(kbdev->dev, "Protected mode exit interrupt received"); - kbase_csf_firmware_global_input_mask( - global_iface, GLB_REQ, glb_ack, - GLB_REQ_PROTM_EXIT_MASK); - WARN_ON(!kbase_csf_scheduler_protected_mode_in_use(kbdev)); - KBASE_KTRACE_ADD_CSF_GRP(kbdev, SCHEDULER_EXIT_PROTM, scheduler->active_protm_grp, 0u); - scheduler->active_protm_grp = NULL; - kbdev->protected_mode = false; - kbase_ipa_control_protm_exited(kbdev); - kbase_hwcnt_backend_csf_protm_exited( - &kbdev->hwcnt_gpu_iface); - } + check_protm_enter_req_complete(kbdev, glb_req, glb_ack); + + if ((glb_req ^ glb_ack) & GLB_REQ_PROTM_EXIT_MASK) + process_protm_exit(kbdev, glb_ack); /* Handle IDLE Hysteresis notification event */ if ((glb_req ^ glb_ack) & GLB_REQ_IDLE_EVENT_MASK) { @@ -3066,4 +3158,3 @@ u8 kbase_csf_priority_check(struct kbase_device *kbdev, u8 req_priority) return out_priority; } - diff --git a/mali_kbase/csf/mali_kbase_csf.h b/mali_kbase/csf/mali_kbase_csf.h index e3bd436..640d2ed 100644 --- a/mali_kbase/csf/mali_kbase_csf.h +++ b/mali_kbase/csf/mali_kbase_csf.h @@ -39,10 +39,13 @@ */ #define KBASEP_USER_DB_NR_INVALID ((s8)-1) -#define FIRMWARE_PING_INTERVAL_MS (4000) /* 4 seconds */ +#define FIRMWARE_PING_INTERVAL_MS (8000) /* 8 seconds */ 
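Editor's note: the doubling of FIRMWARE_PING_INTERVAL_MS from 4000 ms to 8000 ms also raises the upper bound applied to the CSF_FIRMWARE_TIMEOUT value in kbase_get_timeout_ms() (see the backend/gpu/mali_kbase_time.c hunk earlier in this diff), since the computed timeout is capped to this constant.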
#define FIRMWARE_IDLE_HYSTERESIS_TIME_MS (10) /* Default 10 milliseconds */ +/* Idle hysteresis time can be scaled down when GPU sleep feature is used */ +#define FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER (5) + /** * enum kbase_csf_event_callback_action - return type for CSF event callbacks. * diff --git a/mali_kbase/csf/mali_kbase_csf_csg_debugfs.c b/mali_kbase/csf/mali_kbase_csf_csg_debugfs.c index 14deb98..40bee79 100644 --- a/mali_kbase/csf/mali_kbase_csf_csg_debugfs.c +++ b/mali_kbase/csf/mali_kbase_csf_csg_debugfs.c @@ -24,10 +24,32 @@ #include <linux/seq_file.h> #include <linux/delay.h> #include <csf/mali_kbase_csf_trace_buffer.h> +#include <backend/gpu/mali_kbase_pm_internal.h> #if IS_ENABLED(CONFIG_DEBUG_FS) #include "mali_kbase_csf_tl_reader.h" +#define MAX_SCHED_STATE_STRING_LEN (16) +static const char *scheduler_state_to_string(struct kbase_device *kbdev, + enum kbase_csf_scheduler_state sched_state) +{ + switch (sched_state) { + case SCHED_BUSY: + return "BUSY"; + case SCHED_INACTIVE: + return "INACTIVE"; + case SCHED_SUSPENDED: + return "SUSPENDED"; +#ifdef KBASE_PM_RUNTIME + case SCHED_SLEEPING: + return "SLEEPING"; +#endif + default: + dev_warn(kbdev->dev, "Unknown Scheduler state %d", sched_state); + return NULL; + } +} + /** * blocked_reason_to_string() - Convert blocking reason id to a string * @@ -142,10 +164,6 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file, !queue->group)) return; - /* Ring the doorbell to have firmware update CS_EXTRACT */ - kbase_csf_ring_cs_user_doorbell(queue->kctx->kbdev, queue); - msleep(100); - addr = (u32 *)queue->user_io_addr; cs_insert = addr[CS_INSERT_LO/4] | ((u64)addr[CS_INSERT_HI/4] << 32); @@ -253,32 +271,68 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file, /* Waiting timeout for STATUS_UPDATE acknowledgment, in milliseconds */ #define CSF_STATUS_UPDATE_TO_MS (100) +static void update_active_group_status(struct seq_file *file, + struct kbase_queue_group *const group) +{ + struct kbase_device *const kbdev = group->kctx->kbdev; + struct kbase_csf_cmd_stream_group_info const *const ginfo = + &kbdev->csf.global_iface.groups[group->csg_nr]; + long remaining = + kbase_csf_timeout_in_jiffies(CSF_STATUS_UPDATE_TO_MS); + unsigned long flags; + + /* Global doorbell ring for CSG STATUS_UPDATE request or User doorbell + * ring for Extract offset update, shall not be made when MCU has been + * put to sleep otherwise it will undesirably make MCU exit the sleep + * state. Also it isn't really needed as FW will implicitly update the + * status of all on-slot groups when MCU sleep request is sent to it. + */ + if (kbdev->csf.scheduler.state == SCHED_SLEEPING) + return; + + /* Ring the User doobell shared between the queues bound to this + * group, to have FW update the CS_EXTRACT for all the queues + * bound to the group. Ring early so that FW gets adequate time + * for the handling. 
+ */ + kbase_csf_ring_doorbell(kbdev, group->doorbell_nr); + + kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, + ~kbase_csf_firmware_csg_output(ginfo, CSG_ACK), + CSG_REQ_STATUS_UPDATE_MASK); + kbase_csf_scheduler_spin_unlock(kbdev, flags); + kbase_csf_ring_csg_doorbell(kbdev, group->csg_nr); + + remaining = wait_event_timeout(kbdev->csf.event_wait, + !((kbase_csf_firmware_csg_input_read(ginfo, CSG_REQ) ^ + kbase_csf_firmware_csg_output(ginfo, CSG_ACK)) & + CSG_REQ_STATUS_UPDATE_MASK), remaining); + + if (!remaining) { + dev_err(kbdev->dev, + "Timed out for STATUS_UPDATE on group %d on slot %d", + group->handle, group->csg_nr); + + seq_printf(file, "*** Warn: Timed out for STATUS_UPDATE on slot %d\n", + group->csg_nr); + seq_puts(file, "*** The following group-record is likely stale\n"); + } +} + static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file, struct kbase_queue_group *const group) { if (kbase_csf_scheduler_group_get_slot(group) >= 0) { struct kbase_device *const kbdev = group->kctx->kbdev; - unsigned long flags; u32 ep_c, ep_r; char exclusive; struct kbase_csf_cmd_stream_group_info const *const ginfo = &kbdev->csf.global_iface.groups[group->csg_nr]; - long remaining = - kbase_csf_timeout_in_jiffies(CSF_STATUS_UPDATE_TO_MS); u8 slot_priority = kbdev->csf.scheduler.csg_slots[group->csg_nr].priority; - kbase_csf_scheduler_spin_lock(kbdev, &flags); - kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, - ~kbase_csf_firmware_csg_output(ginfo, CSG_ACK), - CSG_REQ_STATUS_UPDATE_MASK); - kbase_csf_scheduler_spin_unlock(kbdev, flags); - kbase_csf_ring_csg_doorbell(kbdev, group->csg_nr); - - remaining = wait_event_timeout(kbdev->csf.event_wait, - !((kbase_csf_firmware_csg_input_read(ginfo, CSG_REQ) ^ - kbase_csf_firmware_csg_output(ginfo, CSG_ACK)) & - CSG_REQ_STATUS_UPDATE_MASK), remaining); + update_active_group_status(file, group); ep_c = kbase_csf_firmware_csg_output(ginfo, CSG_STATUS_EP_CURRENT); @@ -291,16 +345,6 @@ static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file, else exclusive = '0'; - if (!remaining) { - dev_err(kbdev->dev, - "Timed out for STATUS_UPDATE on group %d on slot %d", - group->handle, group->csg_nr); - - seq_printf(file, "*** Warn: Timed out for STATUS_UPDATE on slot %d\n", - group->csg_nr); - seq_printf(file, "*** The following group-record is likely stale\n"); - } - seq_puts(file, "GroupID, CSG NR, CSG Prio, Run State, Priority, C_EP(Alloc/Req), F_EP(Alloc/Req), T_EP(Alloc/Req), Exclusive\n"); seq_printf(file, "%7d, %6d, %8d, %9d, %8d, %11d/%3d, %11d/%3d, %11d/%3d, %9c\n", group->handle, @@ -315,6 +359,10 @@ static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file, CSG_STATUS_EP_CURRENT_TILER_EP_GET(ep_c), CSG_STATUS_EP_REQ_TILER_EP_GET(ep_r), exclusive); + + /* Wait for the User doobell ring to take effect */ + if (kbdev->csf.scheduler.state != SCHED_SLEEPING) + msleep(100); } else { seq_puts(file, "GroupID, CSG NR, Run State, Priority\n"); seq_printf(file, "%7d, %6d, %9d, %8d\n", @@ -362,6 +410,12 @@ static int kbasep_csf_queue_group_debugfs_show(struct seq_file *file, mutex_lock(&kctx->csf.lock); kbase_csf_scheduler_lock(kbdev); + if (kbdev->csf.scheduler.state == SCHED_SLEEPING) { + /* Wait for the MCU sleep request to complete. Please refer the + * update_active_group_status() function for the explanation. 
+ */ + kbase_pm_wait_for_desired_state(kbdev); + } for (gr = 0; gr < MAX_QUEUE_GROUP_NUM; gr++) { struct kbase_queue_group *const group = kctx->csf.queue_groups[gr]; @@ -395,6 +449,12 @@ static int kbasep_csf_scheduler_dump_active_groups(struct seq_file *file, MALI_CSF_CSG_DEBUGFS_VERSION); kbase_csf_scheduler_lock(kbdev); + if (kbdev->csf.scheduler.state == SCHED_SLEEPING) { + /* Wait for the MCU sleep request to complete. Please refer the + * update_active_group_status() function for the explanation. + */ + kbase_pm_wait_for_desired_state(kbdev); + } for (csg_nr = 0; csg_nr < num_groups; csg_nr++) { struct kbase_queue_group *const group = kbdev->csf.scheduler.csg_slots[csg_nr].resident_group; @@ -502,59 +562,93 @@ DEFINE_SIMPLE_ATTRIBUTE(kbasep_csf_debugfs_scheduling_timer_kick_fops, "%llu\n"); /** - * kbase_csf_debugfs_scheduler_suspend_get() - get if the scheduler is suspended. + * kbase_csf_debugfs_scheduler_state_get() - Get the state of scheduler. * - * @data: The debugfs dentry private data, a pointer to kbase_device - * @val: The debugfs output value, boolean: 1 suspended, 0 otherwise + * @file: Object of the file that is being read. + * @user_buf: User buffer that contains the string. + * @count: Length of user buffer + * @ppos: Offset within file object * - * Return: 0 + * This function will return the current Scheduler state to Userspace + * Scheduler may exit that state by the time the state string is received + * by the Userspace. + * + * Return: 0 if Scheduler was found in an unexpected state, or the + * size of the state string if it was copied successfully to the + * User buffer or a negative value in case of an error. */ -static int kbase_csf_debugfs_scheduler_suspend_get( - void *data, u64 *val) +static ssize_t kbase_csf_debugfs_scheduler_state_get(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) { - struct kbase_device *kbdev = data; + struct kbase_device *kbdev = file->private_data; struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + const char *state_string; kbase_csf_scheduler_lock(kbdev); - *val = (scheduler->state == SCHED_SUSPENDED); + state_string = scheduler_state_to_string(kbdev, scheduler->state); kbase_csf_scheduler_unlock(kbdev); - return 0; + if (!state_string) + count = 0; + + return simple_read_from_buffer(user_buf, count, ppos, + state_string, strlen(state_string)); } /** - * kbase_csf_debugfs_scheduler_suspend_set() - set the scheduler to suspended. + * kbase_csf_debugfs_scheduler_state_set() - Set the state of scheduler. * - * @data: The debugfs dentry private data, a pointer to kbase_device - * @val: The debugfs input value, boolean: 1 suspend, 0 otherwise + * @file: Object of the file that is being written to. + * @ubuf: User buffer that contains the string. + * @count: Length of user buffer + * @ppos: Offset within file object * - * Return: Negative value if already in requested state, 0 otherwise. + * This function will update the Scheduler state as per the state string + * passed by the Userspace. Scheduler may or may not remain in new state + * for long. + * + * Return: Negative value if the string doesn't correspond to a valid Scheduler + * state or if copy from user buffer failed, otherwise the length of + * the User buffer. 
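+ *
+ * Editor's usage illustration (not part of the patch; the debugfs
+ * directory name "mali0" is an assumption and may differ per device):
+ *
+ *   echo SUSPENDED > /sys/kernel/debug/mali0/scheduler_state
+ *   cat /sys/kernel/debug/mali0/scheduler_state
+ *
+ * Accepted strings are SUSPENDED, SLEEPING (only when KBASE_PM_RUNTIME
+ * is defined) and INACTIVE, matching the sysfs_streq() checks below.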
*/ -static int kbase_csf_debugfs_scheduler_suspend_set( - void *data, u64 val) +static ssize_t kbase_csf_debugfs_scheduler_state_set(struct file *file, + const char __user *ubuf, size_t count, loff_t *ppos) { - struct kbase_device *kbdev = data; - struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; - enum kbase_csf_scheduler_state state; + struct kbase_device *kbdev = file->private_data; + char buf[MAX_SCHED_STATE_STRING_LEN]; + ssize_t ret = count; - kbase_csf_scheduler_lock(kbdev); - state = scheduler->state; - kbase_csf_scheduler_unlock(kbdev); + CSTD_UNUSED(ppos); + + count = min_t(size_t, sizeof(buf) - 1, count); + if (copy_from_user(buf, ubuf, count)) + return -EFAULT; + + buf[count] = 0; - if (val && (state != SCHED_SUSPENDED)) + if (sysfs_streq(buf, "SUSPENDED")) kbase_csf_scheduler_pm_suspend(kbdev); - else if (!val && (state == SCHED_SUSPENDED)) - kbase_csf_scheduler_pm_resume(kbdev); - else - return -1; +#ifdef KBASE_PM_RUNTIME + else if (sysfs_streq(buf, "SLEEPING")) + kbase_csf_scheduler_force_sleep(kbdev); +#endif + else if (sysfs_streq(buf, "INACTIVE")) + kbase_csf_scheduler_force_wakeup(kbdev); + else { + dev_dbg(kbdev->dev, "Bad scheduler state %s", buf); + ret = -EINVAL; + } - return 0; + return ret; } -DEFINE_SIMPLE_ATTRIBUTE(kbasep_csf_debugfs_scheduler_suspend_fops, - &kbase_csf_debugfs_scheduler_suspend_get, - &kbase_csf_debugfs_scheduler_suspend_set, - "%llu\n"); +static const struct file_operations kbasep_csf_debugfs_scheduler_state_fops = { + .owner = THIS_MODULE, + .read = kbase_csf_debugfs_scheduler_state_get, + .write = kbase_csf_debugfs_scheduler_state_set, + .open = simple_open, + .llseek = default_llseek, +}; void kbase_csf_debugfs_init(struct kbase_device *kbdev) { @@ -568,9 +662,9 @@ void kbase_csf_debugfs_init(struct kbase_device *kbdev) debugfs_create_file("scheduling_timer_kick", 0200, kbdev->mali_debugfs_directory, kbdev, &kbasep_csf_debugfs_scheduling_timer_kick_fops); - debugfs_create_file("scheduler_suspend", 0644, + debugfs_create_file("scheduler_state", 0644, kbdev->mali_debugfs_directory, kbdev, - &kbasep_csf_debugfs_scheduler_suspend_fops); + &kbasep_csf_debugfs_scheduler_state_fops); kbase_csf_tl_reader_debugfs_init(kbdev); kbase_csf_firmware_trace_buffer_debugfs_init(kbdev); diff --git a/mali_kbase/csf/mali_kbase_csf_defs.h b/mali_kbase/csf/mali_kbase_csf_defs.h index 53526ce..de471eb 100644 --- a/mali_kbase/csf/mali_kbase_csf_defs.h +++ b/mali_kbase/csf/mali_kbase_csf_defs.h @@ -219,11 +219,19 @@ enum kbase_csf_csg_slot_state { * management reference. This can happen if the GPU * becomes idle for a duration exceeding a threshold, * or due to a system triggered suspend action. + * @SCHED_SLEEPING: The scheduler is in low-power mode with scheduling + * operations suspended and is not holding the power + * management reference. This state is set, only for the + * GPUs that supports the sleep feature, when GPU idle + * notification is received. The state is changed to + * @SCHED_SUSPENDED from the runtime suspend callback + * function after the suspend of CSGs. */ enum kbase_csf_scheduler_state { SCHED_BUSY, SCHED_INACTIVE, SCHED_SUSPENDED, + SCHED_SLEEPING, }; /** @@ -561,7 +569,9 @@ struct kbase_csf_heap_context_allocator { * @kbase_context. It is not the same as a heap context structure allocated by * the kernel for use by the firmware. * - * @lock: Lock preventing concurrent access to the tiler heaps. 
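Illustration only: the hunks above replace the boolean scheduler_suspend debugfs file with a string-based scheduler_state file. Below is a minimal user-space sketch of how the new file could be exercised; the path is an assumption (debugfs mounted at /sys/kernel/debug, device directory named mali0), and the accepted strings SUSPENDED, SLEEPING and INACTIVE are taken from kbase_csf_debugfs_scheduler_state_set() above.

/* Hypothetical user-space helper; the debugfs path is an assumption. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/mali0/scheduler_state";
	char state[32] = { 0 };
	int fd = open(path, O_RDWR);

	if (fd < 0) {
		perror("open scheduler_state");
		return 1;
	}

	/* Read the state string currently reported by the driver. */
	if (read(fd, state, sizeof(state) - 1) > 0)
		printf("scheduler state: %s\n", state);

	/* Request suspension; as the kernel-doc above notes, the scheduler
	 * may leave this state again at any time.
	 */
	if (write(fd, "SUSPENDED", strlen("SUSPENDED")) < 0)
		perror("write scheduler_state");

	close(fd);
	return 0;
}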
+ * @lock: Lock to prevent concurrent access to the tiler heaps (after
+ * initialization), as a tiler heap can be terminated whilst an OoM
+ * event is being handled for it.
 * @list: List of tiler heaps.
 * @ctx_alloc: Allocator for heap context structures.
 * @nr_of_heaps: Total number of tiler heaps that were added during the
@@ -802,6 +812,11 @@ struct kbase_csf_csg_slot {
 * @active_protm_grp: Indicates if firmware has been permitted to let GPU
 * enter protected mode with the given group. On exit
 * from protected mode the pointer is reset to NULL.
+ * This pointer is set and PROTM_ENTER request is sent
+ * atomically with @interrupt_lock held.
+ * This pointer being set doesn't necessarily indicate
+ * that the GPU is in protected mode; kbdev->protected_mode
+ * needs to be checked for that.
 * @gpu_idle_fw_timer_enabled: Whether the CSF scheduler has activated the
 * firmware idle hysteresis timer for preparing a
 * GPU suspend on idle.
diff --git a/mali_kbase/csf/mali_kbase_csf_firmware.c b/mali_kbase/csf/mali_kbase_csf_firmware.c
index 1b31122..785555c 100644
--- a/mali_kbase/csf/mali_kbase_csf_firmware.c
+++ b/mali_kbase/csf/mali_kbase_csf_firmware.c
@@ -27,12 +27,14 @@
 #include "mali_kbase_reset_gpu.h"
 #include "mali_kbase_ctx_sched.h"
 #include "mali_kbase_csf_scheduler.h"
+#include <mali_kbase_hwaccess_time.h>
 #include "device/mali_kbase_device.h"
 #include "backend/gpu/mali_kbase_pm_internal.h"
 #include "tl/mali_kbase_timeline_priv.h"
 #include "mali_kbase_csf_tl_reader.h"
 #include "backend/gpu/mali_kbase_clk_rate_trace_mgr.h"
 #include <csf/ipa_control/mali_kbase_csf_ipa_control.h>
+#include <uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h>

 #include <linux/list.h>
 #include <linux/slab.h>
@@ -47,7 +49,7 @@
 #include <asm/arch_timer.h>

 #define MALI_MAX_FIRMWARE_NAME_LEN ((size_t)20)
-
+#define ACK_TIMEOUT_MILLISECONDS 1000

 static char fw_name[MALI_MAX_FIRMWARE_NAME_LEN] = "mali_csffw.bin";
 module_param_string(fw_name, fw_name, sizeof(fw_name), 0644);
@@ -190,8 +192,10 @@ static int setup_shared_iface_static_region(struct kbase_device *kbdev)
 reg = kbase_alloc_free_region(&kbdev->csf.shared_reg_rbtree, 0,
 interface->num_pages, KBASE_REG_ZONE_MCU_SHARED);
 if (reg) {
+ mutex_lock(&kbdev->csf.reg_lock);
 ret = kbase_add_va_region_rbtree(kbdev, reg,
 interface->virtual, interface->num_pages, 1);
+ mutex_unlock(&kbdev->csf.reg_lock);
 if (ret)
 kfree(reg);
 else
@@ -1305,9 +1309,12 @@ static int wait_for_global_request(struct kbase_device *const kbdev,
 wait_timeout);

 if (!remaining) {
- dev_warn(kbdev->dev, "Timed out waiting for global request %x to complete",
+ dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for global request %x to complete",
+ kbase_backend_get_cycle_cnt(kbdev),
+ kbdev->csf.fw_timeout_ms,
 req_mask);
 err = -ETIMEDOUT;
+
 }

 return err;
@@ -1388,11 +1395,6 @@ static void global_init(struct kbase_device *const kbdev, u64 core_mask)

 kbase_csf_scheduler_spin_lock(kbdev, &flags);

- /* Set the coherency mode for protected mode execution */
- WARN_ON(kbdev->system_coherency == COHERENCY_ACE);
- kbase_csf_firmware_global_input(global_iface, GLB_PROTM_COHERENCY,
- kbdev->system_coherency);
-
 /* Update shader core allocation enable mask */
 enable_endpoints_global(global_iface, core_mask);
 enable_shader_poweroff_timer(kbdev, global_iface);
@@ -1675,12 +1677,75 @@ u32 kbase_csf_firmware_set_mcu_core_pwroff_time(struct kbase_device *kbdev, u32
 return pwroff;
 }

+/**
+ * kbase_device_csf_iterator_trace_init - Send request to enable iterator
+ * trace port.
+ * @kbdev: Kernel base device pointer + * + * Return: 0 on success (or if enable request is not sent), or error + * code -EINVAL on failure of GPU to acknowledge enable request. + */ +static int kbase_device_csf_iterator_trace_init(struct kbase_device *kbdev) +{ + /* Enable the iterator trace port if supported by the GPU. + * It requires the GPU to have a nonzero "iter_trace_enable" + * property in the device tree, and the FW must advertise + * this feature in GLB_FEATURES. + */ + if (kbdev->pm.backend.gpu_powered) { + /* check device tree for iterator trace enable property */ + const void *iter_trace_param = of_get_property( + kbdev->dev->of_node, + "iter_trace_enable", NULL); + + const struct kbase_csf_global_iface *iface = + &kbdev->csf.global_iface; + + if (iter_trace_param) { + u32 iter_trace_value = be32_to_cpup(iter_trace_param); + + if ((iface->features & + GLB_FEATURES_ITER_TRACE_SUPPORTED_MASK) && + iter_trace_value) { + long ack_timeout; + + ack_timeout = kbase_csf_timeout_in_jiffies( + ACK_TIMEOUT_MILLISECONDS); + + /* write enable request to global input */ + kbase_csf_firmware_global_input_mask( + iface, GLB_REQ, + GLB_REQ_ITER_TRACE_ENABLE_MASK, + GLB_REQ_ITER_TRACE_ENABLE_MASK); + /* Ring global doorbell */ + kbase_csf_ring_doorbell(kbdev, + CSF_KERNEL_DOORBELL_NR); + + ack_timeout = wait_event_timeout( + kbdev->csf.event_wait, + !((kbase_csf_firmware_global_input_read( + iface, GLB_REQ) ^ + kbase_csf_firmware_global_output( + iface, GLB_ACK)) & + GLB_REQ_ITER_TRACE_ENABLE_MASK), + ack_timeout); + + return ack_timeout ? 0 : -EINVAL; + + } + } + + } + return 0; +} int kbase_csf_firmware_early_init(struct kbase_device *kbdev) { init_waitqueue_head(&kbdev->csf.event_wait); kbdev->csf.interrupt_received = false; - kbdev->csf.fw_timeout_ms = CSF_FIRMWARE_TIMEOUT_MS; + + kbdev->csf.fw_timeout_ms = + kbase_get_timeout_ms(kbdev, CSF_FIRMWARE_TIMEOUT); INIT_LIST_HEAD(&kbdev->csf.firmware_interfaces); INIT_LIST_HEAD(&kbdev->csf.firmware_config); @@ -1721,8 +1786,14 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) } kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS; +#ifdef KBASE_PM_RUNTIME + if (kbase_pm_gpu_sleep_allowed(kbdev)) + kbdev->csf.gpu_idle_hysteresis_ms /= + FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER; +#endif + WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms); kbdev->csf.gpu_idle_dur_count = convert_dur_to_idle_count( - kbdev, FIRMWARE_IDLE_HYSTERESIS_TIME_MS); + kbdev, kbdev->csf.gpu_idle_hysteresis_ms); kbdev->csf.mcu_core_pwroff_dur_us = DEFAULT_GLB_PWROFF_TIMEOUT_US; kbdev->csf.mcu_core_pwroff_dur_count = convert_dur_to_core_pwroff_count( @@ -1851,6 +1922,9 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) if (ret != 0) goto error; + ret = kbase_device_csf_iterator_trace_init(kbdev); + if (ret != 0) + goto error; /* Firmware loaded successfully */ release_firmware(firmware); @@ -2048,30 +2122,20 @@ int kbase_csf_firmware_set_timeout(struct kbase_device *const kbdev, void kbase_csf_enter_protected_mode(struct kbase_device *kbdev) { struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; - unsigned long flags; - int err; - kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbase_csf_scheduler_spin_lock_assert_held(kbdev); set_global_request(global_iface, GLB_REQ_PROTM_ENTER_MASK); dev_dbg(kbdev->dev, "Sending request to enter protected mode"); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); - kbase_csf_scheduler_spin_unlock(kbdev, flags); - - err = wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); - - if 
(!err) { - unsigned long irq_flags; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbdev->protected_mode = true; - kbase_ipa_protection_mode_switch_event(kbdev); - kbase_ipa_control_protm_entered(kbdev); +} - kbase_csf_scheduler_spin_lock(kbdev, &irq_flags); - kbase_hwcnt_backend_csf_protm_entered(&kbdev->hwcnt_gpu_iface); - kbase_csf_scheduler_spin_unlock(kbdev, irq_flags); +void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) +{ + int err = wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + if (err) { + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) + kbase_reset_gpu(kbdev); } } @@ -2081,12 +2145,38 @@ void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev) unsigned long flags; kbase_csf_scheduler_spin_lock(kbdev, &flags); + /* Validate there are no on-slot groups when sending the + * halt request to firmware. + */ + WARN_ON(kbase_csf_scheduler_get_nr_active_csgs_locked(kbdev)); set_global_request(global_iface, GLB_REQ_HALT_MASK); dev_dbg(kbdev->dev, "Sending request to HALT MCU"); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); kbase_csf_scheduler_spin_unlock(kbdev, flags); } +#ifdef KBASE_PM_RUNTIME +void kbase_csf_firmware_trigger_mcu_sleep(struct kbase_device *kbdev) +{ + struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; + unsigned long flags; + + kbase_csf_scheduler_spin_lock(kbdev, &flags); + set_global_request(global_iface, GLB_REQ_SLEEP_MASK); + dev_dbg(kbdev->dev, "Sending sleep request to MCU"); + kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); + kbase_csf_scheduler_spin_unlock(kbdev, flags); +} + +bool kbase_csf_firmware_is_mcu_in_sleep(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + return (global_request_complete(kbdev, GLB_REQ_SLEEP_MASK) && + kbase_csf_firmware_mcu_halted(kbdev)); +} +#endif + int kbase_csf_trigger_firmware_config_update(struct kbase_device *kbdev) { struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; @@ -2095,6 +2185,7 @@ int kbase_csf_trigger_firmware_config_update(struct kbase_device *kbdev) /* Ensure GPU is powered-up until we complete config update.*/ kbase_csf_scheduler_pm_active(kbdev); + kbase_csf_scheduler_wait_mcu_active(kbdev); /* The 'reg_lock' is also taken and is held till the update is * complete, to ensure the config update gets serialized. @@ -2288,7 +2379,7 @@ int kbase_csf_firmware_mcu_shared_mapping_init( mmu_insert_pages_error: mutex_lock(&kbdev->csf.reg_lock); - kbase_remove_va_region(va_reg); + kbase_remove_va_region(kbdev, va_reg); va_region_add_error: kbase_free_alloced_region(va_reg); mutex_unlock(&kbdev->csf.reg_lock); @@ -2320,7 +2411,7 @@ void kbase_csf_firmware_mcu_shared_mapping_term( { if (csf_mapping->va_reg) { mutex_lock(&kbdev->csf.reg_lock); - kbase_remove_va_region(csf_mapping->va_reg); + kbase_remove_va_region(kbdev, csf_mapping->va_reg); kbase_free_alloced_region(csf_mapping->va_reg); mutex_unlock(&kbdev->csf.reg_lock); } diff --git a/mali_kbase/csf/mali_kbase_csf_firmware.h b/mali_kbase/csf/mali_kbase_csf_firmware.h index 60d7065..0edcc30 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware.h +++ b/mali_kbase/csf/mali_kbase_csf_firmware.h @@ -78,9 +78,6 @@ /* Maximum CSs per csg. 
 */
 #define MAX_SUPPORTED_STREAMS_PER_GROUP 32

-/* Waiting timeout for status change acknowledgment, in milliseconds */
-#define CSF_FIRMWARE_TIMEOUT_MS (3000) /* Relaxed to 3000ms from 800ms due to Android */
-
 struct kbase_device;

@@ -442,13 +439,27 @@ int kbase_csf_firmware_set_timeout(struct kbase_device *kbdev, u64 timeout);

 /**
 * kbase_csf_enter_protected_mode - Send the Global request to firmware to
- * enter protected mode and wait for its
- * completion.
+ * enter protected mode.
 *
 * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
+ * The function must be called with kbdev->csf.scheduler.interrupt_lock held
+ * and it does not wait for the protected mode entry to complete.
 */
 void kbase_csf_enter_protected_mode(struct kbase_device *kbdev);

+/**
+ * kbase_csf_wait_protected_mode_enter - Wait for the completion of PROTM_ENTER
+ * Global request sent to firmware.
+ *
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
+ * This function needs to be called after kbase_csf_enter_protected_mode()
+ * to wait for the protected mode entry to complete. GPU reset is triggered if
+ * the wait is unsuccessful.
+ */
+void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev);
+
 static inline bool kbase_csf_firmware_mcu_halted(struct kbase_device *kbdev)
 {
 return (kbase_reg_read(kbdev, GPU_CONTROL_REG(MCU_STATUS)) ==
@@ -497,6 +508,26 @@ static inline void kbase_csf_firmware_disable_mcu(struct kbase_device *kbdev)
 */
 void kbase_csf_firmware_disable_mcu_wait(struct kbase_device *kbdev);

+#ifdef KBASE_PM_RUNTIME
+/**
+ * kbase_csf_firmware_trigger_mcu_sleep - Send the command to put MCU in sleep
+ * state.
+ *
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ */
+void kbase_csf_firmware_trigger_mcu_sleep(struct kbase_device *kbdev);
+
+/**
+ * kbase_csf_firmware_is_mcu_in_sleep - Check if sleep request has completed
+ * and MCU has halted.
+ *
+ * @kbdev: Instance of a GPU platform device that implements a CSF interface.
+ *
+ * Return: true if sleep request has completed, otherwise false.
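Illustration only: the kernel-doc above splits protected-mode entry into a request and a separate wait. A hedged sketch of the implied calling sequence (simplified, not the actual scheduler code) is shown below; it uses only the two functions declared in this header.

/* Sketch of the usage implied by the kernel-doc above; error handling and
 * the surrounding scheduler logic are omitted.
 */
static void example_request_protected_mode(struct kbase_device *kbdev)
{
	unsigned long flags;

	/* Issue the PROTM_ENTER request with the scheduler interrupt_lock
	 * held; this only rings the doorbell and does not wait.
	 */
	kbase_csf_scheduler_spin_lock(kbdev, &flags);
	kbase_csf_enter_protected_mode(kbdev);
	kbase_csf_scheduler_spin_unlock(kbdev, flags);

	/* Wait for the firmware acknowledgment outside the spinlock; a GPU
	 * reset is triggered internally if the wait times out.
	 */
	kbase_csf_wait_protected_mode_enter(kbdev);
}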
+ */ +bool kbase_csf_firmware_is_mcu_in_sleep(struct kbase_device *kbdev); +#endif + /** * kbase_trigger_firmware_reload - Trigger the reboot of MCU firmware, for the * cold boot case firmware image would be diff --git a/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c b/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c index 33ae3f7..e99c968 100644 --- a/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c +++ b/mali_kbase/csf/mali_kbase_csf_firmware_no_mali.c @@ -27,6 +27,7 @@ #include "mali_kbase_reset_gpu.h" #include "mali_kbase_ctx_sched.h" #include "device/mali_kbase_device.h" +#include <mali_kbase_hwaccess_time.h> #include "backend/gpu/mali_kbase_pm_internal.h" #include "mali_kbase_csf_scheduler.h" #include "mmu/mali_kbase_mmu.h" @@ -551,6 +552,8 @@ static int wait_for_global_request(struct kbase_device *const kbdev, dev_warn(kbdev->dev, "Timed out waiting for global request %x to complete", req_mask); err = -ETIMEDOUT; + + } return err; @@ -886,7 +889,9 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev) { init_waitqueue_head(&kbdev->csf.event_wait); kbdev->csf.interrupt_received = false; - kbdev->csf.fw_timeout_ms = CSF_FIRMWARE_TIMEOUT_MS; + + kbdev->csf.fw_timeout_ms = + kbase_get_timeout_ms(kbdev, CSF_FIRMWARE_TIMEOUT); INIT_LIST_HEAD(&kbdev->csf.firmware_interfaces); INIT_LIST_HEAD(&kbdev->csf.firmware_config); @@ -920,8 +925,14 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) } kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS; +#ifdef KBASE_PM_RUNTIME + if (kbase_pm_gpu_sleep_allowed(kbdev)) + kbdev->csf.gpu_idle_hysteresis_ms /= + FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER; +#endif + WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms); kbdev->csf.gpu_idle_dur_count = convert_dur_to_idle_count( - kbdev, FIRMWARE_IDLE_HYSTERESIS_TIME_MS); + kbdev, kbdev->csf.gpu_idle_hysteresis_ms); ret = kbase_mcu_shared_interface_region_tracker_init(kbdev); if (ret != 0) { @@ -1110,15 +1121,21 @@ int kbase_csf_firmware_set_timeout(struct kbase_device *const kbdev, void kbase_csf_enter_protected_mode(struct kbase_device *kbdev) { struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; - unsigned long flags; - kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbase_csf_scheduler_spin_lock_assert_held(kbdev); set_global_request(global_iface, GLB_REQ_PROTM_ENTER_MASK); dev_dbg(kbdev->dev, "Sending request to enter protected mode"); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); - kbase_csf_scheduler_spin_unlock(kbdev, flags); +} + +void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) +{ + int err = wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); - wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); + if (err) { + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) + kbase_reset_gpu(kbdev); + } } void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev) @@ -1127,12 +1144,38 @@ void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev) unsigned long flags; kbase_csf_scheduler_spin_lock(kbdev, &flags); + /* Validate there are no on-slot groups when sending the + * halt request to firmware. 
+ */ + WARN_ON(kbase_csf_scheduler_get_nr_active_csgs_locked(kbdev)); set_global_request(global_iface, GLB_REQ_HALT_MASK); dev_dbg(kbdev->dev, "Sending request to HALT MCU"); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); kbase_csf_scheduler_spin_unlock(kbdev, flags); } +#ifdef KBASE_PM_RUNTIME +void kbase_csf_firmware_trigger_mcu_sleep(struct kbase_device *kbdev) +{ + struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; + unsigned long flags; + + kbase_csf_scheduler_spin_lock(kbdev, &flags); + set_global_request(global_iface, GLB_REQ_SLEEP_MASK); + dev_dbg(kbdev->dev, "Sending sleep request to MCU"); + kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); + kbase_csf_scheduler_spin_unlock(kbdev, flags); +} + +bool kbase_csf_firmware_is_mcu_in_sleep(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + return (global_request_complete(kbdev, GLB_REQ_SLEEP_MASK) && + kbase_csf_firmware_mcu_halted(kbdev)); +} +#endif + int kbase_csf_trigger_firmware_config_update(struct kbase_device *kbdev) { struct kbase_csf_global_iface *global_iface = &kbdev->csf.global_iface; @@ -1331,7 +1374,7 @@ int kbase_csf_firmware_mcu_shared_mapping_init( mmu_insert_pages_error: mutex_lock(&kbdev->csf.reg_lock); - kbase_remove_va_region(va_reg); + kbase_remove_va_region(kbdev, va_reg); va_region_add_error: kbase_free_alloced_region(va_reg); mutex_unlock(&kbdev->csf.reg_lock); @@ -1363,7 +1406,7 @@ void kbase_csf_firmware_mcu_shared_mapping_term( { if (csf_mapping->va_reg) { mutex_lock(&kbdev->csf.reg_lock); - kbase_remove_va_region(csf_mapping->va_reg); + kbase_remove_va_region(kbdev, csf_mapping->va_reg); kbase_free_alloced_region(csf_mapping->va_reg); mutex_unlock(&kbdev->csf.reg_lock); } diff --git a/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c b/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c index 96746c6..1815a26 100644 --- a/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c +++ b/mali_kbase/csf/mali_kbase_csf_heap_context_alloc.c @@ -50,8 +50,8 @@ static u64 sub_alloc(struct kbase_csf_heap_context_allocator *const ctx_alloc) MAX_TILER_HEAPS); if (unlikely(heap_nr >= MAX_TILER_HEAPS)) { - dev_err(kctx->kbdev->dev, - "No free tiler heap contexts in the pool\n"); + dev_dbg(kctx->kbdev->dev, + "No free tiler heap contexts in the pool"); return 0; } @@ -159,6 +159,11 @@ u64 kbase_csf_heap_context_allocator_alloc( u64 nr_pages = PFN_UP(HEAP_CTX_REGION_SIZE); u64 heap_gpu_va = 0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + #ifdef CONFIG_MALI_VECTOR_DUMP flags |= BASE_MEM_PROT_CPU_RD; #endif @@ -169,13 +174,14 @@ u64 kbase_csf_heap_context_allocator_alloc( * allocate it. */ if (!ctx_alloc->region) { - ctx_alloc->region = kbase_mem_alloc(kctx, nr_pages, nr_pages, - 0, &flags, &ctx_alloc->gpu_va); + ctx_alloc->region = + kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, + &ctx_alloc->gpu_va, mmu_sync_info); } /* If the pool still isn't allocated then an error occurred. 
*/ if (unlikely(!ctx_alloc->region)) { - dev_err(kctx->kbdev->dev, "Failed to allocate a pool of tiler heap contexts\n"); + dev_dbg(kctx->kbdev->dev, "Failed to allocate a pool of tiler heap contexts"); } else { heap_gpu_va = sub_alloc(ctx_alloc); } diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.c b/mali_kbase/csf/mali_kbase_csf_kcpu.c index 4e26a49..8729307 100644 --- a/mali_kbase/csf/mali_kbase_csf_kcpu.c +++ b/mali_kbase/csf/mali_kbase_csf_kcpu.c @@ -34,7 +34,7 @@ static DEFINE_SPINLOCK(kbase_csf_fence_lock); #endif static void kcpu_queue_process(struct kbase_kcpu_command_queue *kcpu_queue, - bool ignore_waits); + bool drain_queue); static void kcpu_queue_process_worker(struct work_struct *data); @@ -220,7 +220,7 @@ static int kbase_kcpu_jit_allocate_process( for (i = 0; i < count; i++, info++) { /* The JIT ID is still in use so fail the allocation */ if (kctx->jit_alloc[info->id]) { - dev_warn(kctx->kbdev->dev, "JIT ID still in use\n"); + dev_dbg(kctx->kbdev->dev, "JIT ID still in use"); return -EINVAL; } } @@ -458,7 +458,7 @@ static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue, int item_err = 0; if (!kctx->jit_alloc[ids[i]]) { - dev_warn(kctx->kbdev->dev, "invalid JIT free ID\n"); + dev_dbg(kctx->kbdev->dev, "invalid JIT free ID"); rc = -EINVAL; item_err = rc; } else { @@ -964,7 +964,7 @@ static int kbase_kcpu_cqs_wait_operation_process(struct kbase_device *kbdev, sig_set = *evt > cqs_wait_operation->objs[i].val; break; default: - dev_warn(kbdev->dev, + dev_dbg(kbdev->dev, "Unsupported CQS wait operation %d", cqs_wait_operation->objs[i].operation); kbase_phy_alloc_mapping_put(queue->kctx, mapping); @@ -976,8 +976,9 @@ static int kbase_kcpu_cqs_wait_operation_process(struct kbase_device *kbdev, /* Increment evt up to the error_state value depending on the CQS data type */ switch (cqs_wait_operation->objs[i].data_type) { default: - dev_warn(kbdev->dev, "Unreachable data_type=%d", cqs_wait_operation->objs[i].data_type); - /* Fallthrough - hint to compiler that there's really only 2 options at present */ + dev_dbg(kbdev->dev, "Unreachable data_type=%d", cqs_wait_operation->objs[i].data_type); + /* Fallthrough - hint to compiler that there's really only 2 options at present */ + fallthrough; case BASEP_CQS_DATA_TYPE_U32: evt = (u64 *)((u8 *)evt + sizeof(u32)); break; @@ -1100,7 +1101,7 @@ static void kbase_kcpu_cqs_set_operation_process( *evt = cqs_set_operation->objs[i].val; break; default: - dev_warn(kbdev->dev, + dev_dbg(kbdev->dev, "Unsupported CQS set operation %d", cqs_set_operation->objs[i].operation); queue->has_error = true; break; @@ -1109,8 +1110,9 @@ static void kbase_kcpu_cqs_set_operation_process( /* Increment evt up to the error_state value depending on the CQS data type */ switch (cqs_set_operation->objs[i].data_type) { default: - dev_warn(kbdev->dev, "Unreachable data_type=%d", cqs_set_operation->objs[i].data_type); - /* Fallthrough - hint to compiler that there's really only 2 options at present */ + dev_dbg(kbdev->dev, "Unreachable data_type=%d", cqs_set_operation->objs[i].data_type); + /* Fallthrough - hint to compiler that there's really only 2 options at present */ + fallthrough; case BASEP_CQS_DATA_TYPE_U32: evt = (u64 *)((u8 *)evt + sizeof(u32)); break; @@ -1465,8 +1467,8 @@ static int delete_queue(struct kbase_context *kctx, u32 id) kfree(queue); } else { - dev_warn(kctx->kbdev->dev, - "Attempt to delete a non-existent KCPU queue\n"); + dev_dbg(kctx->kbdev->dev, + "Attempt to delete a non-existent KCPU queue"); 
mutex_unlock(&kctx->csf.kcpu_queues.lock); err = -EINVAL; } @@ -1525,7 +1527,7 @@ static void KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_FREE_END( } static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, - bool ignore_waits) + bool drain_queue) { struct kbase_device *kbdev = queue->kctx->kbdev; bool process_next = true; @@ -1548,7 +1550,7 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, status = 0; #if IS_ENABLED(CONFIG_SYNC_FILE) - if (ignore_waits) { + if (drain_queue) { kbase_kcpu_fence_wait_cancel(queue, &cmd->info.fence); } else { @@ -1601,7 +1603,7 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, status = kbase_kcpu_cqs_wait_process(kbdev, queue, &cmd->info.cqs_wait); - if (!status && !ignore_waits) { + if (!status && !drain_queue) { process_next = false; } else { /* Either all CQS objects were signaled or @@ -1623,7 +1625,7 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, status = kbase_kcpu_cqs_wait_operation_process(kbdev, queue, &cmd->info.cqs_wait_operation); - if (!status && !ignore_waits) { + if (!status && !drain_queue) { process_next = false; } else { /* Either all CQS objects were signaled or @@ -1651,22 +1653,25 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, case BASE_KCPU_COMMAND_TYPE_MAP_IMPORT: { struct kbase_ctx_ext_res_meta *meta = NULL; - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_MAP_IMPORT_START( - kbdev, queue); + if (!drain_queue) { + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_MAP_IMPORT_START( + kbdev, queue); - kbase_gpu_vm_lock(queue->kctx); - meta = kbase_sticky_resource_acquire( - queue->kctx, cmd->info.import.gpu_va); - kbase_gpu_vm_unlock(queue->kctx); + kbase_gpu_vm_lock(queue->kctx); + meta = kbase_sticky_resource_acquire( + queue->kctx, cmd->info.import.gpu_va); + kbase_gpu_vm_unlock(queue->kctx); - if (meta == NULL) { - queue->has_error = true; - dev_warn(kbdev->dev, - "failed to map an external resource\n"); - } + if (meta == NULL) { + queue->has_error = true; + dev_dbg( + kbdev->dev, + "failed to map an external resource"); + } - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_MAP_IMPORT_END( - kbdev, queue, meta ? 0 : 1); + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_MAP_IMPORT_END( + kbdev, queue, meta ? 0 : 1); + } break; } case BASE_KCPU_COMMAND_TYPE_UNMAP_IMPORT: { @@ -1682,8 +1687,8 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, if (!ret) { queue->has_error = true; - dev_warn(kbdev->dev, - "failed to release the reference. resource not found\n"); + dev_dbg(kbdev->dev, + "failed to release the reference. resource not found"); } KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_UNMAP_IMPORT_END( @@ -1703,8 +1708,8 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, if (!ret) { queue->has_error = true; - dev_warn(kbdev->dev, - "failed to release the reference. resource not found\n"); + dev_dbg(kbdev->dev, + "failed to release the reference. 
resource not found"); } KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_UNMAP_IMPORT_FORCE_END( @@ -1713,24 +1718,32 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, } case BASE_KCPU_COMMAND_TYPE_JIT_ALLOC: { - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_START( - kbdev, queue); - - status = kbase_kcpu_jit_allocate_process(queue, cmd); - if (status == -EAGAIN) { - process_next = false; + if (drain_queue) { + /* We still need to call this function to clean the JIT alloc info up */ + kbase_kcpu_jit_allocate_finish(queue, cmd); } else { - if (status != 0) - queue->has_error = true; + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_START( + kbdev, queue); - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_INFO( - kbdev, queue, &cmd->info.jit_alloc, - status); + status = kbase_kcpu_jit_allocate_process(queue, + cmd); + if (status == -EAGAIN) { + process_next = false; + } else { + if (status != 0) + queue->has_error = true; - kbase_kcpu_jit_allocate_finish(queue, cmd); - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_END( + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_INFO( + kbdev, queue, + &cmd->info.jit_alloc, status); + + kbase_kcpu_jit_allocate_finish(queue, + cmd); + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_JIT_ALLOC_END( kbdev, queue); + } } + break; } case BASE_KCPU_COMMAND_TYPE_JIT_FREE: @@ -1748,56 +1761,39 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, struct kbase_suspend_copy_buffer *sus_buf = cmd->info.suspend_buf_copy.sus_buf; - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_START( - kbdev, queue); + if (!drain_queue) { + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_START( + kbdev, queue); - status = kbase_csf_queue_group_suspend_process( + status = kbase_csf_queue_group_suspend_process( queue->kctx, sus_buf, cmd->info.suspend_buf_copy.group_handle); - if (status) - queue->has_error = true; + if (status) + queue->has_error = true; - KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_END( - kbdev, queue, status); + KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_END( + kbdev, queue, status); - if (!sus_buf->cpu_alloc) { - int i; + if (!sus_buf->cpu_alloc) { + int i; - for (i = 0; i < sus_buf->nr_pages; i++) - put_page(sus_buf->pages[i]); - } else { - kbase_mem_phy_alloc_kernel_unmapped( - sus_buf->cpu_alloc); - kbase_mem_phy_alloc_put(sus_buf->cpu_alloc); + for (i = 0; i < sus_buf->nr_pages; i++) + put_page(sus_buf->pages[i]); + } else { + kbase_mem_phy_alloc_kernel_unmapped( + sus_buf->cpu_alloc); + kbase_mem_phy_alloc_put( + sus_buf->cpu_alloc); + } } kfree(sus_buf->pages); kfree(sus_buf); break; } -#if MALI_UNIT_TEST - case BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME: { - u64 time = ktime_get_raw_ns(); - void *target_page = kmap(*cmd->info.sample_time.page); - - if (target_page) { - memcpy(target_page + - cmd->info.sample_time.page_offset, - &time, sizeof(time)); - kunmap(*cmd->info.sample_time.page); - } else { - dev_warn(kbdev->dev, - "Could not kmap target page\n"); - queue->has_error = true; - } - put_page(*cmd->info.sample_time.page); - kfree(cmd->info.sample_time.page); - break; - } -#endif /* MALI_UNIT_TEST */ default: - dev_warn(kbdev->dev, - "Unrecognized command type\n"); + dev_dbg(kbdev->dev, + "Unrecognized command type"); break; } /* switch */ @@ -1933,14 +1929,6 @@ static void KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_COMMAND( kbdev, queue, cmd->info.suspend_buf_copy.sus_buf, cmd->info.suspend_buf_copy.group_handle); break; -#if MALI_UNIT_TEST - case 
BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME: - /* - * This is test-only KCPU command, no need to have a timeline - * entry - */ - break; -#endif /* MALI_UNIT_TEST */ } } @@ -1966,8 +1954,8 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, * in the set. */ if (enq->nr_commands != 1) { - dev_err(kctx->kbdev->dev, - "More than one commands enqueued\n"); + dev_dbg(kctx->kbdev->dev, + "More than one commands enqueued"); return -EINVAL; } @@ -2081,40 +2069,9 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, &command.info.suspend_buf_copy, kcpu_cmd); break; -#if MALI_UNIT_TEST - case BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME: { - int const page_cnt = 1; - - kcpu_cmd->type = BASE_KCPU_COMMAND_TYPE_SAMPLE_TIME; - kcpu_cmd->info.sample_time.page_addr = - command.info.sample_time.time & PAGE_MASK; - kcpu_cmd->info.sample_time.page_offset = - command.info.sample_time.time & ~PAGE_MASK; - kcpu_cmd->info.sample_time.page = kcalloc( - page_cnt, sizeof(struct page *), GFP_KERNEL); - if (!kcpu_cmd->info.sample_time.page) { - ret = -ENOMEM; - } else { - int pinned_pages = get_user_pages_fast( - kcpu_cmd->info.sample_time.page_addr, - page_cnt, 1, - kcpu_cmd->info.sample_time.page); - - if (pinned_pages < 0) { - ret = pinned_pages; - kfree(kcpu_cmd->info.sample_time.page); - } else if (pinned_pages != page_cnt) { - ret = -EINVAL; - kfree(kcpu_cmd->info.sample_time.page); - } - } - - break; - } -#endif /* MALI_UNIT_TEST */ default: - dev_warn(queue->kctx->kbdev->dev, - "Unknown command type %u\n", command.type); + dev_dbg(queue->kctx->kbdev->dev, + "Unknown command type %u", command.type); ret = -EINVAL; break; } diff --git a/mali_kbase/csf/mali_kbase_csf_kcpu.h b/mali_kbase/csf/mali_kbase_csf_kcpu.h index 9964f20..6300569 100644 --- a/mali_kbase/csf/mali_kbase_csf_kcpu.h +++ b/mali_kbase/csf/mali_kbase_csf_kcpu.h @@ -196,13 +196,6 @@ struct kbase_kcpu_command_group_suspend_info { u8 group_handle; }; -#if MALI_UNIT_TEST -struct kbase_kcpu_command_sample_time_info { - u64 page_addr; - u64 page_offset; - struct page **page; -}; -#endif /* MALI_UNIT_TEST */ /** * struct kbase_cpu_command - Command which is to be part of the kernel @@ -235,9 +228,6 @@ struct kbase_kcpu_command { struct kbase_kcpu_command_jit_alloc_info jit_alloc; struct kbase_kcpu_command_jit_free_info jit_free; struct kbase_kcpu_command_group_suspend_info suspend_buf_copy; -#if MALI_UNIT_TEST - struct kbase_kcpu_command_sample_time_info sample_time; -#endif /* MALI_UNIT_TEST */ } info; }; diff --git a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c index f6d61d7..7b63132 100644 --- a/mali_kbase/csf/mali_kbase_csf_reset_gpu.c +++ b/mali_kbase/csf/mali_kbase_csf_reset_gpu.c @@ -461,11 +461,14 @@ static void kbase_csf_reset_gpu_worker(struct work_struct *data) { struct kbase_device *kbdev = container_of(data, struct kbase_device, csf.reset.work); + bool gpu_sleep_mode_active = false; bool firmware_inited; unsigned long flags; int err = 0; const enum kbase_csf_reset_gpu_state initial_reset_state = atomic_read(&kbdev->csf.reset.state); + const bool silent = + kbase_csf_reset_state_is_silent(initial_reset_state); /* Ensure any threads (e.g. 
executing the CSF scheduler) have finished * using the HW @@ -474,14 +477,30 @@ static void kbase_csf_reset_gpu_worker(struct work_struct *data) spin_lock_irqsave(&kbdev->hwaccess_lock, flags); firmware_inited = kbdev->csf.firmware_inited; +#ifdef KBASE_PM_RUNTIME + gpu_sleep_mode_active = kbdev->pm.backend.gpu_sleep_mode_active; +#endif spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - if (!kbase_pm_context_active_handle_suspend(kbdev, - KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE)) { - bool silent = - kbase_csf_reset_state_is_silent(initial_reset_state); + if (unlikely(gpu_sleep_mode_active)) { +#ifdef KBASE_PM_RUNTIME + /* As prior to GPU reset all on-slot groups are suspended, + * need to wake up the MCU from sleep. + * No pm active reference is taken here since GPU is in sleep + * state and both runtime & system suspend synchronize with the + * GPU reset before they wake up the GPU to suspend on-slot + * groups. GPUCORE-29850 would add the proper handling. + */ + kbase_pm_lock(kbdev); + if (kbase_pm_force_mcu_wakeup_after_sleep(kbdev)) + dev_warn(kbdev->dev, "Wait for MCU wake up failed on GPU reset"); + kbase_pm_unlock(kbdev); err = kbase_csf_reset_gpu_now(kbdev, firmware_inited, silent); +#endif + } else if (!kbase_pm_context_active_handle_suspend(kbdev, + KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE)) { + err = kbase_csf_reset_gpu_now(kbdev, firmware_inited, silent); kbase_pm_context_idle(kbdev); } @@ -599,6 +618,8 @@ int kbase_reset_gpu_wait(struct kbase_device *kbdev) if (!remaining) { dev_warn(kbdev->dev, "Timed out waiting for the GPU reset to complete"); + + return -ETIMEDOUT; } else if (atomic_read(&kbdev->csf.reset.state) == KBASE_CSF_RESET_GPU_FAILED) { diff --git a/mali_kbase/csf/mali_kbase_csf_scheduler.c b/mali_kbase/csf/mali_kbase_csf_scheduler.c index 8109570..f22a5d7 100644 --- a/mali_kbase/csf/mali_kbase_csf_scheduler.c +++ b/mali_kbase/csf/mali_kbase_csf_scheduler.c @@ -30,14 +30,16 @@ #include <linux/export.h> #include <uapi/gpu/arm/midgard/csf/mali_gpu_csf_registers.h> #include <uapi/gpu/arm/midgard/mali_base_kernel.h> +#include <mali_kbase_hwaccess_time.h> /* Value to indicate that a queue group is not groups_to_schedule list */ #define KBASEP_GROUP_PREPARED_SEQ_NUM_INVALID (U32_MAX) -/* Waiting timeout for scheduler state change for descheduling a CSG */ -#define CSG_SCHED_STOP_TIMEOUT_MS (50) - -#define CSG_SUSPEND_ON_RESET_WAIT_TIMEOUT_MS DEFAULT_RESET_TIMEOUT_MS +/* This decides the upper limit on the waiting time for the Scheduler + * to exit the sleep state. Usually the value of autosuspend_delay is + * expected to be around 100 milli seconds. + */ +#define MAX_AUTO_SUSPEND_DELAY_MS (5000) /* Maximum number of endpoints which may run tiler jobs. */ #define CSG_TILER_MAX ((u8)1) @@ -75,10 +77,8 @@ /* CS suspended and is wait for a CQS condition */ #define CS_WAIT_SYNC_FLAG (1 << 1) -/* 2 GPU address space slots are reserved for MCU and privileged context for HW - * counter dumping. TODO remove the slot reserved for latter in GPUCORE-26293. - */ -#define NUM_RESERVED_AS_SLOTS (2) +/* A GPU address space slot is reserved for MCU. 
*/ +#define NUM_RESERVED_AS_SLOTS (1) static int scheduler_group_schedule(struct kbase_queue_group *group); static void remove_group_from_idle_wait(struct kbase_queue_group *const group); @@ -94,14 +94,116 @@ static struct kbase_queue_group *get_tock_top_group( static void scheduler_enable_tick_timer_nolock(struct kbase_device *kbdev); static int suspend_active_queue_groups(struct kbase_device *kbdev, unsigned long *slot_mask); +static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev, + bool system_suspend); static void schedule_in_cycle(struct kbase_queue_group *group, bool force); #define kctx_as_enabled(kctx) (!kbase_ctx_flag(kctx, KCTX_AS_DISABLED_ON_FAULT)) +#ifdef KBASE_PM_RUNTIME +/** + * wait_for_scheduler_to_exit_sleep() - Wait for Scheduler to exit the + * sleeping state. + * + * @kbdev: Pointer to the device + * + * This function waits until the Scheduler has exited the sleep state and + * it is called when an on-slot group is terminated or when the suspend + * buffer of an on-slot group needs to be captured. + * + * Return: 0 when the wait is successful, otherwise an error code. + */ +static int wait_for_scheduler_to_exit_sleep(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + int autosuspend_delay = kbdev->dev->power.autosuspend_delay; + unsigned int sleep_exit_wait_time; + long remaining; + int ret = 0; + + lockdep_assert_held(&scheduler->lock); + WARN_ON(scheduler->state != SCHED_SLEEPING); + + /* No point in waiting if autosuspend_delay value is negative. + * For the negative value of autosuspend_delay Driver will directly + * go for the suspend of Scheduler, but the autosuspend_delay value + * could have been changed after the sleep was initiated. + */ + if (autosuspend_delay < 0) + return -EINVAL; + + if (autosuspend_delay > MAX_AUTO_SUSPEND_DELAY_MS) + autosuspend_delay = MAX_AUTO_SUSPEND_DELAY_MS; + + /* Usually Scheduler would remain in sleeping state until the + * auto-suspend timer expires and all active CSGs are suspended. + */ + sleep_exit_wait_time = autosuspend_delay + kbdev->reset_timeout_ms; + + remaining = kbase_csf_timeout_in_jiffies(sleep_exit_wait_time); + + while ((scheduler->state == SCHED_SLEEPING) && !ret) { + mutex_unlock(&scheduler->lock); + remaining = wait_event_timeout( + kbdev->csf.event_wait, + (scheduler->state != SCHED_SLEEPING), + remaining); + mutex_lock(&scheduler->lock); + if (!remaining && (scheduler->state == SCHED_SLEEPING)) + ret = -ETIMEDOUT; + } + + return ret; +} + +/** + * force_scheduler_to_exit_sleep() - Force scheduler to exit sleep state + * + * @kbdev: Pointer to the device + * + * This function will force the Scheduler to exit the sleep state by doing the + * wake up of MCU and suspension of on-slot groups. It is called at the time of + * system suspend. 
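Illustration only: wait_for_scheduler_to_exit_sleep() above derives its wait budget from the runtime PM autosuspend delay and the device reset timeout. A small sketch of that computation follows; the example values are assumptions, not figures from the patch.

/* Sketch of the wait budget used when waiting for the Scheduler to leave
 * the SLEEPING state; example values only.
 */
static unsigned int example_sleep_exit_wait_ms(int autosuspend_delay_ms,
					       unsigned int reset_timeout_ms)
{
	/* A negative autosuspend delay means the caller gives up instead of
	 * waiting (wait_for_scheduler_to_exit_sleep() returns -EINVAL).
	 */
	if (autosuspend_delay_ms < 0)
		return 0;

	/* Clamp to MAX_AUTO_SUSPEND_DELAY_MS (5000 ms). */
	if (autosuspend_delay_ms > 5000)
		autosuspend_delay_ms = 5000;

	/* e.g. a 100 ms autosuspend delay plus the device reset timeout. */
	return (unsigned int)autosuspend_delay_ms + reset_timeout_ms;
}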
+ */ +static void force_scheduler_to_exit_sleep(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + unsigned long flags; + int ret; + + lockdep_assert_held(&scheduler->lock); + WARN_ON(scheduler->state != SCHED_SLEEPING); + WARN_ON(!kbdev->pm.backend.gpu_sleep_mode_active); + + kbase_pm_lock(kbdev); + ret = kbase_pm_force_mcu_wakeup_after_sleep(kbdev); + if (ret) + dev_warn(kbdev->dev, "[%llu] Wait for MCU wake up failed on forced scheduler suspend", + kbase_backend_get_cycle_cnt(kbdev)); + kbase_pm_unlock(kbdev); + + suspend_active_groups_on_powerdown(kbdev, true); + + kbase_pm_lock(kbdev); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbdev->pm.backend.gpu_sleep_mode_active = false; + kbdev->pm.backend.gpu_wakeup_override = false; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + ret = kbase_pm_wait_for_desired_state(kbdev); + if (ret) + dev_warn(kbdev->dev, "[%llu] Wait for pm state change failed on forced scheduler suspend", + kbase_backend_get_cycle_cnt(kbdev)); + kbase_pm_unlock(kbdev); + + scheduler->state = SCHED_SUSPENDED; +} +#endif + /** * tick_timer_callback() - Callback function for the scheduling tick hrtimer * - * @timer: Pointer to the device + * @timer: Pointer to the scheduling tick hrtimer * * This function will enqueue the scheduling tick work item for immediate * execution, if it has not been queued already. @@ -173,14 +275,10 @@ static void cancel_tick_timer(struct kbase_device *kbdev) static void enqueue_tick_work(struct kbase_device *kbdev) { struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; - unsigned long flags; lockdep_assert_held(&scheduler->lock); - spin_lock_irqsave(&scheduler->interrupt_lock, flags); - WARN_ON(scheduler->tick_timer_active); - queue_work(scheduler->wq, &scheduler->tick_work); - spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); + kbase_csf_scheduler_invoke_tick(kbdev); } static void release_doorbell(struct kbase_device *kbdev, int doorbell_nr) @@ -288,11 +386,11 @@ static void scheduler_doorbell_init(struct kbase_device *kbdev) WARN_ON(doorbell_nr != CSF_KERNEL_DOORBELL_NR); } -static u32 get_nr_active_csgs(struct kbase_device *kbdev) +u32 kbase_csf_scheduler_get_nr_active_csgs_locked(struct kbase_device *kbdev) { u32 nr_active_csgs; - lockdep_assert_held(&kbdev->csf.scheduler.lock); + lockdep_assert_held(&kbdev->csf.scheduler.interrupt_lock); nr_active_csgs = bitmap_weight(kbdev->csf.scheduler.csg_inuse_bitmap, kbdev->csf.global_iface.group_num); @@ -300,27 +398,16 @@ static u32 get_nr_active_csgs(struct kbase_device *kbdev) return nr_active_csgs; } -/** - * csgs_active - returns true if any of CSG slots are in use - * - * @kbdev: Instance of a GPU platform device that implements a CSF interface. - * - * Return: the interface is actively engaged flag. - */ -static bool csgs_active(struct kbase_device *kbdev) +u32 kbase_csf_scheduler_get_nr_active_csgs(struct kbase_device *kbdev) { u32 nr_active_csgs; + unsigned long flags; - mutex_lock(&kbdev->csf.scheduler.lock); - nr_active_csgs = get_nr_active_csgs(kbdev); - mutex_unlock(&kbdev->csf.scheduler.lock); + spin_lock_irqsave(&kbdev->csf.scheduler.interrupt_lock, flags); + nr_active_csgs = kbase_csf_scheduler_get_nr_active_csgs_locked(kbdev); + spin_unlock_irqrestore(&kbdev->csf.scheduler.interrupt_lock, flags); - /* Right now if any of the CSG interfaces are in use - * then we need to assume that there is some work pending. 
- * In future when we have IDLE notifications from firmware implemented - * then we would have a better idea of the pending work. - */ - return (nr_active_csgs != 0); + return nr_active_csgs; } /** @@ -395,7 +482,9 @@ static void scheduler_wait_protm_quit(struct kbase_device *kbdev) !kbase_csf_scheduler_protected_mode_in_use(kbdev), wt); if (!remaining) - dev_warn(kbdev->dev, "Timeout, protm_quit wait skipped"); + dev_warn(kbdev->dev, "[%llu] Timeout (%d ms), protm_quit wait skipped", + kbase_backend_get_cycle_cnt(kbdev), + kbdev->csf.fw_timeout_ms); KBASE_KTRACE_ADD(kbdev, SCHEDULER_WAIT_PROTM_QUIT_DONE, NULL, jiffies_to_msecs(remaining)); @@ -483,20 +572,198 @@ static void disable_gpu_idle_fw_timer(struct kbase_device *kbdev) spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); } +/** + * scheduler_pm_active_handle_suspend() - Acquire the PM reference count for + * Scheduler + * + * @kbdev: Pointer to the device + * @suspend_handler: Handler code for how to handle a suspend that might occur. + * + * This function is usually called when Scheduler needs to be activated. + * The PM reference count is acquired for the Scheduler and the power on + * of GPU is initiated. + */ +static int scheduler_pm_active_handle_suspend(struct kbase_device *kbdev, + enum kbase_pm_suspend_handler suspend_handler) +{ + unsigned long flags; + u32 prev_count; + int ret = 0; + + lockdep_assert_held(&kbdev->csf.scheduler.lock); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + prev_count = kbdev->csf.scheduler.pm_active_count; + if (!WARN_ON(prev_count == U32_MAX)) + kbdev->csf.scheduler.pm_active_count++; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + /* On 0 => 1, make a pm_ctx_active request */ + if (!prev_count) { + ret = kbase_pm_context_active_handle_suspend(kbdev, + suspend_handler); + if (ret) { + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbdev->csf.scheduler.pm_active_count--; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } + } + + return ret; +} + +#ifdef KBASE_PM_RUNTIME +/** + * scheduler_pm_active_after_sleep() - Acquire the PM reference count for + * Scheduler + * + * @kbdev: Pointer to the device + * @flags: flags containing previous interrupt state + * + * This function is called when Scheduler needs to be activated from the + * sleeping state. + * The PM reference count is acquired for the Scheduler and the wake up of + * MCU is initiated. It resets the flag that indicates to the MCU state + * machine that MCU needs to be put in sleep state. + * + * Note: This function shall be called with hwaccess lock held and it will + * release that lock. + * + * Return: zero when the PM reference was taken and non-zero when the + * system is being suspending/suspended. 
+ */
+static int scheduler_pm_active_after_sleep(struct kbase_device *kbdev,
+ unsigned long flags)
+{
+ u32 prev_count;
+ int ret = 0;
+
+ lockdep_assert_held(&kbdev->csf.scheduler.lock);
+ lockdep_assert_held(&kbdev->hwaccess_lock);
+
+ prev_count = kbdev->csf.scheduler.pm_active_count;
+ if (!WARN_ON(prev_count == U32_MAX))
+ kbdev->csf.scheduler.pm_active_count++;
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+
+ /* On 0 => 1, make a pm_ctx_active request */
+ if (!prev_count) {
+ ret = kbase_pm_context_active_handle_suspend(kbdev,
+ KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE);
+
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ if (ret)
+ kbdev->csf.scheduler.pm_active_count--;
+ else
+ kbdev->pm.backend.gpu_sleep_mode_active = false;
+ kbase_pm_update_state(kbdev);
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ }
+
+ return ret;
+}
+#endif
+
+/**
+ * scheduler_pm_idle() - Release the PM reference count held by Scheduler
+ *
+ * @kbdev: Pointer to the device
+ *
+ * This function is usually called after Scheduler is suspended.
+ * The PM reference count held by the Scheduler is released to trigger the
+ * power down of GPU.
+ */
+static void scheduler_pm_idle(struct kbase_device *kbdev)
+{
+ unsigned long flags;
+ u32 prev_count;
+
+ lockdep_assert_held(&kbdev->csf.scheduler.lock);
+
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ prev_count = kbdev->csf.scheduler.pm_active_count;
+ if (!WARN_ON(prev_count == 0))
+ kbdev->csf.scheduler.pm_active_count--;
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+
+ if (prev_count == 1)
+ kbase_pm_context_idle(kbdev);
+}
+
+#ifdef KBASE_PM_RUNTIME
+/**
+ * scheduler_pm_idle_before_sleep() - Release the PM reference count and
+ * trigger the transition to sleep state.
+ *
+ * @kbdev: Pointer to the device
+ *
+ * This function is called on the GPU idle notification. It releases the
+ * Scheduler's PM reference count and sets the flag to indicate to the
+ * MCU state machine that MCU needs to be put in sleep state.
+ */ +static void scheduler_pm_idle_before_sleep(struct kbase_device *kbdev) +{ + unsigned long flags; + u32 prev_count; + + lockdep_assert_held(&kbdev->csf.scheduler.lock); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + prev_count = kbdev->csf.scheduler.pm_active_count; + if (!WARN_ON(prev_count == 0)) + kbdev->csf.scheduler.pm_active_count--; + kbdev->pm.backend.gpu_sleep_mode_active = true; + kbdev->pm.backend.exit_gpu_sleep_mode = false; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (prev_count == 1) + kbase_pm_context_idle(kbdev); +} +#endif + static void scheduler_wakeup(struct kbase_device *kbdev, bool kick) { struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + int ret; lockdep_assert_held(&scheduler->lock); + if ((scheduler->state != SCHED_SUSPENDED) && + (scheduler->state != SCHED_SLEEPING)) + return; + if (scheduler->state == SCHED_SUSPENDED) { - dev_dbg(kbdev->dev, "Re-activating the Scheduler"); - kbase_csf_scheduler_pm_active(kbdev); - scheduler->state = SCHED_INACTIVE; + dev_dbg(kbdev->dev, + "Re-activating the Scheduler after suspend"); + ret = scheduler_pm_active_handle_suspend(kbdev, + KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE); + } else { +#ifdef KBASE_PM_RUNTIME + unsigned long flags; - if (kick) - scheduler_enable_tick_timer_nolock(kbdev); + dev_dbg(kbdev->dev, + "Re-activating the Scheduler out of sleep"); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + ret = scheduler_pm_active_after_sleep(kbdev, flags); + /* hwaccess_lock is released in the previous function call. */ +#endif } + + if (ret) { + /* GPUCORE-29850 would add the handling for the case where + * Scheduler could not be activated due to system suspend. + */ + dev_info(kbdev->dev, + "Couldn't wakeup Scheduler due to system suspend"); + return; + } + + scheduler->state = SCHED_INACTIVE; + + if (kick) + scheduler_enable_tick_timer_nolock(kbdev); } static void scheduler_suspend(struct kbase_device *kbdev) @@ -507,7 +774,7 @@ static void scheduler_suspend(struct kbase_device *kbdev) if (!WARN_ON(scheduler->state == SCHED_SUSPENDED)) { dev_dbg(kbdev->dev, "Suspending the Scheduler"); - kbase_csf_scheduler_pm_idle(kbdev); + scheduler_pm_idle(kbdev); scheduler->state = SCHED_SUSPENDED; } } @@ -542,11 +809,30 @@ static void update_idle_suspended_group_state(struct kbase_queue_group *group) /* If scheduler is not suspended and the given group's * static priority (reflected by the scan_seq_num) is inside - * the current tick slot-range, schedules an async tock. + * the current tick slot-range, or there are some on_slot + * idle groups, schedule an async tock. */ - if (scheduler->state != SCHED_SUSPENDED && - group->scan_seq_num < scheduler->num_csg_slots_for_tick) - schedule_in_cycle(group, true); + if (scheduler->state != SCHED_SUSPENDED) { + unsigned long flags; + int n_idle; + int n_used; + int n_slots = + group->kctx->kbdev->csf.global_iface.group_num; + + spin_lock_irqsave(&scheduler->interrupt_lock, flags); + n_idle = bitmap_weight(scheduler->csg_slots_idle_mask, + n_slots); + n_used = bitmap_weight(scheduler->csg_inuse_bitmap, + n_slots); + spin_unlock_irqrestore(&scheduler->interrupt_lock, + flags); + + if (n_idle || + n_used < scheduler->num_csg_slots_for_tick || + group->scan_seq_num < + scheduler->num_csg_slots_for_tick) + schedule_in_cycle(group, true); + } } else return; @@ -586,6 +872,14 @@ int kbase_csf_scheduler_group_get_slot(struct kbase_queue_group *group) return slot_num; } +/* kbasep_csf_scheduler_group_is_on_slot_locked() - Check if CSG is on slot. 
+ * + * @group: GPU queue group to be checked + * + * This function needs to be called with scheduler's lock held + * + * Return: true if @group is on slot. + */ static bool kbasep_csf_scheduler_group_is_on_slot_locked( struct kbase_queue_group *group) { @@ -653,11 +947,13 @@ static int halt_stream_sync(struct kbase_queue *queue) == CS_ACK_STATE_START), remaining); if (!remaining) { - dev_warn(kbdev->dev, "Timed out waiting for queue to start on csi %d bound to group %d on slot %d", + dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for queue to start on csi %d bound to group %d on slot %d", + kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, csi_index, group->handle, group->csg_nr); if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); + return -ETIMEDOUT; } @@ -678,7 +974,8 @@ static int halt_stream_sync(struct kbase_queue *queue) == CS_ACK_STATE_STOP), remaining); if (!remaining) { - dev_warn(kbdev->dev, "Timed out waiting for queue to stop on csi %d bound to group %d on slot %d", + dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for queue to stop on csi %d bound to group %d on slot %d", + kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, queue->csi_index, group->handle, group->csg_nr); /* TODO GPUCORE-25328: The CSG can't be terminated, the GPU @@ -686,6 +983,8 @@ static int halt_stream_sync(struct kbase_queue *queue) */ if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); + + } return (remaining) ? 0 : -ETIMEDOUT; } @@ -739,6 +1038,8 @@ static int sched_halt_stream(struct kbase_queue *queue) long remaining; int slot; int err = 0; + const u32 group_schedule_timeout = + 20 * kbdev->csf.scheduler.csg_scheduling_period_ms; if (WARN_ON(!group)) return -EINVAL; @@ -782,8 +1083,7 @@ retry: */ remaining = wait_event_timeout( kbdev->csf.event_wait, can_halt_stream(kbdev, group), - kbase_csf_timeout_in_jiffies( - 20 * kbdev->csf.scheduler.csg_scheduling_period_ms)); + kbase_csf_timeout_in_jiffies(group_schedule_timeout)); mutex_lock(&scheduler->lock); @@ -845,26 +1145,62 @@ retry: kbase_csf_firmware_cs_output( stream, CS_ACK)) == CS_ACK_STATE_STOP), - kbdev->csf.fw_timeout_ms); + kbase_csf_timeout_in_jiffies(kbdev->csf.fw_timeout_ms)); if (!remaining) { dev_warn(kbdev->dev, - "Timed out waiting for queue stop ack on csi %d bound to group %d on slot %d", + "[%llu] Timeout (%d ms) waiting for queue stop ack on csi %d bound to group %d on slot %d", + kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, queue->csi_index, group->handle, group->csg_nr); + + err = -ETIMEDOUT; } } } } else if (!remaining) { - dev_warn(kbdev->dev, "Group-%d failed to get a slot for stopping the queue on csi %d", - group->handle, queue->csi_index); + dev_warn(kbdev->dev, "[%llu] Group-%d failed to get a slot for stopping the queue on csi %d (timeout %d ms)", + kbase_backend_get_cycle_cnt(kbdev), + group->handle, queue->csi_index, + group_schedule_timeout); + + err = -ETIMEDOUT; } return err; } +/** + * scheduler_activate_on_queue_stop() - Activate the Scheduler when the GPU + * queue needs to be stopped. + * + * @queue: Pointer the GPU command queue + * + * This function is called when the CSI to which GPU queue is bound needs to + * be stopped. For that the corresponding queue group needs to be resident on + * the CSG slot and MCU firmware should be running. So this function makes the + * Scheduler exit the sleeping or suspended state. 
+ */ +static void scheduler_activate_on_queue_stop(struct kbase_queue *queue) +{ + struct kbase_device *kbdev = queue->kctx->kbdev; + + scheduler_wakeup(kbdev, true); + + /* Wait for MCU firmware to start running */ + if (kbase_csf_scheduler_wait_mcu_active(kbdev)) { + dev_warn( + kbdev->dev, + "[%llu] Wait for MCU active failed for stopping queue on csi %d bound to group %d of context %d_%d on slot %d", + kbase_backend_get_cycle_cnt(kbdev), + queue->csi_index, queue->group->handle, + queue->kctx->tgid, queue->kctx->id, + queue->group->csg_nr); + } +} + int kbase_csf_scheduler_queue_stop(struct kbase_queue *queue) { struct kbase_device *kbdev = queue->kctx->kbdev; @@ -890,7 +1226,7 @@ int kbase_csf_scheduler_queue_stop(struct kbase_queue *queue) /* Since the group needs to be resumed in order to stop the queue, * check if GPU needs to be powered up. */ - scheduler_wakeup(kbdev, true); + scheduler_activate_on_queue_stop(queue); if ((slot >= 0) && (atomic_read(&csg_slot[slot].state) == CSG_SLOT_RUNNING)) @@ -1228,7 +1564,9 @@ static void halt_csg_slot(struct kbase_queue_group *group, bool suspend) csg_slot_running(kbdev, slot), remaining); if (!remaining) dev_warn(kbdev->dev, - "slot %d timed out on up-running\n", slot); + "[%llu] slot %d timeout (%d ms) on up-running\n", + kbase_backend_get_cycle_cnt(kbdev), + slot, kbdev->csf.fw_timeout_ms); } if (csg_slot_running(kbdev, slot)) { @@ -1251,6 +1589,8 @@ static void halt_csg_slot(struct kbase_queue_group *group, bool suspend) csg_slot[slot].trigger_jiffies = jiffies; KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_SLOT_STOP, group, halt_cmd); + KBASE_TLSTREAM_TL_KBASE_DEVICE_HALT_CSG( + kbdev, kbdev->gpu_props.props.raw_props.gpu_id, slot); kbase_csf_ring_csg_doorbell(kbdev, slot); } } @@ -1399,37 +1739,6 @@ bool save_slot_cs(struct kbase_csf_cmd_stream_group_info const *const ginfo, return is_waiting; } -/** - * Calculate how far in the future an event should be scheduled. - * - * The objective of this function is making sure that a minimum period of - * time is guaranteed between handling two consecutive events. - * - * This function guarantees a minimum period of time between two consecutive - * events: given the minimum period and the distance between the current time - * and the last event, the function returns the difference between the two. - * However, if more time than the minimum period has already elapsed - * since the last event, the function will return 0 to schedule work to handle - * the event with the lowest latency possible. - * - * @last_event: Timestamp of the last event, in jiffies. - * @time_now: Timestamp of the new event to handle, in jiffies. - * Must be successive to last_event. - * @period: Minimum period between two events, in jiffies. - * - * Return: Time to delay work to handle the current event, in jiffies - */ -static unsigned long get_schedule_delay(unsigned long last_event, - unsigned long time_now, - unsigned long period) -{ - const unsigned long t_distance = time_now - last_event; - const unsigned long delay_t = (t_distance < period) ? 
- (period - t_distance) : 0;
-
- return delay_t;
-}
-
static void schedule_in_cycle(struct kbase_queue_group *group, bool force)
{
struct kbase_context *kctx = group->kctx;
@@ -1446,13 +1755,10 @@ static void schedule_in_cycle(struct kbase_queue_group *group, bool force)
*/
if ((likely(scheduler_timer_is_enabled_nolock(kbdev)) || force) &&
!scheduler->tock_pending_request) {
- const unsigned long delay =
- get_schedule_delay(scheduler->last_schedule, jiffies,
- CSF_SCHEDULER_TIME_TOCK_JIFFIES);
scheduler->tock_pending_request = true;
dev_dbg(kbdev->dev, "Kicking async for group %d\n", group->handle);
- mod_delayed_work(scheduler->wq, &scheduler->tock_work, delay);
+ mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0);
}
}
@@ -1494,7 +1800,8 @@ void insert_group_to_runnable(struct kbase_csf_scheduler *const scheduler,
if (likely(scheduler_timer_is_enabled_nolock(kbdev)) &&
(scheduler->total_runnable_grps == 1 ||
- scheduler->state == SCHED_SUSPENDED)) {
+ scheduler->state == SCHED_SUSPENDED ||
+ scheduler->state == SCHED_SLEEPING)) {
dev_dbg(kbdev->dev, "Kicking scheduler on first runnable group\n");
/* Fire a scheduling to start the time-slice */
enqueue_tick_work(kbdev);
@@ -1516,6 +1823,7 @@ void remove_group_from_runnable(struct kbase_csf_scheduler *const scheduler,
struct kbase_queue_group *new_head_grp;
struct list_head *list =
&kctx->csf.sched.runnable_groups[group->priority];
+ unsigned long flags;
lockdep_assert_held(&scheduler->lock);
@@ -1524,6 +1832,30 @@ void remove_group_from_runnable(struct kbase_csf_scheduler *const scheduler,
group->run_state = run_state;
list_del_init(&group->link);
+ spin_lock_irqsave(&scheduler->interrupt_lock, flags);
+ /* The below condition will be true when the group running in protected
+ * mode is being terminated but the protected mode exit interrupt wasn't
+ * received. This can happen if the FW got stuck during protected mode
+ * for some reason (like GPU page fault or some internal error).
+ * In normal cases FW is expected to send the protected mode exit
+ * interrupt before it handles the CSG termination request.
+ */
+ if (unlikely(scheduler->active_protm_grp == group)) {
+ /* CSG slot cleanup should have happened for the pmode group */
+ WARN_ON(kbasep_csf_scheduler_group_is_on_slot_locked(group));
+ WARN_ON(group->run_state != KBASE_CSF_GROUP_INACTIVE);
+ /* Initiate a GPU reset, in case it wasn't initiated yet,
+ * in order to rectify the anomaly.
+ */
+ if (kbase_prepare_to_reset_gpu(kctx->kbdev, RESET_FLAGS_NONE))
+ kbase_reset_gpu(kctx->kbdev);
+
+ KBASE_KTRACE_ADD_CSF_GRP(kctx->kbdev, SCHEDULER_EXIT_PROTM,
+ scheduler->active_protm_grp, 0u);
+ scheduler->active_protm_grp = NULL;
+ }
+ spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
+
if (scheduler->top_grp == group) {
/*
* Note: this disables explicit rotation in the next scheduling
@@ -2025,6 +2357,9 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot,
kbase_csf_firmware_csg_input(ginfo, CSG_ALLOW_OTHER,
tiler_mask & U32_MAX);
+ /* Register group UID with firmware */
+ kbase_csf_firmware_csg_input(ginfo, CSG_ITER_TRACE_CONFIG,
+ group->group_uid);
ep_cfg = CSG_EP_REQ_COMPUTE_EP_SET(ep_cfg, compute_max);
ep_cfg = CSG_EP_REQ_FRAGMENT_EP_SET(ep_cfg, fragment_max);
@@ -2077,8 +2412,9 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot,
csg_slot->priority = prio;
/* Trace the programming of the CSG on the slot */
- KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG(kbdev,
- kbdev->gpu_props.props.raw_props.gpu_id, group->handle, slot);
+ KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG(
+ kbdev, kbdev->gpu_props.props.raw_props.gpu_id, group->kctx->id,
+ group->handle, slot);
dev_dbg(kbdev->dev, "Starting group %d of context %d_%d on slot %d with priority %u\n",
group->handle, kctx->tgid, kctx->id, slot, prio);
@@ -2175,11 +2511,14 @@ static int term_group_sync(struct kbase_queue_group *group)
csg_slot_stopped_locked(kbdev, group->csg_nr), remaining);
if (!remaining) {
- dev_warn(kbdev->dev, "term request timed out for group %d of context %d_%d on slot %d",
+ dev_warn(kbdev->dev, "[%llu] term request timeout (%d ms) for group %d of context %d_%d on slot %d",
+ kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms,
group->handle, group->kctx->tgid,
group->kctx->id, group->csg_nr);
if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE))
kbase_reset_gpu(kbdev);
+
+ err = -ETIMEDOUT;
}
@@ -2190,46 +2529,70 @@ void kbase_csf_scheduler_group_deschedule(struct kbase_queue_group *group)
{
struct kbase_device *kbdev = group->kctx->kbdev;
struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler;
- long remaining =
- kbase_csf_timeout_in_jiffies(CSG_SCHED_STOP_TIMEOUT_MS);
- bool force = false;
+ bool on_slot;
kbase_reset_gpu_assert_failed_or_prevented(kbdev);
lockdep_assert_held(&group->kctx->csf.lock);
mutex_lock(&scheduler->lock);
KBASE_KTRACE_ADD_CSF_GRP(kbdev, GROUP_DESCHEDULE, group, group->run_state);
- while (queue_group_scheduled_locked(group)) {
- u32 saved_state = scheduler->state;
-
- if (!kbasep_csf_scheduler_group_is_on_slot_locked(group)) {
- sched_evict_group(group, false, true);
- } else if (saved_state == SCHED_INACTIVE || force) {
- bool as_faulty;
-
- term_group_sync(group);
- /* Treat the csg been terminated */
- as_faulty = cleanup_csg_slot(group);
- /* remove from the scheduler list */
- sched_evict_group(group, as_faulty, false);
- }
+ if (!queue_group_scheduled_locked(group))
+ goto unlock;
- /* waiting scheduler state to change */
- if (queue_group_scheduled_locked(group)) {
- mutex_unlock(&scheduler->lock);
- remaining = wait_event_timeout(
- kbdev->csf.event_wait,
- saved_state != scheduler->state,
- remaining);
- if (!remaining) {
- dev_warn(kbdev->dev, "Scheduler state change wait timed out for group %d on slot %d",
- group->handle, group->csg_nr);
- force = true;
- }
- mutex_lock(&scheduler->lock);
+ on_slot = kbasep_csf_scheduler_group_is_on_slot_locked(group);
+
+#ifdef KBASE_PM_RUNTIME
+ /* If the queue group
is on slot and Scheduler is in SLEEPING state,
+ * then we need to wait here for Scheduler to exit the sleep state
+ * (i.e. wait for the runtime suspend or power down of GPU). This would
+ * be better than aborting the power down. The group will be suspended
+ * anyways on power down, so won't have to send the CSG termination
+ * request to FW.
+ */
+ if (on_slot && (scheduler->state == SCHED_SLEEPING)) {
+ if (wait_for_scheduler_to_exit_sleep(kbdev)) {
+ dev_warn(
+ kbdev->dev,
+ "Wait for scheduler to exit sleep state timed out when terminating group %d of context %d_%d on slot %d",
+ group->handle, group->kctx->tgid,
+ group->kctx->id, group->csg_nr);
+
+ scheduler_wakeup(kbdev, true);
+
+ /* Wait for MCU firmware to start running */
+ if (kbase_csf_scheduler_wait_mcu_active(kbdev))
+ dev_warn(
+ kbdev->dev,
+ "[%llu] Wait for MCU active failed when terminating group %d of context %d_%d on slot %d",
+ kbase_backend_get_cycle_cnt(kbdev),
+ group->handle, group->kctx->tgid,
+ group->kctx->id, group->csg_nr);
}
+
+ /* Check the group state again as scheduler lock would have been
+ * released when waiting for the exit from SLEEPING state.
+ */
+ if (!queue_group_scheduled_locked(group))
+ goto unlock;
+
+ on_slot = kbasep_csf_scheduler_group_is_on_slot_locked(group);
+ }
+#endif
+ if (!on_slot) {
+ sched_evict_group(group, false, true);
+ } else {
+ bool as_faulty;
+
+ term_group_sync(group);
+ /* Treat the csg been terminated */
+ as_faulty = cleanup_csg_slot(group);
+ /* remove from the scheduler list */
+ sched_evict_group(group, as_faulty, false);
}
+ WARN_ON(queue_group_scheduled_locked(group));
+
+unlock:
mutex_unlock(&scheduler->lock);
}
@@ -2684,9 +3047,11 @@ static void program_suspending_csg_slots(struct kbase_device *kbdev)
*/
dev_warn(
kbdev->dev,
- "Group %d of context %d_%d on slot %u failed to suspend",
+ "[%llu] Group %d of context %d_%d on slot %u failed to suspend (timeout %d ms)",
+ kbase_backend_get_cycle_cnt(kbdev),
group->handle, group->kctx->tgid,
- group->kctx->id, i);
+ group->kctx->id, i,
+ kbdev->csf.fw_timeout_ms);
/* The group has failed suspension, stop
* further examination.
@@ -2784,7 +3149,9 @@ static void wait_csg_slots_start(struct kbase_device *kbdev)
group->run_state = KBASE_CSF_GROUP_RUNNABLE;
}
}
} else {
- dev_warn(kbdev->dev, "Timed out waiting for CSG slots to start, slots: 0x%*pb\n",
+ dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for CSG slots to start, slots: 0x%*pb\n",
+ kbase_backend_get_cycle_cnt(kbdev),
+ kbdev->csf.fw_timeout_ms,
num_groups, slot_mask);
if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE))
@@ -2904,9 +3271,12 @@ static int wait_csg_slots_handshake_ack(struct kbase_device *kbdev,
if (remaining)
bitmap_andnot(slot_mask, slot_mask, dones, num_groups);
- else
+ else {
+
+ /* Timed-out on the wait */
return -ETIMEDOUT;
+ }
}
return 0;
@@ -2929,7 +3299,9 @@ static void wait_csg_slots_finish_prio_update(struct kbase_device *kbdev)
*/
dev_warn(
kbdev->dev,
- "Timeout on CSG_REQ:EP_CFG, skipping the update wait: slot mask=0x%lx",
+ "[%llu] Timeout (%d ms) on CSG_REQ:EP_CFG, skipping the update wait: slot mask=0x%lx",
+ kbase_backend_get_cycle_cnt(kbdev),
+ kbdev->csf.fw_timeout_ms,
slot_mask[0]);
}
}
@@ -3075,7 +3447,11 @@ static void scheduler_group_check_protm_enter(struct kbase_device *const kbdev,
spin_lock_irqsave(&scheduler->interrupt_lock, flags);
- protm_in_use = kbase_csf_scheduler_protected_mode_in_use(kbdev);
+ /* Check if the previous transition to enter & exit the protected
+ * mode has completed or not.
+ */
+ protm_in_use = kbase_csf_scheduler_protected_mode_in_use(kbdev) ||
+ kbdev->protected_mode;
KBASE_KTRACE_ADD_CSF_GRP(kbdev, SCHEDULER_CHECK_PROTM_ENTER, input_grp,
protm_in_use);
@@ -3123,8 +3499,10 @@ static void scheduler_group_check_protm_enter(struct kbase_device *const kbdev,
KBASE_KTRACE_ADD_CSF_GRP(kbdev, SCHEDULER_ENTER_PROTM,
input_grp, 0u);
- spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
kbase_csf_enter_protected_mode(kbdev);
+ spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
+
+ kbase_csf_wait_protected_mode_enter(kbdev);
return;
}
}
@@ -3433,7 +3811,9 @@ static void scheduler_update_idle_slots_status(struct kbase_device *kbdev,
CSG_REQ_STATUS_UPDATE_MASK, csg_bitmap, wt)) {
dev_warn(
kbdev->dev,
- "Timeout on CSG_REQ:STATUS_UPDATE, treat groups as not idle: slot mask=0x%lx",
+ "[%llu] Timeout (%d ms) on CSG_REQ:STATUS_UPDATE, treat groups as not idle: slot mask=0x%lx",
+ kbase_backend_get_cycle_cnt(kbdev),
+ kbdev->csf.fw_timeout_ms,
csg_bitmap[0]);
/* Store the bitmap of timed out slots */
@@ -3576,7 +3956,7 @@ static struct kbase_queue_group *get_tock_top_group(
}
static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev,
- bool is_suspend)
+ bool system_suspend)
{
struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
DECLARE_BITMAP(slot_mask, MAX_SUPPORTED_CSGS) = { 0 };
@@ -3587,15 +3967,19 @@ static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev,
/* The suspend of CSGs failed, trigger the GPU reset and wait
* for it to complete to be in a deterministic state.
*/
- dev_warn(kbdev->dev, "Timed out waiting for CSG slots to suspend on power down, slot_mask: 0x%*pb\n",
+ dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for CSG slots to suspend on power down, slot_mask: 0x%*pb\n",
+ kbase_backend_get_cycle_cnt(kbdev),
+ kbdev->csf.fw_timeout_ms,
kbdev->csf.global_iface.group_num, slot_mask);
if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE))
kbase_reset_gpu(kbdev);
- if (is_suspend) {
+ if (system_suspend) {
mutex_unlock(&scheduler->lock);
+ kbase_reset_gpu_allow(kbdev);
kbase_reset_gpu_wait(kbdev);
+ kbase_reset_gpu_prevent_and_wait(kbdev);
mutex_lock(&scheduler->lock);
}
return -1;
@@ -3604,7 +3988,7 @@ static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev,
/* Check if the groups became active whilst the suspend was ongoing,
* but only for the case where the system suspend is not in progress
*/
- if (!is_suspend && atomic_read(&scheduler->non_idle_offslot_grps))
+ if (!system_suspend && atomic_read(&scheduler->non_idle_offslot_grps))
return -1;
return 0;
@@ -3618,7 +4002,8 @@ static bool scheduler_idle_suspendable(struct kbase_device *kbdev)
lockdep_assert_held(&scheduler->lock);
- if (scheduler->state == SCHED_SUSPENDED)
+ if ((scheduler->state == SCHED_SUSPENDED) ||
+ (scheduler->state == SCHED_SLEEPING))
return false;
spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
@@ -3639,12 +4024,66 @@ static bool scheduler_idle_suspendable(struct kbase_device *kbdev)
return suspend;
}
+#ifdef KBASE_PM_RUNTIME
+/**
+ * scheduler_sleep_on_idle - Put the Scheduler in sleeping state on GPU
+ * becoming idle.
+ *
+ * @kbdev: Pointer to the device.
+ *
+ * This function is called on GPU idle notification to trigger the transition of
+ * GPU to sleep state, where MCU firmware pauses execution and L2 cache is
+ * turned off. Scheduler's state is changed to sleeping and all the active queue
+ * groups remain on the CSG slots.
+ */
+static void scheduler_sleep_on_idle(struct kbase_device *kbdev)
+{
+ struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
+
+ lockdep_assert_held(&scheduler->lock);
+
+ dev_dbg(kbdev->dev,
+ "Scheduler to be put to sleep on GPU becoming idle");
+ cancel_tick_timer(kbdev);
+ scheduler_pm_idle_before_sleep(kbdev);
+ scheduler->state = SCHED_SLEEPING;
+}
+#endif
+
+/**
+ * scheduler_suspend_on_idle - Put the Scheduler in suspended state on GPU
+ * becoming idle.
+ *
+ * @kbdev: Pointer to the device.
+ *
+ * This function is called on GPU idle notification to trigger the power down of
+ * GPU. Scheduler's state is changed to suspended and all the active queue
+ * groups are suspended before halting the MCU firmware.
+ */
+static bool scheduler_suspend_on_idle(struct kbase_device *kbdev)
+{
+ int ret = suspend_active_groups_on_powerdown(kbdev, false);
+
+ if (ret) {
+ dev_dbg(kbdev->dev, "Aborting suspend scheduler (grps: %d)",
+ atomic_read(
+ &kbdev->csf.scheduler.non_idle_offslot_grps));
+ /* Bring forward the next tick */
+ kbase_csf_scheduler_advance_tick(kbdev);
+ return false;
+ }
+
+ dev_dbg(kbdev->dev, "Scheduler to be suspended on GPU becoming idle");
+ scheduler_suspend(kbdev);
+ cancel_tick_timer(kbdev);
+ return true;
+}
+
static void gpu_idle_worker(struct work_struct *work)
{
struct kbase_device *kbdev = container_of(
work, struct kbase_device, csf.scheduler.gpu_idle_work);
struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
- bool reset_active = false;
bool scheduler_is_idle_suspendable = false;
bool all_groups_suspended = false;
@@ -3664,27 +4103,22 @@ static void gpu_idle_worker(struct work_struct *work)
/* Cycle completed, disable the firmware idle timer */
disable_gpu_idle_fw_timer(kbdev);
scheduler_is_idle_suspendable = scheduler_idle_suspendable(kbdev);
- reset_active = kbase_reset_gpu_is_active(kbdev);
- if (scheduler_is_idle_suspendable && !reset_active) {
- all_groups_suspended =
- !suspend_active_groups_on_powerdown(kbdev, false);
-
- if (all_groups_suspended) {
- dev_dbg(kbdev->dev, "Scheduler becomes idle suspended now");
- scheduler_suspend(kbdev);
- cancel_tick_timer(kbdev);
- } else {
- dev_dbg(kbdev->dev, "Aborting suspend scheduler (grps: %d)",
- atomic_read(&scheduler->non_idle_offslot_grps));
- /* Bring forward the next tick */
- kbase_csf_scheduler_advance_tick(kbdev);
- }
+ if (scheduler_is_idle_suspendable) {
+#ifdef KBASE_PM_RUNTIME
+ if (kbase_pm_gpu_sleep_allowed(kbdev) &&
+ scheduler->total_runnable_grps)
+ scheduler_sleep_on_idle(kbdev);
+ else
+#endif
+ all_groups_suspended = scheduler_suspend_on_idle(kbdev);
}
mutex_unlock(&scheduler->lock);
kbase_reset_gpu_allow(kbdev);
KBASE_KTRACE_ADD(kbdev, IDLE_WORKER_END, NULL,
- __ENCODE_KTRACE_INFO(reset_active, scheduler_is_idle_suspendable, all_groups_suspended));
+ __ENCODE_KTRACE_INFO(false,
+ scheduler_is_idle_suspendable,
+ all_groups_suspended));
#undef __ENCODE_KTRACE_INFO
}
@@ -3777,32 +4211,151 @@ static void scheduler_handle_idle_timer_onoff(struct kbase_device *kbdev)
enable_gpu_idle_fw_timer(kbdev);
}
-static void schedule_actions(struct kbase_device *kbdev)
+/**
+ * keep_lru_on_slots() - Check the condition for LRU is met.
+ *
+ * This function tries to maintain the Last-Recent-Use case on slots, when
+ * the scheduler has no non-idle off-slot CSGs for a replacement
+ * consideration. This effectively extends the previous scheduling results
+ * for the new one. That is, the last recent used CSGs are retained on slots
+ * for the new tick/tock action.
+ *
+ * @kbdev: Pointer to the device.
+ *
+ * Return: true for avoiding on-slot CSGs changes (i.e. keep existing LRU),
+ * otherwise false.
+ */
+static bool keep_lru_on_slots(struct kbase_device *kbdev)
+{
+ struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler;
+ bool keep_lru = false;
+ int on_slots = bitmap_weight(scheduler->csg_inuse_bitmap,
+ kbdev->csf.global_iface.group_num);
+
+ lockdep_assert_held(&scheduler->lock);
+
+ if (on_slots && !atomic_read(&scheduler->non_idle_offslot_grps)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&scheduler->interrupt_lock, flags);
+ /* All on-slots are idle, no non-idle off-slot CSGs available
+ * for considering a meaningful change. Set keep_lru.
+ */
+ keep_lru = kbase_csf_scheduler_all_csgs_idle(kbdev);
+
+ if (keep_lru && !scheduler->gpu_idle_fw_timer_enabled) {
+ scheduler->gpu_idle_fw_timer_enabled = true;
+ kbase_csf_firmware_enable_gpu_idle_timer(kbdev);
+ }
+ spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
+
+ dev_dbg(kbdev->dev, "Keep_LRU: %d, CSGs on-slots: %d\n",
+ keep_lru, on_slots);
+ }
+
+ return keep_lru;
+}
+
+/**
+ * prepare_fast_local_tock() - make preparation arrangements for exercising
+ * a fast local tock inside scheduling-actions.
+ *
+ * The function assumes that a scheduling action of firing a fast local tock
+ * call (i.e. an equivalent tock action without dropping the lock) is desired
+ * if there are idle onslot CSGs. The function updates those affected CSGs'
+ * run-state as a preparation. This should only be called from inside the
+ * schedule_actions(), where the previous idle-flags are still considered to
+ * be reflective, following its earlier idle confirmation operational call,
+ * plus some potential newly idle CSGs in the scheduling action committing
+ * steps.
+ *
+ * @kbdev: Pointer to the GPU device.
+ *
+ * Return: number of on-slots CSGs that can be considered for replacing.
+ */
+static int prepare_fast_local_tock(struct kbase_device *kbdev)
+{
+ struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler;
+ u32 num_groups = kbdev->csf.global_iface.group_num;
+ unsigned long flags, i;
+ DECLARE_BITMAP(csg_bitmap, MAX_SUPPORTED_CSGS) = { 0 };
+
+ lockdep_assert_held(&scheduler->lock);
+
+ spin_lock_irqsave(&scheduler->interrupt_lock, flags);
+ bitmap_copy(csg_bitmap, scheduler->csg_slots_idle_mask, num_groups);
+ spin_unlock_irqrestore(&scheduler->interrupt_lock, flags);
+
+ /* Mark the flagged idle CSGs' run state as IDLE, so
+ * the intended fast local tock can replace them with off-slot
+ * non-idle CSGs.
+ */ + for_each_set_bit(i, csg_bitmap, num_groups) { + struct kbase_csf_csg_slot *csg_slot = &scheduler->csg_slots[i]; + struct kbase_queue_group *group = csg_slot->resident_group; + + if (!queue_group_idle_locked(group)) + group->run_state = KBASE_CSF_GROUP_IDLE; + } + + /* Return the number of idle slots for potential replacement */ + return bitmap_weight(csg_bitmap, num_groups); +} + +static void schedule_actions(struct kbase_device *kbdev, bool is_tick) { struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; unsigned long flags; struct kbase_queue_group *protm_grp; int ret; + bool skip_scheduling_actions; bool skip_idle_slots_update; bool new_protm_top_grp = false; + int local_tock_slots = 0; kbase_reset_gpu_assert_prevented(kbdev); lockdep_assert_held(&scheduler->lock); - ret = kbase_pm_wait_for_desired_state(kbdev); + ret = kbase_csf_scheduler_wait_mcu_active(kbdev); if (ret) { - dev_err(kbdev->dev, "Wait for MCU power on failed"); + dev_err(kbdev->dev, + "Wait for MCU power on failed on scheduling tick/tock"); return; } spin_lock_irqsave(&scheduler->interrupt_lock, flags); skip_idle_slots_update = kbase_csf_scheduler_protected_mode_in_use(kbdev); + skip_scheduling_actions = + !skip_idle_slots_update && kbdev->protected_mode; spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); - /* Skip updating on-slot idle CSGs if GPU is in protected mode. */ - if (!skip_idle_slots_update) + /* Skip scheduling actions as GPU reset hasn't been performed yet to + * rectify the anomaly that happened when pmode exit interrupt wasn't + * received before the termination of group running in pmode. + */ + if (unlikely(skip_scheduling_actions)) { + dev_info(kbdev->dev, + "Scheduling actions skipped due to anomaly in pmode"); + return; + } + + if (!skip_idle_slots_update) { + /* Updating on-slot idle CSGs when not in protected mode. */ scheduler_handle_idle_slots(kbdev); + /* Determine whether the condition is met for keeping the + * Last-Recent-Use. If true, skipping the remaining action + * steps and thus extending the previous tick's arrangement, + * in particular, no alterations to on-slot CSGs. + */ + if (keep_lru_on_slots(kbdev)) + return; + } + + if (is_tick) + scheduler_rotate(kbdev); + +redo_local_tock: scheduler_prepare(kbdev); spin_lock_irqsave(&scheduler->interrupt_lock, flags); protm_grp = scheduler->active_protm_grp; @@ -3866,6 +4419,21 @@ static void schedule_actions(struct kbase_device *kbdev) if (new_protm_top_grp) { scheduler_group_check_protm_enter(kbdev, scheduler->top_grp); + } else if (!local_tock_slots && + atomic_read(&scheduler->non_idle_offslot_grps)) { + /* If during the scheduling action, we have off-slot + * non-idle CSGs in waiting, if it happens to have + * some new idle slots emerging during the committed + * action steps, trigger a one-off fast local tock. + */ + local_tock_slots = prepare_fast_local_tock(kbdev); + + if (local_tock_slots) { + dev_dbg(kbdev->dev, + "In-cycle %d idle slots available\n", + local_tock_slots); + goto redo_local_tock; + } } return; @@ -3875,13 +4443,66 @@ static void schedule_actions(struct kbase_device *kbdev) return; } +/** + * can_skip_scheduling() - Check if the scheduling actions can be skipped. + * + * @kbdev: Pointer to the device + * + * This function is called on a scheduling tick or tock to determine if the + * scheduling actions can be skipped. + * If Scheduler is in sleeping state and exit from the sleep state is allowed + * then activation of MCU will be triggered. 
The tick or tock work item could + * have been in flight when the state of Scheduler was changed to sleeping. + * + * Return: true if the scheduling actions can be skipped. + */ +static bool can_skip_scheduling(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + + lockdep_assert_held(&scheduler->lock); + + if (scheduler->state == SCHED_SUSPENDED) + return true; + +#ifdef KBASE_PM_RUNTIME + if (scheduler->state == SCHED_SLEEPING) { + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + if (kbdev->pm.backend.exit_gpu_sleep_mode) { + int ret = scheduler_pm_active_after_sleep(kbdev, flags); + /* hwaccess_lock is released in the previous function + * call. + */ + if (!ret) { + scheduler->state = SCHED_INACTIVE; + return false; + } + + dev_info(kbdev->dev, + "Skip scheduling due to system suspend"); + return true; + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + return true; + } +#endif + + return false; +} + static void schedule_on_tock(struct work_struct *work) { struct kbase_device *kbdev = container_of(work, struct kbase_device, csf.scheduler.tock_work.work); struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + int err; - int err = kbase_reset_gpu_try_prevent(kbdev); + /* Tock work item is serviced */ + scheduler->tock_pending_request = false; + + err = kbase_reset_gpu_try_prevent(kbdev); /* Regardless of whether reset failed or is currently happening, exit * early */ @@ -3889,7 +4510,7 @@ static void schedule_on_tock(struct work_struct *work) return; mutex_lock(&scheduler->lock); - if (scheduler->state == SCHED_SUSPENDED) + if (can_skip_scheduling(kbdev)) goto exit_no_schedule_unlock; WARN_ON(!(scheduler->state == SCHED_INACTIVE)); @@ -3897,15 +4518,14 @@ static void schedule_on_tock(struct work_struct *work) /* Undertaking schedule action steps */ KBASE_KTRACE_ADD(kbdev, SCHEDULER_TOCK, NULL, 0u); - schedule_actions(kbdev); + schedule_actions(kbdev, false); - /* Record time information */ + /* Record time information on a non-skipped tock */ scheduler->last_schedule = jiffies; - /* Tock is serviced */ - scheduler->tock_pending_request = false; - scheduler->state = SCHED_INACTIVE; + if (!scheduler->total_runnable_grps) + queue_work(system_wq, &scheduler->gpu_idle_work); mutex_unlock(&scheduler->lock); kbase_reset_gpu_allow(kbdev); @@ -3936,17 +4556,15 @@ static void schedule_on_tick(struct work_struct *work) mutex_lock(&scheduler->lock); WARN_ON(scheduler->tick_timer_active); - if (scheduler->state == SCHED_SUSPENDED) + if (can_skip_scheduling(kbdev)) goto exit_no_schedule_unlock; scheduler->state = SCHED_BUSY; - /* Do scheduling stuff */ - scheduler_rotate(kbdev); /* Undertaking schedule action steps */ KBASE_KTRACE_ADD(kbdev, SCHEDULER_TICK, NULL, scheduler->total_runnable_grps); - schedule_actions(kbdev); + schedule_actions(kbdev, true); /* Record time information */ scheduler->last_schedule = jiffies; @@ -3958,7 +4576,8 @@ static void schedule_on_tick(struct work_struct *work) dev_dbg(kbdev->dev, "scheduling for next tick, num_runnable_groups:%u\n", scheduler->total_runnable_grps); - } + } else if (!scheduler->total_runnable_grps) + queue_work(system_wq, &scheduler->gpu_idle_work); scheduler->state = SCHED_INACTIVE; mutex_unlock(&scheduler->lock); @@ -4024,8 +4643,11 @@ static int wait_csg_slots_suspend(struct kbase_device *kbdev, } } } else { - dev_warn(kbdev->dev, "Timed out waiting for CSG slots to suspend, slot_mask: 0x%*pb\n", + dev_warn(kbdev->dev, "[%llu] Timeout waiting for 
CSG slots to suspend, slot_mask: 0x%*pb\n", + kbase_backend_get_cycle_cnt(kbdev), num_groups, slot_mask_local); + + err = -ETIMEDOUT; } } @@ -4069,7 +4691,7 @@ static int suspend_active_queue_groups_on_reset(struct kbase_device *kbdev) ret = suspend_active_queue_groups(kbdev, slot_mask); if (ret) { - dev_warn(kbdev->dev, "Timed out waiting for CSG slots to suspend before reset, slot_mask: 0x%*pb\n", + dev_warn(kbdev->dev, "Timeout waiting for CSG slots to suspend before reset, slot_mask: 0x%*pb\n", kbdev->csf.global_iface.group_num, slot_mask); } @@ -4088,7 +4710,8 @@ static int suspend_active_queue_groups_on_reset(struct kbase_device *kbdev) ret2 = kbase_gpu_wait_cache_clean_timeout(kbdev, kbdev->reset_timeout_ms); if (ret2) { - dev_warn(kbdev->dev, "Timed out waiting for cache clean to complete before reset"); + dev_warn(kbdev->dev, "[%llu] Timeout waiting for cache clean to complete before reset", + kbase_backend_get_cycle_cnt(kbdev)); if (!ret) ret = ret2; } @@ -4125,7 +4748,8 @@ static bool scheduler_handle_reset_in_protected_mode(struct kbase_device *kbdev) struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; u32 const num_groups = kbdev->csf.global_iface.group_num; struct kbase_queue_group *protm_grp; - bool suspend_on_slot_groups; + bool suspend_on_slot_groups = true; + bool pmode_active; unsigned long flags; u32 csg_nr; @@ -4133,20 +4757,51 @@ static bool scheduler_handle_reset_in_protected_mode(struct kbase_device *kbdev) spin_lock_irqsave(&scheduler->interrupt_lock, flags); protm_grp = scheduler->active_protm_grp; + pmode_active = kbdev->protected_mode; + + if (likely(!protm_grp && !pmode_active)) { + /* Case 1: GPU is not in protected mode or it successfully + * exited protected mode. All on-slot groups can be suspended in + * the regular way before reset. + */ + suspend_on_slot_groups = true; + } else if (protm_grp && pmode_active) { + /* Case 2: GPU went successfully into protected mode and hasn't + * exited from it yet and the protected mode group is still + * active. If there was no fault for the protected mode group + * then it can be suspended in the regular way before reset. + * The other normal mode on-slot groups were already implicitly + * suspended on entry to protected mode so they can be marked as + * suspended right away. + */ + suspend_on_slot_groups = !protm_grp->faulted; + } else if (!protm_grp && pmode_active) { + /* Case 3: GPU went successfully into protected mode and hasn't + * exited from it yet but the protected mode group got deleted. + * This would have happened if the FW got stuck during protected + * mode for some reason (like GPU page fault or some internal + * error). In normal cases FW is expected to send the pmode exit + * interrupt before it handles the CSG termination request. + * The other normal mode on-slot groups would already have been + * implicitly suspended on entry to protected mode so they can be + * marked as suspended right away. + */ + suspend_on_slot_groups = false; + } else if (protm_grp && !pmode_active) { + /* Case 4: GPU couldn't successfully enter protected mode, i.e. + * PROTM_ENTER request had timed out. + * All the on-slot groups need to be suspended in the regular + * way before reset. + */ + suspend_on_slot_groups = true; + } - /* If GPU wasn't in protected mode or had exited it before the GPU reset - * then all the on-slot groups can be suspended in the regular way by - * sending CSG SUSPEND requests to FW. 
- * If there wasn't a fault for protected mode group, then it would - * also need to be suspended in the regular way before the reset. - */ - suspend_on_slot_groups = !(protm_grp && protm_grp->faulted); spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); - if (!protm_grp) + if (likely(!pmode_active)) goto unlock; - /* GPU is in protected mode, so all the on-slot groups barring the + /* GPU hasn't exited protected mode, so all the on-slot groups barring * the protected mode group can be marked as suspended right away. */ for (csg_nr = 0; csg_nr < num_groups; csg_nr++) { @@ -4174,19 +4829,25 @@ unlock: return suspend_on_slot_groups; } +static void cancel_tock_work(struct kbase_csf_scheduler *const scheduler) +{ + cancel_delayed_work_sync(&scheduler->tock_work); + scheduler->tock_pending_request = false; +} + static void scheduler_inner_reset(struct kbase_device *kbdev) { u32 const num_groups = kbdev->csf.global_iface.group_num; struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; unsigned long flags; - WARN_ON(csgs_active(kbdev)); + WARN_ON(kbase_csf_scheduler_get_nr_active_csgs(kbdev)); /* Cancel any potential queued delayed work(s) */ cancel_work_sync(&kbdev->csf.scheduler.gpu_idle_work); cancel_tick_timer(kbdev); cancel_work_sync(&scheduler->tick_work); - cancel_delayed_work_sync(&scheduler->tock_work); + cancel_tock_work(scheduler); cancel_delayed_work_sync(&scheduler->ping_work); mutex_lock(&scheduler->lock); @@ -4292,10 +4953,11 @@ static void firmware_aliveness_monitor(struct work_struct *work) } #endif - if (kbdev->csf.scheduler.state == SCHED_SUSPENDED) + if (kbdev->csf.scheduler.state == SCHED_SUSPENDED || + kbdev->csf.scheduler.state == SCHED_SLEEPING) goto exit; - if (get_nr_active_csgs(kbdev) != 1) + if (kbase_csf_scheduler_get_nr_active_csgs(kbdev) != 1) goto exit; if (kbase_csf_scheduler_protected_mode_in_use(kbdev)) @@ -4307,7 +4969,7 @@ static void firmware_aliveness_monitor(struct work_struct *work) goto exit; } - kbase_pm_wait_for_desired_state(kbdev); + kbase_csf_scheduler_wait_mcu_active(kbdev); err = kbase_csf_firmware_ping_wait(kbdev); @@ -4318,7 +4980,7 @@ static void firmware_aliveness_monitor(struct work_struct *work) if (kbase_prepare_to_reset_gpu( kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); - } else if (get_nr_active_csgs(kbdev) == 1) { + } else if (kbase_csf_scheduler_get_nr_active_csgs(kbdev) == 1) { queue_delayed_work(system_long_wq, &kbdev->csf.scheduler.ping_work, msecs_to_jiffies(FIRMWARE_PING_INTERVAL_MS)); @@ -4337,13 +4999,42 @@ int kbase_csf_scheduler_group_copy_suspend_buf(struct kbase_queue_group *group, struct kbase_context *const kctx = group->kctx; struct kbase_device *const kbdev = kctx->kbdev; struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + bool on_slot; int err = 0; kbase_reset_gpu_assert_prevented(kbdev); lockdep_assert_held(&kctx->csf.lock); mutex_lock(&scheduler->lock); - if (kbasep_csf_scheduler_group_is_on_slot_locked(group)) { + on_slot = kbasep_csf_scheduler_group_is_on_slot_locked(group); + +#ifdef KBASE_PM_RUNTIME + if (on_slot && (scheduler->state == SCHED_SLEEPING)) { + if (wait_for_scheduler_to_exit_sleep(kbdev)) { + dev_warn( + kbdev->dev, + "Wait for scheduler to exit sleep state timedout when copying suspend buffer for group %d of ctx %d_%d on slot %d", + group->handle, group->kctx->tgid, + group->kctx->id, group->csg_nr); + + scheduler_wakeup(kbdev, true); + + /* Wait for MCU firmware to start running */ + if (kbase_csf_scheduler_wait_mcu_active(kbdev)) + dev_warn( + 
kbdev->dev, + "Wait for MCU active failed when copying suspend buffer for group %d of ctx %d_%d on slot %d", + group->handle, group->kctx->tgid, + group->kctx->id, group->csg_nr); + } + + /* Check the group state again as scheduler lock would have been + * released when waiting for the exit from SLEEPING state. + */ + on_slot = kbasep_csf_scheduler_group_is_on_slot_locked(group); + } +#endif + if (on_slot) { DECLARE_BITMAP(slot_mask, MAX_SUPPORTED_CSGS) = {0}; set_bit(kbase_csf_scheduler_group_get_slot(group), slot_mask); @@ -4353,8 +5044,9 @@ int kbase_csf_scheduler_group_copy_suspend_buf(struct kbase_queue_group *group, err = wait_csg_slots_suspend(kbdev, slot_mask, kbdev->csf.fw_timeout_ms); if (err) { - dev_warn(kbdev->dev, "Timed out waiting for the group %d to suspend on slot %d", - group->handle, group->csg_nr); + dev_warn(kbdev->dev, "[%llu] Timeout waiting for the group %d to suspend on slot %d", + kbase_backend_get_cycle_cnt(kbdev), + group->handle, group->csg_nr); goto exit; } } @@ -4547,20 +5239,22 @@ void kbase_csf_scheduler_group_protm_enter(struct kbase_queue_group *group) } /** - * check_sync_update_for_idle_group_protm() - Check the sync wait condition - * for all the queues bound to - * the given group. + * check_sync_update_for_on_slot_group() - Check the sync wait condition + * for all the queues bound to + * the given on-slot group. * - * @group: Pointer to the group that requires evaluation. + * @group: Pointer to the on-slot group that requires evaluation. * * This function is called if the GPU is in protected mode and there are on - * slot idle groups with higher priority than the active protected mode group. + * slot idle groups with higher priority than the active protected mode group + * or this function is called when CQS object is signaled whilst GPU is in + * sleep state. * This function will evaluate the sync condition, if any, of all the queues * bound to the given group. * * Return true if the sync condition of at least one queue has been satisfied. */ -static bool check_sync_update_for_idle_group_protm( +static bool check_sync_update_for_on_slot_group( struct kbase_queue_group *group) { struct kbase_device *const kbdev = group->kctx->kbdev; @@ -4680,7 +5374,7 @@ static bool check_sync_update_for_idle_groups_protm(struct kbase_device *kbdev) * has a higher priority than the protm group, then we * need to exit protected mode. 
*/ - if (check_sync_update_for_idle_group_protm(group)) + if (check_sync_update_for_on_slot_group(group)) exit_protm = true; } } @@ -4688,6 +5382,28 @@ static bool check_sync_update_for_idle_groups_protm(struct kbase_device *kbdev) return exit_protm; } +static void check_sync_update_in_sleep_mode(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + u32 const num_groups = kbdev->csf.global_iface.group_num; + u32 csg_nr; + + lockdep_assert_held(&scheduler->lock); + + for (csg_nr = 0; csg_nr < num_groups; csg_nr++) { + struct kbase_queue_group *const group = + kbdev->csf.scheduler.csg_slots[csg_nr].resident_group; + + if (!group) + continue; + + if (check_sync_update_for_on_slot_group(group)) { + scheduler_wakeup(kbdev, true); + return; + } + } +} + /** * check_group_sync_update_worker() - Check the sync wait condition for all the * blocked queue groups @@ -4709,6 +5425,7 @@ static void check_group_sync_update_worker(struct work_struct *work) struct kbase_context, csf.sched.sync_update_work); struct kbase_device *const kbdev = kctx->kbdev; struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + bool sync_updated = false; mutex_lock(&scheduler->lock); @@ -4719,6 +5436,7 @@ static void check_group_sync_update_worker(struct work_struct *work) list_for_each_entry_safe(group, temp, &kctx->csf.sched.idle_wait_groups, link) { if (group_sync_updated(group)) { + sync_updated = true; /* Move this group back in to the runnable * groups list of the context. */ @@ -4730,8 +5448,17 @@ static void check_group_sync_update_worker(struct work_struct *work) WARN_ON(!list_empty(&kctx->csf.sched.idle_wait_groups)); } - if (check_sync_update_for_idle_groups_protm(kbdev)) + if (check_sync_update_for_idle_groups_protm(kbdev)) { scheduler_force_protm_exit(kbdev); + sync_updated = true; + } + + /* If scheduler is in sleep or suspended state, re-activate it + * to serve on-slot CSGs blocked on CQS which has been signaled. + */ + if (!sync_updated && (scheduler->state == SCHED_SLEEPING)) + check_sync_update_in_sleep_mode(kbdev); + KBASE_KTRACE_ADD(kbdev, GROUP_SYNC_UPDATE_WORKER_END, kctx, 0u); mutex_unlock(&scheduler->lock); @@ -4829,7 +5556,6 @@ int kbase_csf_scheduler_early_init(struct kbase_device *kbdev) INIT_DEFERRABLE_WORK(&scheduler->tock_work, schedule_on_tock); INIT_DEFERRABLE_WORK(&scheduler->ping_work, firmware_aliveness_monitor); - BUILD_BUG_ON(CSF_FIRMWARE_TIMEOUT_MS >= FIRMWARE_PING_INTERVAL_MS); mutex_init(&scheduler->lock); spin_lock_init(&scheduler->interrupt_lock); @@ -4869,16 +5595,22 @@ void kbase_csf_scheduler_term(struct kbase_device *kbdev) { if (kbdev->csf.scheduler.csg_slots) { WARN_ON(atomic_read(&kbdev->csf.scheduler.non_idle_offslot_grps)); - WARN_ON(csgs_active(kbdev)); + /* The unload of Driver can take place only when all contexts have + * been terminated. The groups that were not terminated by the User + * are terminated on context termination. So no CSGs are expected + * to be active at the time of Driver unload. 
+ */ + WARN_ON(kbase_csf_scheduler_get_nr_active_csgs(kbdev)); flush_work(&kbdev->csf.scheduler.gpu_idle_work); mutex_lock(&kbdev->csf.scheduler.lock); + if (WARN_ON(kbdev->csf.scheduler.state != SCHED_SUSPENDED)) scheduler_suspend(kbdev); mutex_unlock(&kbdev->csf.scheduler.lock); cancel_delayed_work_sync(&kbdev->csf.scheduler.ping_work); cancel_tick_timer(kbdev); cancel_work_sync(&kbdev->csf.scheduler.tick_work); - cancel_delayed_work_sync(&kbdev->csf.scheduler.tock_work); + cancel_tock_work(&kbdev->csf.scheduler); mutex_destroy(&kbdev->csf.scheduler.lock); kfree(kbdev->csf.scheduler.csg_slots); kbdev->csf.scheduler.csg_slots = NULL; @@ -4911,7 +5643,8 @@ static void scheduler_enable_tick_timer_nolock(struct kbase_device *kbdev) return; WARN_ON((scheduler->state != SCHED_INACTIVE) && - (scheduler->state != SCHED_SUSPENDED)); + (scheduler->state != SCHED_SUSPENDED) && + (scheduler->state != SCHED_SLEEPING)); if (scheduler->total_runnable_grps > 0) { enqueue_tick_work(kbdev); @@ -4953,6 +5686,7 @@ void kbase_csf_scheduler_timer_set_enabled(struct kbase_device *kbdev, scheduler->timer_enabled = false; cancel_tick_timer(kbdev); cancel_delayed_work(&scheduler->tock_work); + scheduler->tock_pending_request = false; mutex_unlock(&scheduler->lock); /* The non-sync version to cancel the normal work item is not * available, so need to drop the lock before cancellation. @@ -4990,7 +5724,7 @@ void kbase_csf_scheduler_pm_suspend(struct kbase_device *kbdev) /* Cancel any potential queued delayed work(s) */ cancel_work_sync(&scheduler->tick_work); - cancel_delayed_work_sync(&scheduler->tock_work); + cancel_tock_work(scheduler); if (kbase_reset_gpu_prevent_and_wait(kbdev)) { dev_warn(kbdev->dev, @@ -5002,6 +5736,15 @@ void kbase_csf_scheduler_pm_suspend(struct kbase_device *kbdev) disable_gpu_idle_fw_timer(kbdev); +#ifdef KBASE_PM_RUNTIME + /* If scheduler is in sleeping state, then MCU needs to be activated + * to suspend CSGs. + */ + if (scheduler->state == SCHED_SLEEPING) { + dev_info(kbdev->dev, "Activating MCU out of sleep on system suspend"); + force_scheduler_to_exit_sleep(kbdev); + } +#endif if (scheduler->state != SCHED_SUSPENDED) { suspend_active_groups_on_powerdown(kbdev, true); dev_info(kbdev->dev, "Scheduler PM suspend"); @@ -5019,9 +5762,8 @@ void kbase_csf_scheduler_pm_resume(struct kbase_device *kbdev) struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; mutex_lock(&scheduler->lock); - - if (scheduler->total_runnable_grps > 0) { - WARN_ON(scheduler->state != SCHED_SUSPENDED); + if ((scheduler->total_runnable_grps > 0) && + (scheduler->state == SCHED_SUSPENDED)) { dev_info(kbdev->dev, "Scheduler PM resume"); scheduler_wakeup(kbdev, true); } @@ -5031,33 +5773,141 @@ KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_resume); void kbase_csf_scheduler_pm_active(struct kbase_device *kbdev) { + /* Here the lock is taken to synchronize against the runtime suspend + * callback function, which may need to wake up the MCU for suspending + * the CSGs before powering down the GPU. + */ + mutex_lock(&kbdev->csf.scheduler.lock); + scheduler_pm_active_handle_suspend(kbdev, + KBASE_PM_SUSPEND_HANDLER_NOT_POSSIBLE); + mutex_unlock(&kbdev->csf.scheduler.lock); +} +KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_active); + +void kbase_csf_scheduler_pm_idle(struct kbase_device *kbdev) +{ + /* Here the lock is taken just to maintain symmetry with + * kbase_csf_scheduler_pm_active(). 
+ */ + mutex_lock(&kbdev->csf.scheduler.lock); + scheduler_pm_idle(kbdev); + mutex_unlock(&kbdev->csf.scheduler.lock); +} +KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_idle); + +int kbase_csf_scheduler_wait_mcu_active(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; unsigned long flags; - u32 prev_count; + int err; + kbase_pm_lock(kbdev); + WARN_ON(!kbdev->pm.active_count); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - prev_count = kbdev->csf.scheduler.pm_active_count++; + WARN_ON(!scheduler->pm_active_count); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + kbase_pm_unlock(kbdev); - /* On 0 => 1, make a pm_ctx_active request */ - if (!prev_count) - kbase_pm_context_active(kbdev); - else - WARN_ON(prev_count == U32_MAX); + kbase_pm_wait_for_poweroff_work_complete(kbdev); + + err = kbase_pm_wait_for_desired_state(kbdev); + if (!err) { + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(kbdev->pm.backend.mcu_state != KBASE_MCU_ON); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } + + return err; } -KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_active); +KBASE_EXPORT_TEST_API(kbase_csf_scheduler_wait_mcu_active); -void kbase_csf_scheduler_pm_idle(struct kbase_device *kbdev) +#ifdef KBASE_PM_RUNTIME +int kbase_csf_scheduler_handle_runtime_suspend(struct kbase_device *kbdev) { + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; unsigned long flags; - u32 prev_count; + int ret; + + dev_dbg(kbdev->dev, "Handling runtime suspend"); + + kbase_reset_gpu_assert_prevented(kbdev); + lockdep_assert_held(&scheduler->lock); + WARN_ON(scheduler->pm_active_count); + + if (scheduler->state == SCHED_SUSPENDED) { + WARN_ON(kbdev->pm.backend.gpu_sleep_mode_active); + return 0; + } + + ret = suspend_active_groups_on_powerdown(kbdev, false); + + if (ret) { + dev_dbg(kbdev->dev, "Aborting runtime suspend (grps: %d)", + atomic_read(&scheduler->non_idle_offslot_grps)); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbdev->pm.backend.exit_gpu_sleep_mode = true; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + kbase_csf_scheduler_invoke_tick(kbdev); + return ret; + } + + scheduler->state = SCHED_SUSPENDED; spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - prev_count = kbdev->csf.scheduler.pm_active_count--; + kbdev->pm.backend.gpu_sleep_mode_active = false; spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - if (prev_count == 1) - kbase_pm_context_idle(kbdev); - else - WARN_ON(prev_count == 0); + wake_up_all(&kbdev->csf.event_wait); + return 0; +} + +void kbase_csf_scheduler_reval_idleness_post_sleep(struct kbase_device *kbdev) +{ + u32 csg_nr; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + WARN_ON(kbdev->pm.backend.mcu_state != KBASE_MCU_IN_SLEEP); + + for (csg_nr = 0; csg_nr < kbdev->csf.global_iface.group_num; csg_nr++) { + struct kbase_csf_cmd_stream_group_info *ginfo = + &kbdev->csf.global_iface.groups[csg_nr]; + bool csg_idle; + + if (!kbdev->csf.scheduler.csg_slots[csg_nr].resident_group) + continue; + + csg_idle = + kbase_csf_firmware_csg_output(ginfo, CSG_STATUS_STATE) & + CSG_STATUS_STATE_IDLE_MASK; + if (!csg_idle) { + dev_dbg(kbdev->dev, + "Re-activate Scheduler after MCU sleep"); + kbdev->pm.backend.exit_gpu_sleep_mode = true; + kbase_csf_scheduler_invoke_tick(kbdev); + break; + } + } +} + +void kbase_csf_scheduler_force_sleep(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + + mutex_lock(&scheduler->lock); + if 
(kbase_pm_gpu_sleep_allowed(kbdev) && + (scheduler->state == SCHED_INACTIVE)) + scheduler_sleep_on_idle(kbdev); + mutex_unlock(&scheduler->lock); +} +#endif + +void kbase_csf_scheduler_force_wakeup(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + + mutex_lock(&scheduler->lock); + scheduler_wakeup(kbdev, true); + mutex_unlock(&scheduler->lock); } -KBASE_EXPORT_TEST_API(kbase_csf_scheduler_pm_idle); diff --git a/mali_kbase/csf/mali_kbase_csf_scheduler.h b/mali_kbase/csf/mali_kbase_csf_scheduler.h index 428ecbe..73ebb66 100644 --- a/mali_kbase/csf/mali_kbase_csf_scheduler.h +++ b/mali_kbase/csf/mali_kbase_csf_scheduler.h @@ -374,7 +374,11 @@ static inline bool kbase_csf_scheduler_protected_mode_in_use( * kbase_csf_scheduler_pm_active - Perform scheduler power active operation * * Note: This function will increase the scheduler's internal pm_active_count - * value, ensuring that both GPU and MCU are powered for access. + * value, ensuring that both GPU and MCU are powered for access. The MCU may + * not have actually become active when this function returns, so need to + * call kbase_csf_scheduler_wait_mcu_active() for that. + * + * This function should not be called with global scheduler lock held. * * @kbdev: Instance of a GPU platform device that implements a CSF interface. */ @@ -384,13 +388,27 @@ void kbase_csf_scheduler_pm_active(struct kbase_device *kbdev); * kbase_csf_scheduler_pm_idle - Perform the scheduler power idle operation * * Note: This function will decrease the scheduler's internal pm_active_count - * value. On reaching 0, the MCU and GPU could be powered off. + * value. On reaching 0, the MCU and GPU could be powered off. This function + * should not be called with global scheduler lock held. * * @kbdev: Instance of a GPU platform device that implements a CSF interface. */ void kbase_csf_scheduler_pm_idle(struct kbase_device *kbdev); /** + * kbase_csf_scheduler_wait_mcu_active - Wait for the MCU to actually become active + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * + * This function will wait for the MCU to actually become active. It is supposed + * to be called after calling kbase_csf_scheduler_pm_active(). It is needed as + * kbase_csf_scheduler_pm_active() may not make the MCU active right away. + * + * Return: 0 if the MCU was successfully activated otherwise an error code. + */ +int kbase_csf_scheduler_wait_mcu_active(struct kbase_device *kbdev); + +/** * kbase_csf_scheduler_pm_resume - Reactivate the scheduler on system resume * * @kbdev: Instance of a GPU platform device that implements a CSF interface. @@ -472,6 +490,26 @@ static inline void kbase_csf_scheduler_advance_tick(struct kbase_device *kbdev) } /** + * kbase_csf_scheduler_invoke_tick() - Invoke the scheduling tick + * + * @kbdev: Pointer to the device + * + * This function will queue the scheduling tick work item for immediate + * execution if tick timer is not active. This can be called from interrupt + * context to resume the scheduling after GPU was put to sleep. 
+ */ +static inline void kbase_csf_scheduler_invoke_tick(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + unsigned long flags; + + spin_lock_irqsave(&scheduler->interrupt_lock, flags); + if (!scheduler->tick_timer_active) + queue_work(scheduler->wq, &scheduler->tick_work); + spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); +} + +/** * kbase_csf_scheduler_queue_has_trace() - report whether the queue has been * configured to operate with the * cs_trace feature. @@ -491,4 +529,97 @@ static inline bool kbase_csf_scheduler_queue_has_trace(struct kbase_queue *queue return (queue->trace_buffer_size && queue->trace_buffer_base); } +#ifdef KBASE_PM_RUNTIME +/** + * kbase_csf_scheduler_reval_idleness_post_sleep() - Check GPU's idleness after + * putting MCU to sleep state + * + * @kbdev: Pointer to the device + * + * This function re-evaluates the idleness of on-slot queue groups after MCU + * was put to the sleep state and invokes the scheduling tick if any of the + * on-slot queue group became non-idle. + * CSG_OUTPUT_BLOCK.CSG_STATUS_STATE.IDLE bit is checked to determine the + * idleness which is updated by MCU firmware on handling of the sleep request. + * + * This function is needed to detect if more work was flushed in the window + * between the GPU idle notification and the enabling of Doorbell mirror + * interrupt (from MCU state machine). Once Doorbell mirror interrupt is + * enabled, Host can receive the notification on User doorbell rings. + */ +void kbase_csf_scheduler_reval_idleness_post_sleep(struct kbase_device *kbdev); + +/** + * kbase_csf_scheduler_handle_runtime_suspend() - Handle runtime suspend by + * suspending CSGs. + * + * @kbdev: Pointer to the device + * + * This function is called from the runtime suspend callback function for + * suspending all the on-slot queue groups. If any of the group is found to + * be non-idle after the completion of CSG suspend operation or the CSG + * suspend operation times out, then the scheduling tick is invoked and an + * error is returned so that the GPU power down can be aborted. + * + * Return: 0 if all the CSGs were suspended, otherwise an error code. + */ +int kbase_csf_scheduler_handle_runtime_suspend(struct kbase_device *kbdev); +#endif + +/** + * kbase_csf_scheduler_get_nr_active_csgs() - Get the number of active CSGs + * + * @kbdev: Pointer to the device + * + * This function calculates the number of CSG slots that have a queue group + * resident on them. + * + * Note: This function should not be used if the interrupt_lock is held. Use + * kbase_csf_scheduler_get_nr_active_csgs_locked() instead. + * + * Return: number of active CSGs. + */ +u32 kbase_csf_scheduler_get_nr_active_csgs(struct kbase_device *kbdev); + +/** + * kbase_csf_scheduler_get_nr_active_csgs_locked() - Get the number of active + * CSGs + * + * @kbdev: Pointer to the device + * + * This function calculates the number of CSG slots that have a queue group + * resident on them. + * + * Note: This function should be called with interrupt_lock held. + * + * Return: number of active CSGs. + */ +u32 kbase_csf_scheduler_get_nr_active_csgs_locked(struct kbase_device *kbdev); + +/** + * kbase_csf_scheduler_force_wakeup() - Forcefully resume the scheduling of CSGs + * + * @kbdev: Pointer to the device + * + * This function is called to forcefully resume the scheduling of CSGs, even + * when there wasn't any work submitted for them. + * This function is only used for testing purpose. 
+ */ +void kbase_csf_scheduler_force_wakeup(struct kbase_device *kbdev); + +#ifdef KBASE_PM_RUNTIME +/** + * kbase_csf_scheduler_force_sleep() - Forcefully put the Scheduler to sleeping + * state. + * + * @kbdev: Pointer to the device + * + * This function is called to forcefully put the Scheduler to sleeping state + * and trigger the sleep of MCU. If the CSGs are not idle, then the Scheduler + * would get reactivated again immediately. + * This function is only used for testing purpose. + */ +void kbase_csf_scheduler_force_sleep(struct kbase_device *kbdev); +#endif + #endif /* _KBASE_CSF_SCHEDULER_H_ */ diff --git a/mali_kbase/csf/mali_kbase_csf_tiler_heap.c b/mali_kbase/csf/mali_kbase_csf_tiler_heap.c index 8ecf235..06a7824 100644 --- a/mali_kbase/csf/mali_kbase_csf_tiler_heap.c +++ b/mali_kbase/csf/mali_kbase_csf_tiler_heap.c @@ -66,8 +66,6 @@ static u64 encode_chunk_ptr(u32 const chunk_size, u64 const chunk_addr) static struct kbase_csf_tiler_heap_chunk *get_last_chunk( struct kbase_csf_tiler_heap *const heap) { - lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock); - if (list_empty(&heap->chunks_list)) return NULL; @@ -176,7 +174,7 @@ static int init_chunk(struct kbase_csf_tiler_heap *const heap, * Return: 0 if successful or a negative error code on failure. */ static int create_chunk(struct kbase_csf_tiler_heap *const heap, - bool link_with_prev) + bool link_with_prev) { int err = 0; struct kbase_context *const kctx = heap->kctx; @@ -186,14 +184,17 @@ static int create_chunk(struct kbase_csf_tiler_heap *const heap, BASE_MEM_COHERENT_LOCAL; struct kbase_csf_tiler_heap_chunk *chunk = NULL; - flags |= base_mem_group_id_set(kctx->jit_group_id); + /* Calls to this function are inherently synchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; + + flags |= kbase_mem_group_id_set(kctx->jit_group_id); #if defined(CONFIG_MALI_DEBUG) || defined(CONFIG_MALI_VECTOR_DUMP) flags |= BASE_MEM_PROT_CPU_RD; #endif - lockdep_assert_held(&kctx->csf.tiler_heaps.lock); - chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); if (unlikely(!chunk)) { dev_err(kctx->kbdev->dev, @@ -203,8 +204,8 @@ static int create_chunk(struct kbase_csf_tiler_heap *const heap, /* Allocate GPU memory for the new chunk. 
*/ INIT_LIST_HEAD(&chunk->link); - chunk->region = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, - &flags, &chunk->gpu_va); + chunk->region = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, + &chunk->gpu_va, mmu_sync_info); if (unlikely(!chunk->region)) { dev_err(kctx->kbdev->dev, @@ -251,8 +252,6 @@ static void delete_chunk(struct kbase_csf_tiler_heap *const heap, { struct kbase_context *const kctx = heap->kctx; - lockdep_assert_held(&kctx->csf.tiler_heaps.lock); - kbase_gpu_vm_lock(kctx); chunk->region->flags &= ~KBASE_REG_NO_USER_FREE; kbase_mem_free_region(kctx, chunk->region); @@ -273,9 +272,6 @@ static void delete_chunk(struct kbase_csf_tiler_heap *const heap, static void delete_all_chunks(struct kbase_csf_tiler_heap *heap) { struct list_head *entry = NULL, *tmp = NULL; - struct kbase_context *const kctx = heap->kctx; - - lockdep_assert_held(&kctx->csf.tiler_heaps.lock); list_for_each_safe(entry, tmp, &heap->chunks_list) { struct kbase_csf_tiler_heap_chunk *chunk = list_entry( @@ -429,6 +425,9 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, "Creating a tiler heap with %u chunks (limit: %u) of size %u\n", initial_chunks, max_chunks, chunk_size); + if (!kbase_mem_allow_alloc(kctx)) + return -EINVAL; + if (chunk_size == 0) return -EINVAL; @@ -459,11 +458,9 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, heap->gpu_va = kbase_csf_heap_context_allocator_alloc(ctx_alloc); - mutex_lock(&kctx->csf.tiler_heaps.lock); - if (unlikely(!heap->gpu_va)) { - dev_err(kctx->kbdev->dev, - "Failed to allocate a tiler heap context\n"); + dev_dbg(kctx->kbdev->dev, + "Failed to allocate a tiler heap context"); err = -ENOMEM; } else { err = create_initial_chunks(heap, initial_chunks); @@ -480,13 +477,14 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, list_first_entry(&heap->chunks_list, struct kbase_csf_tiler_heap_chunk, link); + *heap_gpu_va = heap->gpu_va; + *first_chunk_va = first_chunk->gpu_va; + + mutex_lock(&kctx->csf.tiler_heaps.lock); kctx->csf.tiler_heaps.nr_of_heaps++; heap->heap_id = kctx->csf.tiler_heaps.nr_of_heaps; list_add(&heap->link, &kctx->csf.tiler_heaps.list); - *heap_gpu_va = heap->gpu_va; - *first_chunk_va = first_chunk->gpu_va; - KBASE_TLSTREAM_AUX_TILER_HEAP_STATS( kctx->kbdev, kctx->id, heap->heap_id, PFN_UP(heap->chunk_size * heap->max_chunks), @@ -496,10 +494,9 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, dev_dbg(kctx->kbdev->dev, "Created tiler heap 0x%llX\n", heap->gpu_va); + mutex_unlock(&kctx->csf.tiler_heaps.lock); } - mutex_unlock(&kctx->csf.tiler_heaps.lock); - return err; } diff --git a/mali_kbase/csf/mali_kbase_csf_timeout.c b/mali_kbase/csf/mali_kbase_csf_timeout.c index 4d93fe5..f52cbab 100644 --- a/mali_kbase/csf/mali_kbase_csf_timeout.c +++ b/mali_kbase/csf/mali_kbase_csf_timeout.c @@ -100,7 +100,7 @@ static ssize_t progress_timeout_store(struct device * const dev, if (!err) { kbase_csf_scheduler_pm_active(kbdev); - err = kbase_pm_wait_for_desired_state(kbdev); + err = kbase_csf_scheduler_wait_mcu_active(kbdev); if (!err) err = kbase_csf_firmware_set_timeout(kbdev, timeout); diff --git a/mali_kbase/csf/mali_kbase_csf_tl_reader.c b/mali_kbase/csf/mali_kbase_csf_tl_reader.c index 1824c2d..563faec 100644 --- a/mali_kbase/csf/mali_kbase_csf_tl_reader.c +++ b/mali_kbase/csf/mali_kbase_csf_tl_reader.c @@ -171,13 +171,12 @@ static int kbase_ts_converter_init( * * Return: The CPU timestamp. 
*/ -static void kbase_ts_converter_convert( - const struct kbase_ts_converter *self, - u64 *gpu_ts) +void kbase_ts_converter_convert(const struct kbase_ts_converter *self, + u64 *gpu_ts) { u64 old_gpu_ts = *gpu_ts; - *gpu_ts = div64_u64(old_gpu_ts * self->multiplier, - self->divisor) + self->offset; + *gpu_ts = div64_u64(old_gpu_ts * self->multiplier, self->divisor) + + self->offset; } /** @@ -256,6 +255,7 @@ static void tl_reader_reset(struct kbase_csf_tl_reader *self) self->tl_header.btc = 0; } + int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self) { int ret = 0; @@ -280,6 +280,7 @@ int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self) return -EBUSY; } + /* Copying the whole buffer in a single shot. We assume * that the buffer will not contain partially written messages. */ @@ -330,9 +331,8 @@ int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self) { struct kbase_csffw_tl_message *msg = (struct kbase_csffw_tl_message *) csffw_data_it; - kbase_ts_converter_convert( - &self->ts_converter, - &msg->timestamp); + kbase_ts_converter_convert(&self->ts_converter, + &msg->timestamp); } /* Copy the message out to the tl_stream. */ diff --git a/mali_kbase/csf/mali_kbase_csf_tl_reader.h b/mali_kbase/csf/mali_kbase_csf_tl_reader.h index 1b0fcd7..891a8f3 100644 --- a/mali_kbase/csf/mali_kbase_csf_tl_reader.h +++ b/mali_kbase/csf/mali_kbase_csf_tl_reader.h @@ -43,9 +43,9 @@ struct kbase_device; * struct kbase_ts_converter - * System timestamp to CPU timestamp converter state. * - * @multiplier: Numerator of the converter's fraction. - * @divisor: Denominator of the converter's fraction. - * @offset: Converter's offset term. + * @multiplier: Numerator of the converter's fraction. + * @divisor: Denominator of the converter's fraction. + * @offset: Converter's offset term. 
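The conversion above is a straight linear mapping, cpu_ts = gpu_ts * multiplier / divisor + offset, carried out in 64-bit integer arithmetic. A standalone sketch with made-up constants (the real multiplier, divisor and offset come from the converter's init, not shown here); as in the driver, the product is assumed not to overflow 64 bits:

#include <stdint.h>
#include <stdio.h>

static uint64_t example_gpu_to_cpu_ts(uint64_t gpu_ts, uint64_t multiplier,
                                       uint64_t divisor, uint64_t offset)
{
        return (gpu_ts * multiplier) / divisor + offset;
}

int main(void)
{
        /* e.g. a 26 MHz GPU timer converted to nanoseconds: scale by 1000/26 */
        uint64_t cpu_ts = example_gpu_to_cpu_ts(2600, 1000, 26, 10);

        printf("%llu\n", (unsigned long long)cpu_ts); /* 100010 ns */
        return 0;
}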
* * According to Generic timer spec, system timer: * - Increments at a fixed frequency diff --git a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_jm.h b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_jm.h index f419f70..6ba98b7 100644 --- a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_jm.h +++ b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_codes_jm.h @@ -56,6 +56,14 @@ int dummy_array[] = { */ /* info_val==exit code; gpu_addr==chain gpuaddr */ KBASE_KTRACE_CODE_MAKE_CODE(JM_JOB_DONE), + /* gpu_addr==JS_HEAD read + * info_val==event code + */ + KBASE_KTRACE_CODE_MAKE_CODE(JM_RETURN_ATOM_TO_JS), + /* gpu_addr==JS_HEAD read + * info_val==event code + */ + KBASE_KTRACE_CODE_MAKE_CODE(JM_MARK_FOR_RETURN_TO_JS), /* gpu_addr==JS_HEAD_NEXT written, info_val==lower 32 bits of * affinity */ @@ -120,6 +128,13 @@ int dummy_array[] = { KBASE_KTRACE_CODE_MAKE_CODE(JS_ADD_JOB), /* gpu_addr==last value written/would be written to JS_HEAD */ KBASE_KTRACE_CODE_MAKE_CODE(JS_REMOVE_JOB), + /* gpu_addr==value to write into JS_HEAD + * info_val==priority of atom as a KBASE_JS_ATOM_SCHED_PRIO_<...> value + * (0 highest) + */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_PULL_JOB), + /* gpu_addr==value that would be written to JS_HEAD if run again */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_UNPULL_JOB), KBASE_KTRACE_CODE_MAKE_CODE(JS_TRY_SCHEDULE_HEAD_CTX), /* gpu_addr==value to write into JS_HEAD */ KBASE_KTRACE_CODE_MAKE_CODE(JS_JOB_DONE_TRY_RUN_NEXT_JOB), @@ -146,6 +161,25 @@ int dummy_array[] = { KBASE_KTRACE_CODE_MAKE_CODE(JS_CTX_ATTR_NOW_OFF_CTX), /* info_val == the ctx attribute now off runpool */ KBASE_KTRACE_CODE_MAKE_CODE(JS_CTX_ATTR_NOW_OFF_RUNPOOL), + /* gpu_addr==value to write into JS_HEAD */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_RETURN_WORKER), + /* gpu_addr==value to write into JS_HEAD */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_RETURN_WORKER_END), + /* info_val==priority level blocked (0 highest) */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_SLOT_PRIO_BLOCKED), + /* info_val==priority level unblocked (0 highest) + * note that the priority level may still be blocked on higher levels + */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_SLOT_PRIO_UNBLOCKED), + /* gpu_addr==value to write into JS_HEAD + * info_val==priority level unblocked - priorities at this and higher + * are unblocked (0 highest) + */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_SLOT_PRIO_AND_HIGHER_UNBLOCKED), + /* gpu_addr==value to write into JS_HEAD + * info_val==priority level blocked (0 highest) + */ + KBASE_KTRACE_CODE_MAKE_CODE(JS_SLOT_PRIO_IS_BLOCKED), /* * Scheduler Policy events */ diff --git a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_defs_jm.h b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_defs_jm.h index c01f930..efa8ab0 100644 --- a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_defs_jm.h +++ b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_defs_jm.h @@ -45,9 +45,12 @@ * * ftrace backend now outputs kctx field (as %d_%u format). 
* + * 2.2: + * Add tracing codes for pulling, unpulling, and returns atoms to JS for + * diagnosing soft-stop path and preemption problems */ #define KBASE_KTRACE_VERSION_MAJOR 2 -#define KBASE_KTRACE_VERSION_MINOR 1 +#define KBASE_KTRACE_VERSION_MINOR 2 #endif /* KBASE_KTRACE_TARGET_RBUF */ /* diff --git a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.c b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.c index fed9c1f..05d1677 100644 --- a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.c +++ b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.c @@ -71,10 +71,11 @@ void kbasep_ktrace_backend_format_msg(struct kbase_ktrace_msg *trace_msg, } void kbasep_ktrace_add_jm(struct kbase_device *kbdev, - enum kbase_ktrace_code code, struct kbase_context *kctx, - struct kbase_jd_atom *katom, u64 gpu_addr, - kbase_ktrace_flag_t flags, int refcount, int jobslot, - u64 info_val) + enum kbase_ktrace_code code, + struct kbase_context *kctx, + const struct kbase_jd_atom *katom, u64 gpu_addr, + kbase_ktrace_flag_t flags, int refcount, int jobslot, + u64 info_val) { unsigned long irqflags; struct kbase_ktrace_msg *trace_msg; diff --git a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.h b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.h index 8b09d05..ffae8d4 100644 --- a/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.h +++ b/mali_kbase/debug/backend/mali_kbase_debug_ktrace_jm.h @@ -41,10 +41,11 @@ * PRIVATE: do not use directly. Use KBASE_KTRACE_ADD_JM() instead. */ void kbasep_ktrace_add_jm(struct kbase_device *kbdev, - enum kbase_ktrace_code code, struct kbase_context *kctx, - struct kbase_jd_atom *katom, u64 gpu_addr, - kbase_ktrace_flag_t flags, int refcount, int jobslot, - u64 info_val); + enum kbase_ktrace_code code, + struct kbase_context *kctx, + const struct kbase_jd_atom *katom, u64 gpu_addr, + kbase_ktrace_flag_t flags, int refcount, int jobslot, + u64 info_val); #define KBASE_KTRACE_RBUF_ADD_JM(kbdev, code, kctx, katom, gpu_addr, flags, \ refcount, jobslot, info_val) \ diff --git a/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_jm.h b/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_jm.h index 2e88e69..8fa4e2a 100644 --- a/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_jm.h +++ b/mali_kbase/debug/backend/mali_kbase_debug_linux_ktrace_jm.h @@ -50,6 +50,8 @@ DECLARE_EVENT_CLASS(mali_jm_slot_template, DEFINE_EVENT(mali_jm_slot_template, mali_##name, \ TP_PROTO(struct kbase_context *kctx, int jobslot, u64 info_val), \ TP_ARGS(kctx, jobslot, info_val)) +DEFINE_MALI_JM_SLOT_EVENT(JM_RETURN_ATOM_TO_JS); +DEFINE_MALI_JM_SLOT_EVENT(JM_MARK_FOR_RETURN_TO_JS); DEFINE_MALI_JM_SLOT_EVENT(JM_SUBMIT); DEFINE_MALI_JM_SLOT_EVENT(JM_JOB_DONE); DEFINE_MALI_JM_SLOT_EVENT(JM_UPDATE_HEAD); @@ -68,6 +70,7 @@ DEFINE_MALI_JM_SLOT_EVENT(JS_CORE_REF_REGISTER_ON_RECHECK_FAILED); DEFINE_MALI_JM_SLOT_EVENT(JS_AFFINITY_SUBMIT_TO_BLOCKED); DEFINE_MALI_JM_SLOT_EVENT(JS_AFFINITY_CURRENT); DEFINE_MALI_JM_SLOT_EVENT(JD_DONE_TRY_RUN_NEXT_JOB); +DEFINE_MALI_JM_SLOT_EVENT(JS_PULL_JOB); DEFINE_MALI_JM_SLOT_EVENT(JS_CORE_REF_REQUEST_CORES_FAILED); DEFINE_MALI_JM_SLOT_EVENT(JS_CORE_REF_REGISTER_INUSE_FAILED); DEFINE_MALI_JM_SLOT_EVENT(JS_CORE_REF_REQUEST_ON_RECHECK_FAILED); @@ -76,6 +79,10 @@ DEFINE_MALI_JM_SLOT_EVENT(JS_JOB_DONE_TRY_RUN_NEXT_JOB); DEFINE_MALI_JM_SLOT_EVENT(JS_JOB_DONE_RETRY_NEEDED); DEFINE_MALI_JM_SLOT_EVENT(JS_POLICY_DEQUEUE_JOB); DEFINE_MALI_JM_SLOT_EVENT(JS_POLICY_DEQUEUE_JOB_IRQ); +DEFINE_MALI_JM_SLOT_EVENT(JS_SLOT_PRIO_BLOCKED); 
+DEFINE_MALI_JM_SLOT_EVENT(JS_SLOT_PRIO_UNBLOCKED); +DEFINE_MALI_JM_SLOT_EVENT(JS_SLOT_PRIO_AND_HIGHER_UNBLOCKED); +DEFINE_MALI_JM_SLOT_EVENT(JS_SLOT_PRIO_IS_BLOCKED); #undef DEFINE_MALI_JM_SLOT_EVENT DECLARE_EVENT_CLASS(mali_jm_refcount_template, @@ -152,10 +159,13 @@ DEFINE_MALI_JM_ADD_EVENT(JM_ZAP_SCHEDULED); DEFINE_MALI_JM_ADD_EVENT(JM_ZAP_DONE); DEFINE_MALI_JM_ADD_EVENT(JM_SUBMIT_AFTER_RESET); DEFINE_MALI_JM_ADD_EVENT(JM_JOB_COMPLETE); +DEFINE_MALI_JM_ADD_EVENT(JS_UNPULL_JOB); DEFINE_MALI_JM_ADD_EVENT(JS_CTX_ATTR_NOW_ON_RUNPOOL); DEFINE_MALI_JM_ADD_EVENT(JS_CTX_ATTR_NOW_OFF_RUNPOOL); DEFINE_MALI_JM_ADD_EVENT(JS_CTX_ATTR_NOW_ON_CTX); DEFINE_MALI_JM_ADD_EVENT(JS_CTX_ATTR_NOW_OFF_CTX); +DEFINE_MALI_JM_ADD_EVENT(JS_RETURN_WORKER); +DEFINE_MALI_JM_ADD_EVENT(JS_RETURN_WORKER_END); DEFINE_MALI_JM_ADD_EVENT(JS_POLICY_TIMER_END); DEFINE_MALI_JM_ADD_EVENT(JS_POLICY_TIMER_START); DEFINE_MALI_JM_ADD_EVENT(JS_POLICY_ENQUEUE_JOB); diff --git a/mali_kbase/debug/mali_kbase_debug_ktrace_codes.h b/mali_kbase/debug/mali_kbase_debug_ktrace_codes.h index 3309834..1c6b4cd 100644 --- a/mali_kbase/debug/mali_kbase_debug_ktrace_codes.h +++ b/mali_kbase/debug/mali_kbase_debug_ktrace_codes.h @@ -138,6 +138,10 @@ int dummy_array[] = { /* info_val == policy number */ KBASE_KTRACE_CODE_MAKE_CODE(PM_CURRENT_POLICY_TERM), + KBASE_KTRACE_CODE_MAKE_CODE(PM_POWEROFF_WAIT_WQ), + KBASE_KTRACE_CODE_MAKE_CODE(PM_RUNTIME_SUSPEND_CALLBACK), + KBASE_KTRACE_CODE_MAKE_CODE(PM_RUNTIME_RESUME_CALLBACK), + /* * Context Scheduler events */ diff --git a/mali_kbase/debug/mali_kbase_debug_linux_ktrace.h b/mali_kbase/debug/mali_kbase_debug_linux_ktrace.h index b56dec4..5fac763 100644 --- a/mali_kbase/debug/mali_kbase_debug_linux_ktrace.h +++ b/mali_kbase/debug/mali_kbase_debug_linux_ktrace.h @@ -95,6 +95,9 @@ DEFINE_MALI_ADD_EVENT(PM_CA_SET_POLICY); DEFINE_MALI_ADD_EVENT(PM_CONTEXT_ACTIVE); DEFINE_MALI_ADD_EVENT(PM_CONTEXT_IDLE); DEFINE_MALI_ADD_EVENT(PM_WAKE_WAITERS); +DEFINE_MALI_ADD_EVENT(PM_POWEROFF_WAIT_WQ); +DEFINE_MALI_ADD_EVENT(PM_RUNTIME_SUSPEND_CALLBACK); +DEFINE_MALI_ADD_EVENT(PM_RUNTIME_RESUME_CALLBACK); DEFINE_MALI_ADD_EVENT(SCHED_RETAIN_CTX_NOLOCK); DEFINE_MALI_ADD_EVENT(SCHED_RELEASE_CTX); #ifdef CONFIG_MALI_ARBITER_SUPPORT diff --git a/mali_kbase/device/backend/mali_kbase_device_csf.c b/mali_kbase/device/backend/mali_kbase_device_csf.c index 0c5052b..7b37a96 100644 --- a/mali_kbase/device/backend/mali_kbase_device_csf.c +++ b/mali_kbase/device/backend/mali_kbase_device_csf.c @@ -37,6 +37,7 @@ #include <backend/gpu/mali_kbase_clk_rate_trace_mgr.h> #include <csf/mali_kbase_csf_csg_debugfs.h> #include <mali_kbase_hwcnt_virtualizer.h> +#include <mali_kbase_kinstr_prfcnt.h> #include <mali_kbase_vinstr.h> /** @@ -51,6 +52,7 @@ static void kbase_device_firmware_hwcnt_term(struct kbase_device *kbdev) { if (kbdev->csf.firmware_inited) { + kbase_kinstr_prfcnt_term(kbdev->kinstr_prfcnt_ctx); kbase_vinstr_term(kbdev->vinstr_ctx); kbase_hwcnt_virtualizer_term(kbdev->hwcnt_gpu_virt); kbase_hwcnt_backend_csf_metadata_term(&kbdev->hwcnt_gpu_iface); @@ -266,6 +268,8 @@ static const struct kbase_device_init dev_init[] = { "Timeline stream initialization failed" }, { kbase_clk_rate_trace_manager_init, kbase_clk_rate_trace_manager_term, "Clock rate trace manager initialization failed" }, + { kbase_lowest_gpu_freq_init, NULL, + "Lowest freq initialization failed" }, { kbase_device_hwcnt_backend_csf_if_init, kbase_device_hwcnt_backend_csf_if_term, "GPU hwcnt backend CSF interface creation failed" }, @@ -390,8 +394,19 @@ static int 
kbase_device_hwcnt_csf_deferred_init(struct kbase_device *kbdev) goto vinstr_fail; } + ret = kbase_kinstr_prfcnt_init(kbdev->hwcnt_gpu_virt, + &kbdev->kinstr_prfcnt_ctx); + if (ret) { + dev_err(kbdev->dev, + "Performance counter instrumentation initialization failed"); + goto kinstr_prfcnt_fail; + } + return ret; +kinstr_prfcnt_fail: + kbase_vinstr_term(kbdev->vinstr_ctx); + vinstr_fail: kbase_hwcnt_virtualizer_term(kbdev->hwcnt_gpu_virt); @@ -418,8 +433,6 @@ static int kbase_csf_firmware_deferred_init(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->fw_load_lock); - kbase_pm_context_active(kbdev); - err = kbase_csf_firmware_init(kbdev); if (!err) { unsigned long flags; @@ -432,8 +445,6 @@ static int kbase_csf_firmware_deferred_init(struct kbase_device *kbdev) dev_err(kbdev->dev, "Firmware initialization failed"); } - kbase_pm_context_idle(kbdev); - return err; } @@ -444,6 +455,8 @@ int kbase_device_firmware_init_once(struct kbase_device *kbdev) mutex_lock(&kbdev->fw_load_lock); if (!kbdev->csf.firmware_inited) { + kbase_pm_context_active(kbdev); + ret = kbase_csf_firmware_deferred_init(kbdev); if (ret) goto out; @@ -455,9 +468,10 @@ int kbase_device_firmware_init_once(struct kbase_device *kbdev) } kbase_csf_debugfs_init(kbdev); +out: + kbase_pm_context_idle(kbdev); } -out: mutex_unlock(&kbdev->fw_load_lock); return ret; diff --git a/mali_kbase/device/backend/mali_kbase_device_hw_csf.c b/mali_kbase/device/backend/mali_kbase_device_hw_csf.c index 8427edb..ae6dc1b 100644 --- a/mali_kbase/device/backend/mali_kbase_device_hw_csf.c +++ b/mali_kbase/device/backend/mali_kbase_device_hw_csf.c @@ -80,6 +80,7 @@ static void kbase_gpu_fault_interrupt(struct kbase_device *kbdev) } } else kbase_report_gpu_fault(kbdev, status, as_nr, as_valid); + } void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) @@ -124,6 +125,9 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) if (kbase_prepare_to_reset_gpu( kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); + + /* Defer the clearing to the GPU reset sequence */ + val &= ~GPU_PROTECTED_FAULT; } if (val & RESET_COMPLETED) @@ -132,6 +136,20 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) KBASE_KTRACE_ADD(kbdev, CORE_GPU_IRQ_CLEAR, NULL, val); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_CLEAR), val); +#ifdef KBASE_PM_RUNTIME + if (val & DOORBELL_MIRROR) { + unsigned long flags; + + dev_dbg(kbdev->dev, "Doorbell mirror interrupt received"); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(!kbase_csf_scheduler_get_nr_active_csgs(kbdev)); + kbase_pm_disable_db_mirror_interrupt(kbdev); + kbdev->pm.backend.exit_gpu_sleep_mode = true; + kbase_csf_scheduler_invoke_tick(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } +#endif + /* kbase_pm_check_transitions (called by kbase_pm_power_changed) must * be called after the IRQ has been cleared. 
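The firmware-init hunks above move the PM context active/idle pair out of kbase_csf_firmware_deferred_init() and into kbase_device_firmware_init_once(), so a single reference now spans the firmware load, the deferred hwcnt setup and every error path. Roughly the following shape, with hypothetical helpers rather than the driver's exact code:

static int example_firmware_init_once(struct example_dev *dev)
{
        int ret = 0;

        mutex_lock(&dev->fw_load_lock);
        if (!dev->firmware_inited) {
                /* Keep the GPU powered for everything done under the lock. */
                example_pm_context_active(dev);

                ret = example_firmware_load(dev);
                if (ret)
                        goto out;

                ret = example_hwcnt_deferred_init(dev);
                if (ret)
                        goto out;

                dev->firmware_inited = true;
out:
                /* Balanced on success and on every error path. */
                example_pm_context_idle(dev);
        }
        mutex_unlock(&dev->fw_load_lock);

        return ret;
}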
This is because it might * trigger further power transitions and we don't want to miss the @@ -160,3 +178,60 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) KBASE_KTRACE_ADD(kbdev, CORE_GPU_IRQ_DONE, NULL, val); } + +#if !IS_ENABLED(CONFIG_MALI_NO_MALI) +static bool kbase_is_register_accessible(u32 offset) +{ +#ifdef CONFIG_MALI_DEBUG + if (((offset >= MCU_SUBSYSTEM_BASE) && (offset < IPA_CONTROL_BASE)) || + ((offset >= GPU_CONTROL_MCU_BASE) && (offset < USER_BASE))) { + WARN(1, "Invalid register offset 0x%x", offset); + return false; + } +#endif + + return true; +} + +void kbase_reg_write(struct kbase_device *kbdev, u32 offset, u32 value) +{ + KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); + KBASE_DEBUG_ASSERT(kbdev->dev != NULL); + + if (!kbase_is_register_accessible(offset)) + return; + + writel(value, kbdev->reg + offset); + +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(kbdev->io_history.enabled)) + kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, + value, 1); +#endif /* CONFIG_DEBUG_FS */ + dev_dbg(kbdev->dev, "w: reg %08x val %08x", offset, value); +} +KBASE_EXPORT_TEST_API(kbase_reg_write); + +u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset) +{ + u32 val; + + KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); + KBASE_DEBUG_ASSERT(kbdev->dev != NULL); + + if (!kbase_is_register_accessible(offset)) + return 0; + + val = readl(kbdev->reg + offset); + +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(kbdev->io_history.enabled)) + kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, + val, 0); +#endif /* CONFIG_DEBUG_FS */ + dev_dbg(kbdev->dev, "r: reg %08x val %08x", offset, val); + + return val; +} +KBASE_EXPORT_TEST_API(kbase_reg_read); +#endif /* !IS_ENABLED(CONFIG_MALI_NO_MALI) */ diff --git a/mali_kbase/device/backend/mali_kbase_device_hw_jm.c b/mali_kbase/device/backend/mali_kbase_device_hw_jm.c index c4e6eb8..e8f8953 100644 --- a/mali_kbase/device/backend/mali_kbase_device_hw_jm.c +++ b/mali_kbase/device/backend/mali_kbase_device_hw_jm.c @@ -51,6 +51,7 @@ static void kbase_report_gpu_fault(struct kbase_device *kbdev, int multiple) address); if (multiple) dev_warn(kbdev->dev, "There were multiple GPU faults - some have not been reported\n"); + } void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) @@ -96,3 +97,41 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) KBASE_KTRACE_ADD(kbdev, CORE_GPU_IRQ_DONE, NULL, val); } + +#if !IS_ENABLED(CONFIG_MALI_NO_MALI) +void kbase_reg_write(struct kbase_device *kbdev, u32 offset, u32 value) +{ + KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); + KBASE_DEBUG_ASSERT(kbdev->dev != NULL); + + writel(value, kbdev->reg + offset); + +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(kbdev->io_history.enabled)) + kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, + value, 1); +#endif /* CONFIG_DEBUG_FS */ + dev_dbg(kbdev->dev, "w: reg %08x val %08x", offset, value); +} +KBASE_EXPORT_TEST_API(kbase_reg_write); + +u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset) +{ + u32 val; + + KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); + KBASE_DEBUG_ASSERT(kbdev->dev != NULL); + + val = readl(kbdev->reg + offset); + +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(kbdev->io_history.enabled)) + kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, + val, 0); +#endif /* CONFIG_DEBUG_FS */ + dev_dbg(kbdev->dev, "r: reg %08x val %08x", offset, val); + + return val; +} +KBASE_EXPORT_TEST_API(kbase_reg_read); +#endif /* !IS_ENABLED(CONFIG_MALI_NO_MALI) */ diff 
--git a/mali_kbase/device/backend/mali_kbase_device_jm.c b/mali_kbase/device/backend/mali_kbase_device_jm.c index 6a6ab60..7288e8e 100644 --- a/mali_kbase/device/backend/mali_kbase_device_jm.c +++ b/mali_kbase/device/backend/mali_kbase_device_jm.c @@ -185,6 +185,8 @@ static const struct kbase_device_init dev_init[] = { "Timeline stream initialization failed" }, { kbase_clk_rate_trace_manager_init, kbase_clk_rate_trace_manager_term, "Clock rate trace manager initialization failed" }, + { kbase_lowest_gpu_freq_init, NULL, + "Lowest freq initialization failed" }, { kbase_instr_backend_init, kbase_instr_backend_term, "Instrumentation backend initialization failed" }, { kbase_device_hwcnt_backend_jm_init, @@ -197,6 +199,8 @@ static const struct kbase_device_init dev_init[] = { "GPU hwcnt virtualizer initialization failed" }, { kbase_device_vinstr_init, kbase_device_vinstr_term, "Virtual instrumentation initialization failed" }, + { kbase_device_kinstr_prfcnt_init, kbase_device_kinstr_prfcnt_term, + "Performance counter instrumentation initialization failed" }, { kbase_backend_late_init, kbase_backend_late_term, "Late backend initialization failed" }, #ifdef MALI_KBASE_BUILD diff --git a/mali_kbase/device/mali_kbase_device.c b/mali_kbase/device/mali_kbase_device.c index 0f992c3..518aaf9 100644 --- a/mali_kbase/device/mali_kbase_device.c +++ b/mali_kbase/device/mali_kbase_device.c @@ -40,6 +40,7 @@ #include <linux/priority_control_manager.h> #include <tl/mali_kbase_timeline.h> +#include "mali_kbase_kinstr_prfcnt.h" #include "mali_kbase_vinstr.h" #include "mali_kbase_hwcnt_context.h" #include "mali_kbase_hwcnt_virtualizer.h" @@ -49,6 +50,7 @@ #include "backend/gpu/mali_kbase_pm_internal.h" #include "backend/gpu/mali_kbase_irq_internal.h" #include "mali_kbase_regs_history_debugfs.h" +#include "mali_kbase_pbha.h" #ifdef CONFIG_MALI_ARBITER_SUPPORT #include "arbiter/mali_kbase_arbiter_pm.h" @@ -273,6 +275,14 @@ int kbase_device_misc_init(struct kbase_device * const kbdev) if (err) goto dma_set_mask_failed; + /* There is no limit for Mali, so set to max. We only do this if dma_parms + * is already allocated by the platform. 
+ */ + if (kbdev->dev->dma_parms) + err = dma_set_max_seg_size(kbdev->dev, UINT_MAX); + if (err) + goto dma_set_mask_failed; + kbdev->nr_hw_address_spaces = kbdev->gpu_props.num_address_spaces; err = kbase_device_all_as_init(kbdev); @@ -282,6 +292,9 @@ int kbase_device_misc_init(struct kbase_device * const kbdev) err = kbase_ktrace_init(kbdev); if (err) goto term_as; + err = kbase_pbha_read_dtb(kbdev); + if (err) + goto term_ktrace; init_waitqueue_head(&kbdev->cache_clean_wait); @@ -309,6 +322,8 @@ int kbase_device_misc_init(struct kbase_device * const kbdev) } return 0; +term_ktrace: + kbase_ktrace_term(kbdev); term_as: kbase_device_all_as_term(kbdev); dma_set_mask_failed: @@ -395,6 +410,17 @@ void kbase_device_vinstr_term(struct kbase_device *kbdev) kbase_vinstr_term(kbdev->vinstr_ctx); } +int kbase_device_kinstr_prfcnt_init(struct kbase_device *kbdev) +{ + return kbase_kinstr_prfcnt_init(kbdev->hwcnt_gpu_virt, + &kbdev->kinstr_prfcnt_ctx); +} + +void kbase_device_kinstr_prfcnt_term(struct kbase_device *kbdev) +{ + kbase_kinstr_prfcnt_term(kbdev->kinstr_prfcnt_ctx); +} + int kbase_device_io_history_init(struct kbase_device *kbdev) { return kbase_io_history_init(&kbdev->io_history, @@ -461,6 +487,11 @@ int kbase_device_early_init(struct kbase_device *kbdev) if (err) goto fail_runtime_pm; + /* This spinlock is initialized before doing the first access to GPU + * registers and installing interrupt handlers. + */ + spin_lock_init(&kbdev->hwaccess_lock); + /* Ensure we can access the GPU registers */ kbase_pm_register_access_enable(kbdev); @@ -470,10 +501,6 @@ int kbase_device_early_init(struct kbase_device *kbdev) /* We're done accessing the GPU registers for now. */ kbase_pm_register_access_disable(kbdev); - /* This spinlock has to be initialized before installing interrupt - * handlers that require to hold it to process interrupts. 
- */ - spin_lock_init(&kbdev->hwaccess_lock); #ifdef CONFIG_MALI_ARBITER_SUPPORT if (kbdev->arb.arb_if) err = kbase_arbiter_pm_install_interrupts(kbdev); diff --git a/mali_kbase/device/mali_kbase_device_hw.c b/mali_kbase/device/mali_kbase_device_hw.c index e80559a..4c98ae1 100644 --- a/mali_kbase/device/mali_kbase_device_hw.c +++ b/mali_kbase/device/mali_kbase_device_hw.c @@ -28,44 +28,6 @@ #include <mmu/mali_kbase_mmu.h> #if !IS_ENABLED(CONFIG_MALI_NO_MALI) -void kbase_reg_write(struct kbase_device *kbdev, u32 offset, u32 value) -{ - KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); - KBASE_DEBUG_ASSERT(kbdev->dev != NULL); - - writel(value, kbdev->reg + offset); - -#if IS_ENABLED(CONFIG_DEBUG_FS) - if (unlikely(kbdev->io_history.enabled)) - kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, - value, 1); -#endif /* CONFIG_DEBUG_FS */ - dev_dbg(kbdev->dev, "w: reg %08x val %08x", offset, value); -} - -KBASE_EXPORT_TEST_API(kbase_reg_write); - -u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset) -{ - u32 val; - - KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); - KBASE_DEBUG_ASSERT(kbdev->dev != NULL); - - val = readl(kbdev->reg + offset); - -#if IS_ENABLED(CONFIG_DEBUG_FS) - if (unlikely(kbdev->io_history.enabled)) - kbase_io_history_add(&kbdev->io_history, kbdev->reg + offset, - val, 0); -#endif /* CONFIG_DEBUG_FS */ - dev_dbg(kbdev->dev, "r: reg %08x val %08x", offset, val); - - return val; -} - -KBASE_EXPORT_TEST_API(kbase_reg_read); - bool kbase_is_gpu_removed(struct kbase_device *kbdev) { u32 val; @@ -99,7 +61,7 @@ void kbase_gpu_start_cache_clean_nolock(struct kbase_device *kbdev) KBASE_KTRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, 0); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), - GPU_COMMAND_CLEAN_INV_CACHES); + GPU_COMMAND_CACHE_CLN_INV_L2); kbdev->cache_clean_in_progress = true; } @@ -134,7 +96,7 @@ void kbase_clean_caches_done(struct kbase_device *kbdev) KBASE_KTRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, 0); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), - GPU_COMMAND_CLEAN_INV_CACHES); + GPU_COMMAND_CACHE_CLN_INV_L2); } else { /* Disable interrupt */ irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); diff --git a/mali_kbase/device/mali_kbase_device_internal.h b/mali_kbase/device/mali_kbase_device_internal.h index d422407..d4f6875 100644 --- a/mali_kbase/device/mali_kbase_device_internal.h +++ b/mali_kbase/device/mali_kbase_device_internal.h @@ -39,6 +39,9 @@ struct kbase_device_init { int kbase_device_vinstr_init(struct kbase_device *kbdev); void kbase_device_vinstr_term(struct kbase_device *kbdev); +int kbase_device_kinstr_prfcnt_init(struct kbase_device *kbdev); +void kbase_device_kinstr_prfcnt_term(struct kbase_device *kbdev); + int kbase_device_timeline_init(struct kbase_device *kbdev); void kbase_device_timeline_term(struct kbase_device *kbdev); diff --git a/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c b/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c index f9d4c14..7499729 100644 --- a/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c +++ b/mali_kbase/gpu/backend/mali_kbase_gpu_fault_csf.c @@ -42,15 +42,19 @@ const char *kbase_gpu_exception_name(u32 const exception_code) case CS_FATAL_EXCEPTION_TYPE_CS_ENDPOINT_FAULT: e = "FATAL_CS_ENDPOINT_FAULT"; break; - case CS_FATAL_EXCEPTION_TYPE_CS_BUS_FAULT: - e = "FATAL_CS_BUS_FAULT"; - break; case CS_FATAL_EXCEPTION_TYPE_CS_INVALID_INSTRUCTION: e = "FATAL_CS_INVALID_INSTRUCTION"; break; case CS_FATAL_EXCEPTION_TYPE_CS_CALL_STACK_OVERFLOW: e = 
"FATAL_CS_CALL_STACK_OVERFLOW"; break; + /* + * CS_FAULT_EXCEPTION_TYPE_CS_BUS_FAULT and CS_FATAL_EXCEPTION_TYPE_CS_BUS_FAULT share the same error code + * Type of CS_BUS_FAULT will be differentiated by CSF exception handler + */ + case CS_FAULT_EXCEPTION_TYPE_CS_BUS_FAULT: + e = "CS_BUS_FAULT"; + break; /* Shader exceptions */ case CS_FAULT_EXCEPTION_TYPE_INSTR_INVALID_PC: e = "INSTR_INVALID_PC"; @@ -61,6 +65,10 @@ const char *kbase_gpu_exception_name(u32 const exception_code) case CS_FAULT_EXCEPTION_TYPE_INSTR_BARRIER_FAULT: e = "INSTR_BARRIER_FAULT"; break; + /* Iterator exceptions */ + case CS_FAULT_EXCEPTION_TYPE_KABOOM: + e = "KABOOM"; + break; /* Misc exceptions */ case CS_FAULT_EXCEPTION_TYPE_DATA_INVALID_FAULT: e = "DATA_INVALID_FAULT"; diff --git a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.c b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.c index 4737b0e..e240117 100644 --- a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.c +++ b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.c @@ -44,8 +44,9 @@ static inline u32 kbase_ipa_read_hwcnt( u32 offset) { u8 *p = (u8 *)model_data->dump_buf.dump_buf; + u64 val = *(u64 *)&p[offset]; - return *(u32 *)&p[offset]; + return (val > U32_MAX) ? U32_MAX : (u32)val; } static inline s64 kbase_ipa_add_saturate(s64 a, s64 b) diff --git a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.h b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.h index 3486a9b..faf08ef 100644 --- a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.h +++ b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_common_jm.h @@ -30,7 +30,7 @@ #define KBASE_IPA_MAX_GROUP_DEF_NUM 16 /* Number of bytes per hardware counter in a vinstr_buffer. */ -#define KBASE_IPA_NR_BYTES_PER_CNT 4 +#define KBASE_IPA_NR_BYTES_PER_CNT (sizeof(u64)) /* Number of hardware counters per block in a vinstr_buffer. 
*/ #define KBASE_IPA_NR_CNT_PER_BLOCK 64 diff --git a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_csf.c b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_csf.c index 1852c3c..a47699c 100644 --- a/mali_kbase/ipa/backend/mali_kbase_ipa_counter_csf.c +++ b/mali_kbase/ipa/backend/mali_kbase_ipa_counter_csf.c @@ -25,14 +25,18 @@ /* MEMSYS counter block offsets */ #define L2_RD_MSG_IN (16) #define L2_WR_MSG_IN (18) +#define L2_RD_MSG_OUT (22) #define L2_READ_LOOKUP (26) #define L2_EXT_WRITE_NOSNP_FULL (43) /* SC counter block offsets */ +#define FRAG_STARVING (8) +#define FRAG_PARTIAL_QUADS_RAST (10) #define FRAG_QUADS_EZS_UPDATE (13) #define FULL_QUAD_WARPS (21) #define EXEC_INSTR_FMA (27) #define EXEC_INSTR_CVT (28) +#define EXEC_INSTR_MSG (30) #define TEX_FILT_NUM_OPS (39) #define LS_MEM_READ_SHORT (45) #define LS_MEM_WRITE_SHORT (47) @@ -44,6 +48,8 @@ #define VFETCH_POS_READ_WAIT (29) #define VFETCH_VERTEX_WAIT (30) #define IDVS_VAR_SHAD_STALL (38) +#define ITER_STALL (40) +#define PMGR_PTR_RD_STALL (48) #define COUNTER_DEF(cnt_name, coeff, cnt_idx, block_type) \ { \ @@ -80,6 +86,33 @@ static const struct kbase_ipa_counter ipa_top_level_cntrs_def_todx[] = { TILER_COUNTER_DEF("vfetch_pos_read_wait", -119118, VFETCH_POS_READ_WAIT), }; +static const struct kbase_ipa_counter ipa_top_level_cntrs_def_tgrx[] = { + MEMSYS_COUNTER_DEF("l2_rd_msg_in", 295631, L2_RD_MSG_IN), + MEMSYS_COUNTER_DEF("l2_ext_write_nosnp_ull", 325168, L2_EXT_WRITE_NOSNP_FULL), + + TILER_COUNTER_DEF("prefetch_stall", 145435, PREFETCH_STALL), + TILER_COUNTER_DEF("idvs_var_shad_stall", -171917, IDVS_VAR_SHAD_STALL), + TILER_COUNTER_DEF("idvs_pos_shad_stall", 109980, IDVS_POS_SHAD_STALL), + TILER_COUNTER_DEF("vfetch_pos_read_wait", -119118, VFETCH_POS_READ_WAIT), +}; + +static const struct kbase_ipa_counter ipa_top_level_cntrs_def_tvax[] = { + MEMSYS_COUNTER_DEF("l2_rd_msg_out", 491414, L2_RD_MSG_OUT), + MEMSYS_COUNTER_DEF("l2_wr_msg_in", 408645, L2_WR_MSG_IN), + + TILER_COUNTER_DEF("iter_stall", 893324, ITER_STALL), + TILER_COUNTER_DEF("pmgr_ptr_rd_stall", -975117, PMGR_PTR_RD_STALL), + TILER_COUNTER_DEF("idvs_pos_shad_stall", 22555, IDVS_POS_SHAD_STALL), +}; + +static const struct kbase_ipa_counter ipa_top_level_cntrs_def_ttux[] = { + MEMSYS_COUNTER_DEF("l2_rd_msg_in", 800836, L2_RD_MSG_IN), + MEMSYS_COUNTER_DEF("l2_wr_msg_in", 415579, L2_WR_MSG_IN), + MEMSYS_COUNTER_DEF("l2_read_lookup", -198124, L2_READ_LOOKUP), + + TILER_COUNTER_DEF("idvs_pos_shad_stall", 117358, IDVS_POS_SHAD_STALL), + TILER_COUNTER_DEF("vfetch_vertex_wait", -391964, VFETCH_VERTEX_WAIT), +}; /* These tables provide a description of each performance counter * used by the shader cores counter model for energy estimation. 
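These counter tables pair each hardware counter with a scaled coefficient (negative values presumably capturing activity that correlates inversely with power). As a rough illustration of the general shape of such a counter model - an assumption for illustration only, since the driver's arithmetic also applies voltage and frequency scaling not shown here - the dynamic term is a weighted sum of counter activity:

#include <stddef.h>
#include <stdint.h>

struct example_counter {
        const char *name;
        int64_t coeff;   /* scaled coefficient from the table */
        uint32_t delta;  /* counter increase over the sample period */
};

static int64_t example_dynamic_sum(const struct example_counter *c, size_t n)
{
        int64_t total = 0;
        size_t i;

        /* Weighted sum only; a real model still scales by voltage/frequency. */
        for (i = 0; i < n; i++)
                total += c[i].coeff * (int64_t)c[i].delta;

        return total;
}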
@@ -93,6 +126,32 @@ static const struct kbase_ipa_counter ipa_shader_core_cntrs_def_todx[] = { SC_COUNTER_DEF("vary_slot_16", 181069, VARY_SLOT_16), }; +static const struct kbase_ipa_counter ipa_shader_core_cntrs_def_tgrx[] = { + SC_COUNTER_DEF("exec_instr_fma", 505449, EXEC_INSTR_FMA), + SC_COUNTER_DEF("tex_filt_num_operations", 574869, TEX_FILT_NUM_OPS), + SC_COUNTER_DEF("ls_mem_read_short", 60917, LS_MEM_READ_SHORT), + SC_COUNTER_DEF("frag_quads_ezs_update", 694555, FRAG_QUADS_EZS_UPDATE), + SC_COUNTER_DEF("ls_mem_write_short", 698290, LS_MEM_WRITE_SHORT), + SC_COUNTER_DEF("vary_slot_16", 181069, VARY_SLOT_16), +}; + +static const struct kbase_ipa_counter ipa_shader_core_cntrs_def_tvax[] = { + SC_COUNTER_DEF("tex_filt_num_operations", 142536, TEX_FILT_NUM_OPS), + SC_COUNTER_DEF("exec_instr_fma", 243497, EXEC_INSTR_FMA), + SC_COUNTER_DEF("exec_instr_msg", 1344410, EXEC_INSTR_MSG), + SC_COUNTER_DEF("vary_slot_16", -119612, VARY_SLOT_16), + SC_COUNTER_DEF("frag_partial_quads_rast", 676201, FRAG_PARTIAL_QUADS_RAST), + SC_COUNTER_DEF("frag_starving", 62421, FRAG_STARVING), +}; + +static const struct kbase_ipa_counter ipa_shader_core_cntrs_def_ttux[] = { + SC_COUNTER_DEF("exec_instr_fma", 457012, EXEC_INSTR_FMA), + SC_COUNTER_DEF("tex_filt_num_operations", 441911, TEX_FILT_NUM_OPS), + SC_COUNTER_DEF("ls_mem_read_short", 322525, LS_MEM_READ_SHORT), + SC_COUNTER_DEF("full_quad_warps", 844124, FULL_QUAD_WARPS), + SC_COUNTER_DEF("exec_instr_cvt", 226411, EXEC_INSTR_CVT), + SC_COUNTER_DEF("frag_quads_ezs_update",372032, FRAG_QUADS_EZS_UPDATE), +}; #define IPA_POWER_MODEL_OPS(gpu, init_token) \ const struct kbase_ipa_model_ops kbase_ ## gpu ## _ipa_model_ops = { \ @@ -128,13 +187,21 @@ static const struct kbase_ipa_counter ipa_shader_core_cntrs_def_todx[] = { /* Reference voltage value is 750 mV. 
*/ STANDARD_POWER_MODEL(todx, 750); +STANDARD_POWER_MODEL(tgrx, 750); +STANDARD_POWER_MODEL(tvax, 750); +STANDARD_POWER_MODEL(ttux, 750); /* Assuming LODX is an alias of TODX for IPA */ ALIAS_POWER_MODEL(lodx, todx); +/* Assuming LTUX is an alias of TTUX for IPA */ +ALIAS_POWER_MODEL(ltux, ttux); + static const struct kbase_ipa_model_ops *ipa_counter_model_ops[] = { &kbase_todx_ipa_model_ops, &kbase_lodx_ipa_model_ops, + &kbase_tgrx_ipa_model_ops, &kbase_tvax_ipa_model_ops, + &kbase_ttux_ipa_model_ops, &kbase_ltux_ipa_model_ops }; const struct kbase_ipa_model_ops *kbase_ipa_counter_model_ops_find( @@ -165,6 +232,14 @@ const char *kbase_ipa_counter_model_name_from_id(u32 gpu_id) return "mali-todx-power-model"; case GPU_ID2_PRODUCT_LODX: return "mali-lodx-power-model"; + case GPU_ID2_PRODUCT_TGRX: + return "mali-tgrx-power-model"; + case GPU_ID2_PRODUCT_TVAX: + return "mali-tvax-power-model"; + case GPU_ID2_PRODUCT_TTUX: + return "mali-ttux-power-model"; + case GPU_ID2_PRODUCT_LTUX: + return "mali-ltux-power-model"; default: return NULL; } diff --git a/mali_kbase/ipa/mali_kbase_ipa_debugfs.c b/mali_kbase/ipa/mali_kbase_ipa_debugfs.c index 5976389..14df542 100644 --- a/mali_kbase/ipa/mali_kbase_ipa_debugfs.c +++ b/mali_kbase/ipa/mali_kbase_ipa_debugfs.c @@ -247,7 +247,7 @@ static void kbase_ipa_model_debugfs_init(struct kbase_ipa_model *model) dir = debugfs_create_dir(model->ops->name, model->kbdev->mali_debugfs_directory); - if (!dir) { + if (IS_ERR_OR_NULL(dir)) { dev_err(model->kbdev->dev, "Couldn't create mali debugfs %s directory", model->ops->name); diff --git a/mali_kbase/jm/mali_kbase_jm_defs.h b/mali_kbase/jm/mali_kbase_jm_defs.h index c490f1c..cb1c276 100644 --- a/mali_kbase/jm/mali_kbase_jm_defs.h +++ b/mali_kbase/jm/mali_kbase_jm_defs.h @@ -87,8 +87,6 @@ #define KBASE_KATOM_FLAG_FAIL_BLOCKER (1<<8) /* Atom is currently in the list of atoms blocked on cross-slot dependencies */ #define KBASE_KATOM_FLAG_JSCTX_IN_X_DEP_LIST (1<<9) -/* Atom is currently holding a context reference */ -#define KBASE_KATOM_FLAG_HOLDING_CTX_REF (1<<10) /* Atom requires GPU to be in protected mode */ #define KBASE_KATOM_FLAG_PROTECTED (1<<11) /* Atom has been stored in runnable_tree */ @@ -176,7 +174,7 @@ struct kbase_jd_atom_dependency { static inline const struct kbase_jd_atom * kbase_jd_katom_dep_atom(const struct kbase_jd_atom_dependency *dep) { - LOCAL_ASSERT(dep != NULL); + KBASE_DEBUG_ASSERT(dep != NULL); return (const struct kbase_jd_atom *)(dep->atom); } @@ -191,7 +189,7 @@ kbase_jd_katom_dep_atom(const struct kbase_jd_atom_dependency *dep) static inline u8 kbase_jd_katom_dep_type( const struct kbase_jd_atom_dependency *dep) { - LOCAL_ASSERT(dep != NULL); + KBASE_DEBUG_ASSERT(dep != NULL); return dep->dep_type; } @@ -209,7 +207,7 @@ static inline void kbase_jd_katom_dep_set( { struct kbase_jd_atom_dependency *dep; - LOCAL_ASSERT(const_dep != NULL); + KBASE_DEBUG_ASSERT(const_dep != NULL); dep = (struct kbase_jd_atom_dependency *)const_dep; @@ -227,7 +225,7 @@ static inline void kbase_jd_katom_dep_clear( { struct kbase_jd_atom_dependency *dep; - LOCAL_ASSERT(const_dep != NULL); + KBASE_DEBUG_ASSERT(const_dep != NULL); dep = (struct kbase_jd_atom_dependency *)const_dep; @@ -653,6 +651,48 @@ static inline bool kbase_jd_katom_is_protected( return (bool)(katom->atom_flags & KBASE_KATOM_FLAG_PROTECTED); } +/** + * kbase_atom_is_younger - query if one atom is younger by age than another + * @katom_a the first atom + * @katom_a the second atom + * + * Return: true if the first atom is strictly younger 
than the second, false + * otherwise. + */ +static inline bool kbase_jd_atom_is_younger(const struct kbase_jd_atom *katom_a, + const struct kbase_jd_atom *katom_b) +{ + return ((s32)(katom_a->age - katom_b->age) < 0); +} + +/** + * kbase_jd_atom_is_earlier + * @katom_a: the first atom + * @katom_b: the second atom + * + * Return: true if the first atom has been submitted earlier than the + * second atom. It is used to understand if an atom that is ready has been + * submitted earlier than the currently running atom, so that the currently + * running atom should be preempted to allow the ready atom to run. + */ +static inline bool kbase_jd_atom_is_earlier(const struct kbase_jd_atom *katom_a, + const struct kbase_jd_atom *katom_b) +{ + /* No seq_nr set? */ + if (!katom_a->seq_nr || !katom_b->seq_nr) + return false; + + /* Efficiently handle the unlikely case of wrapping. + * The following code assumes that the delta between the sequence number + * of the two atoms is less than INT64_MAX. + * In the extremely unlikely case where the delta is higher, the comparison + * defaults for no preemption. + * The code also assumes that the conversion from unsigned to signed types + * works because the signed integers are 2's complement. + */ + return (s64)(katom_a->seq_nr - katom_b->seq_nr) < 0; +} + /* * Theory of operations: * diff --git a/mali_kbase/jm/mali_kbase_jm_js.h b/mali_kbase/jm/mali_kbase_jm_js.h index 5e0c4bc..5a972a5 100644 --- a/mali_kbase/jm/mali_kbase_jm_js.h +++ b/mali_kbase/jm/mali_kbase_jm_js.h @@ -108,6 +108,52 @@ int kbasep_js_kctx_init(struct kbase_context *const kctx); */ void kbasep_js_kctx_term(struct kbase_context *kctx); +/* kbase_jsctx_slot_prio_blocked_set - Set a context as being blocked for a job + * slot at and below a given priority level + * @kctx: The kbase_context + * @js: The job slot + * @sched_prio: The priority levels that the context is blocked at for @js (all + * priority levels at this level and below will be blocked) + * + * To preserve ordering and dependencies of atoms on soft-stopping (both within + * an between priority levels), a context must be marked as blocked for that + * atom's job slot, for all priority levels at or below the atom's priority. + * + * This must only be called due to an atom that was pulled from the context, + * otherwise there will be no way of unblocking the context when the atom is + * completed/unpulled. + * + * Atoms of higher priority might still be able to be pulled from the context + * on @js. This helps with starting a high priority atom as soon as possible. + */ +static inline void kbase_jsctx_slot_prio_blocked_set(struct kbase_context *kctx, + int js, int sched_prio) +{ + struct kbase_jsctx_slot_tracking *slot_tracking = + &kctx->slot_tracking[js]; + + lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + WARN(!slot_tracking->atoms_pulled_pri[sched_prio], + "When marking slot %d as blocked for priority %d on a kctx, no atoms were pulled - the slot cannot become unblocked", + js, sched_prio); + + slot_tracking->blocked |= ((kbase_js_prio_bitmap_t)1) << sched_prio; + KBASE_KTRACE_ADD_JM_SLOT_INFO(kctx->kbdev, JS_SLOT_PRIO_BLOCKED, kctx, + NULL, 0, js, (unsigned int)sched_prio); +} + +/* kbase_jsctx_atoms_pulled - Return number of atoms pulled on a context + * @kctx: The kbase_context + * + * Having atoms pulled indicates the context is not idle. 
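kbase_jd_atom_is_younger() and kbase_jd_atom_is_earlier() above use the same trick: subtract the unsigned counters and test the sign of the result as a signed value, which keeps working across a counter wrap as long as the two values are within half the range of each other (and, as the in-code comment notes, relies on two's-complement conversion). A standalone demonstration:

#include <stdint.h>
#include <stdio.h>

static int example_is_earlier(uint64_t seq_a, uint64_t seq_b)
{
        /* Negative difference (read as signed) means seq_a was submitted earlier. */
        return (int64_t)(seq_a - seq_b) < 0;
}

int main(void)
{
        /* seq_a assigned just before the counter wrapped, seq_b just after. */
        uint64_t seq_a = UINT64_MAX - 1, seq_b = 2;

        printf("%d\n", example_is_earlier(seq_a, seq_b)); /* prints 1 */
        return 0;
}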
+ * + * Return: the number of atoms pulled on @kctx + */ +static inline int kbase_jsctx_atoms_pulled(struct kbase_context *kctx) +{ + return atomic_read(&kctx->atoms_pulled_all_slots); +} + /** * kbasep_js_add_job - Add a job chain to the Job Scheduler, * and take necessary actions to @@ -947,7 +993,38 @@ static inline base_jd_prio kbasep_js_sched_prio_to_atom_prio(int sched_prio) * * Return: The same or lower priority than requested. */ - base_jd_prio kbase_js_priority_check(struct kbase_device *kbdev, base_jd_prio priority); +/** + * kbase_js_atom_runs_before - determine if atoms for the same slot have an + * ordering relation + * @kbdev: kbase device + * @katom_a: the first atom + * @katom_b: the second atom. + * @order_flags: combination of KBASE_ATOM_ORDERING_FLAG_<...> for the ordering + * relation + * + * This is for making consistent decisions about the ordering of atoms when we + * need to do pre-emption on a slot, which includes stopping existing atoms + * when a new atom is ready to run, and also which other atoms to remove from + * the slot when the atom in JSn_HEAD is being pre-empted. + * + * This only handles @katom_a and @katom_b being for the same job slot, as + * pre-emption only operates within a slot. + * + * Note: there is currently no use-case for this as a sorting comparison + * functions, hence only a boolean returned instead of int -1, 0, +1 return. If + * required in future, a modification to do so would be better than calling + * twice with katom_a and katom_b swapped. + * + * Return: + * true if @katom_a should run before @katom_b, false otherwise. + * A false return value does not distinguish between "no ordering relation" and + * "@katom_a should run after @katom_b". + */ +bool kbase_js_atom_runs_before(struct kbase_device *kbdev, + const struct kbase_jd_atom *katom_a, + const struct kbase_jd_atom *katom_b, + const kbase_atom_ordering_flag_t order_flags); + #endif /* _KBASE_JM_JS_H_ */ diff --git a/mali_kbase/jm/mali_kbase_js_defs.h b/mali_kbase/jm/mali_kbase_js_defs.h index 75152fb..a1d40ba 100644 --- a/mali_kbase/jm/mali_kbase_js_defs.h +++ b/mali_kbase/jm/mali_kbase_js_defs.h @@ -187,6 +187,33 @@ enum { */ #define KBASE_JS_ATOM_SCHED_PRIO_DEFAULT KBASE_JS_ATOM_SCHED_PRIO_MED +/* Atom priority bitmaps, where bit 0 is the highest priority, and higher bits + * indicate successively lower KBASE_JS_ATOM_SCHED_PRIO_<...> levels. + * + * Must be strictly larger than the number of bits to represent a bitmap of + * priorities, so that we can do calculations such as: + * (1 << KBASE_JS_ATOM_SCHED_PRIO_COUNT) - 1 + * ...without causing undefined behavior due to a shift beyond the width of the + * type + * + * If KBASE_JS_ATOM_SCHED_PRIO_COUNT starts requiring 32 bits, then it's worth + * moving to DECLARE_BITMAP() + */ +typedef u8 kbase_js_prio_bitmap_t; + +/* Ordering modification for kbase_js_atom_runs_before() */ +typedef u32 kbase_atom_ordering_flag_t; + +/* Atoms of the same context and priority should have their ordering decided by + * their seq_nr instead of their age. + * + * seq_nr is used as a more slowly changing variant of age - it increases once + * per group of related atoms, as determined by user-space. Hence, it can be + * used to limit re-ordering decisions (such as pre-emption) to only re-order + * between such groups, rather than re-order within those groups of atoms. 
+ */ +#define KBASE_ATOM_ORDERING_FLAG_SEQNR (((kbase_atom_ordering_flag_t)1) << 0) + /** * struct kbasep_js_device_data - KBase Device Data Job Scheduler sub-structure * @runpool_irq: Sub-structure to collect together Job Scheduling data used in @@ -393,4 +420,23 @@ struct kbasep_js_atom_retained_state { */ #define KBASEP_JS_TICK_RESOLUTION_US 1 +/** + * struct kbase_jsctx_slot_tracking - Job Scheduling tracking of a context's + * use of a job slot + * @blocked: bitmap of priorities that this slot is blocked at + * @atoms_pulled: counts of atoms that have been pulled from this slot, + * across all priority levels + * @atoms_pulled_pri: counts of atoms that have been pulled from this slot, per + * priority level + * + * Controls how a slot from the &struct kbase_context's jsctx_queue is managed, + * for example to ensure correct ordering of atoms when atoms of different + * priorities are unpulled. + */ +struct kbase_jsctx_slot_tracking { + kbase_js_prio_bitmap_t blocked; + atomic_t atoms_pulled; + int atoms_pulled_pri[KBASE_JS_ATOM_SCHED_PRIO_COUNT]; +}; + #endif /* _KBASE_JS_DEFS_H_ */ diff --git a/mali_kbase/mali_base_hwconfig_features.h b/mali_kbase/mali_base_hwconfig_features.h index 93cd05f..2e81cb1 100644 --- a/mali_kbase/mali_base_hwconfig_features.h +++ b/mali_kbase/mali_base_hwconfig_features.h @@ -28,26 +28,7 @@ #define _BASE_HWCONFIG_FEATURES_H_ enum base_hw_feature { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_TEST4_DATUM_MODE, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_WARPING, BASE_HW_FEATURE_FLUSH_REDUCTION, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, @@ -55,6 +36,7 @@ enum base_hw_feature { BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_ASN_HASH, + BASE_HW_FEATURE_GPU_SLEEP, BASE_HW_FEATURE_END }; @@ -63,240 +45,69 @@ static const enum base_hw_feature base_hw_features_generic[] = { }; static const enum base_hw_feature base_hw_features_tMIx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tHEx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - 
BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tSIx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tDVx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tNOx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tGOx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - 
BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tTRx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tNAx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tBEx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, @@ -304,27 +115,8 @@ static const enum base_hw_feature base_hw_features_tBEx[] = { }; static const enum base_hw_feature 
base_hw_features_tBAx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, @@ -332,27 +124,8 @@ static const enum base_hw_feature base_hw_features_tBAx[] = { }; static const enum base_hw_feature base_hw_features_tDUx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, @@ -360,85 +133,37 @@ static const enum base_hw_feature base_hw_features_tDUx[] = { }; static const enum base_hw_feature base_hw_features_tODx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tGRx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, 
BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END }; static const enum base_hw_feature base_hw_features_tVAx[] = { - BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, - BASE_HW_FEATURE_XAFFINITY, - BASE_HW_FEATURE_WARPING, - BASE_HW_FEATURE_INTERPIPE_REG_ALIASING, - BASE_HW_FEATURE_32_BIT_UNIFORM_ADDRESS, - BASE_HW_FEATURE_ATTR_AUTO_TYPE_INFERRAL, - BASE_HW_FEATURE_BRNDOUT_CC, - BASE_HW_FEATURE_BRNDOUT_KILL, - BASE_HW_FEATURE_LD_ST_LEA_TEX, - BASE_HW_FEATURE_LD_ST_TILEBUFFER, - BASE_HW_FEATURE_LINEAR_FILTER_FLOAT, - BASE_HW_FEATURE_MRT, - BASE_HW_FEATURE_MSAA_16X, - BASE_HW_FEATURE_NEXT_INSTRUCTION_TYPE, - BASE_HW_FEATURE_OUT_OF_ORDER_EXEC, - BASE_HW_FEATURE_T7XX_PAIRING_RULES, - BASE_HW_FEATURE_TEST4_DATUM_MODE, BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, - BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_L2_CONFIG, BASE_HW_FEATURE_CLEAN_ONLY_SAFE, BASE_HW_FEATURE_END }; +static const enum base_hw_feature base_hw_features_tTUx[] = { + BASE_HW_FEATURE_FLUSH_REDUCTION, + BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, + BASE_HW_FEATURE_L2_CONFIG, + BASE_HW_FEATURE_CLEAN_ONLY_SAFE, + BASE_HW_FEATURE_ASN_HASH, + BASE_HW_FEATURE_END +}; + #endif /* _BASE_HWCONFIG_FEATURES_H_ */ diff --git a/mali_kbase/mali_base_hwconfig_issues.h b/mali_kbase/mali_base_hwconfig_issues.h index beda1e4..d188120 100644 --- a/mali_kbase/mali_base_hwconfig_issues.h +++ b/mali_kbase/mali_base_hwconfig_issues.h @@ -59,6 +59,7 @@ enum base_hw_issue { BASE_HW_ISSUE_TTRX_3464, BASE_HW_ISSUE_TTRX_3485, BASE_HW_ISSUE_GPU2019_3212, + BASE_HW_ISSUE_TURSEHW_1997, BASE_HW_ISSUE_END }; @@ -637,5 +638,21 @@ static const enum base_hw_issue base_hw_issues_model_tVAx[] = { BASE_HW_ISSUE_END }; +static const enum base_hw_issue base_hw_issues_model_tTUx[] = { + BASE_HW_ISSUE_5736, + BASE_HW_ISSUE_9435, + BASE_HW_ISSUE_TSIX_2033, + BASE_HW_ISSUE_TTRX_1337, + BASE_HW_ISSUE_END +}; + +static const enum base_hw_issue base_hw_issues_tTUx_r0p0[] = { + BASE_HW_ISSUE_9435, + BASE_HW_ISSUE_TSIX_2033, + BASE_HW_ISSUE_TTRX_1337, + BASE_HW_ISSUE_TURSEHW_1997, + BASE_HW_ISSUE_END +}; + #endif /* _BASE_HWCONFIG_ISSUES_H_ */ diff --git a/mali_kbase/mali_kbase.h b/mali_kbase/mali_kbase.h index b4e50ae..6bcb754 100644 --- a/mali_kbase/mali_kbase.h +++ b/mali_kbase/mali_kbase.h @@ -491,6 +491,46 @@ void kbase_pm_metrics_start(struct kbase_device *kbdev); */ void kbase_pm_metrics_stop(struct kbase_device *kbdev); +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) +/** + * kbase_pm_handle_runtime_suspend - Handle the runtime suspend of GPU + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function is called from the runtime suspend callback function for + * saving the HW state and powering down GPU, if GPU was in sleep state mode. + * It does the following steps + * - Powers up the L2 cache and re-activates the MCU. + * - Suspend the CSGs + * - Halts the MCU + * - Powers down the L2 cache. + * - Invokes the power_off callback to power down the GPU. + * + * Return: 0 if the GPU was already powered down or no error was encountered + * in the power down, otherwise an error code. 
+ */ +int kbase_pm_handle_runtime_suspend(struct kbase_device *kbdev); + +/** + * kbase_pm_force_mcu_wakeup_after_sleep - Force the wake up of MCU from sleep + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function forces the wake up of MCU from sleep state and wait for + * MCU to become active. + * It usually gets called from the runtime suspend callback function. + * It also gets called from the GPU reset handler or at the time of system + * suspend or when User tries to terminate/suspend the on-slot group. + * + * Note: @gpu_wakeup_override flag that forces the reactivation of MCU is + * set by this function and it is the caller's responsibility to + * clear the flag. + * + * Return: 0 if the wake up was successful. + */ +int kbase_pm_force_mcu_wakeup_after_sleep(struct kbase_device *kbdev); +#endif + #if !MALI_USE_CSF /** * Return the atom's ID, as was originally supplied by userspace in @@ -498,7 +538,8 @@ void kbase_pm_metrics_stop(struct kbase_device *kbdev); * @kctx: KBase context pointer * @katom: Atome for which to return ID */ -static inline int kbase_jd_atom_id(struct kbase_context *kctx, struct kbase_jd_atom *katom) +static inline int kbase_jd_atom_id(struct kbase_context *kctx, + const struct kbase_jd_atom *katom) { int result; diff --git a/mali_kbase/mali_kbase_as_fault_debugfs.c b/mali_kbase/mali_kbase_as_fault_debugfs.c index 027eb8c..deb412c 100644 --- a/mali_kbase/mali_kbase_as_fault_debugfs.c +++ b/mali_kbase/mali_kbase_as_fault_debugfs.c @@ -93,7 +93,10 @@ void kbase_as_fault_debugfs_init(struct kbase_device *kbdev) debugfs_directory = debugfs_create_dir("address_spaces", kbdev->mali_debugfs_directory); - if (debugfs_directory) { + if (IS_ERR_OR_NULL(debugfs_directory)) { + dev_warn(kbdev->dev, + "unable to create address_spaces debugfs directory"); + } else { for (i = 0; i < kbdev->nr_hw_address_spaces; i++) { snprintf(as_name, ARRAY_SIZE(as_name), "as%u", i); debugfs_create_file(as_name, S_IRUGO, @@ -101,9 +104,6 @@ void kbase_as_fault_debugfs_init(struct kbase_device *kbdev) (void *)(uintptr_t)i, &as_fault_fops); } - } else { - dev_warn(kbdev->dev, - "unable to create address_spaces debugfs directory"); } #endif /* CONFIG_MALI_DEBUG */ diff --git a/mali_kbase/mali_kbase_config.h b/mali_kbase/mali_kbase_config.h index e7eb334..8b7ee13 100644 --- a/mali_kbase/mali_kbase_config.h +++ b/mali_kbase/mali_kbase_config.h @@ -170,6 +170,12 @@ struct kbase_pm_callback_conf { * the clocks to the GPU, or to completely power down the GPU. * The platform specific private pointer kbase_device::platform_context can be accessed and modified in here. It is the * platform \em callbacks responsibility to initialize and terminate this pointer if used (see @ref kbase_platform_funcs_conf). + * + * If runtime PM is enabled and @power_runtime_gpu_idle_callback is used + * then this callback should power off the GPU (or switch off the clocks + * to GPU) immediately. If @power_runtime_gpu_idle_callback is not used, + * then this callback can set the autosuspend timeout (if desired) and + * let the GPU be powered down later. */ void (*power_off_callback)(struct kbase_device *kbdev); @@ -289,6 +295,49 @@ struct kbase_pm_callback_conf { * be raised. On error, return the corresponding OS error code. */ int (*soft_reset_callback)(struct kbase_device *kbdev); + + /* + * Optional callback invoked after GPU becomes idle, not supported on + * JM GPUs. + * + * This callback will be invoked by the Kbase when GPU becomes idle. 
+ * For JM GPUs or when runtime PM is disabled, Kbase will not invoke + * this callback and @power_off_callback will be invoked directly. + * + * This callback is supposed to decrement the runtime PM core reference + * count to zero and trigger the auto-suspend timer, which implies that + * @power_off_callback shouldn't initiate the runtime suspend. + * + * GPU registers still remain accessible until @power_off_callback gets + * invoked later on the expiry of auto-suspend timer. + * + * Note: The Linux kernel must have CONFIG_PM_RUNTIME enabled to use + * this feature. + */ + void (*power_runtime_gpu_idle_callback)(struct kbase_device *kbdev); + + /* + * Optional callback invoked to change the runtime PM core state to + * active. + * + * This callback will be invoked by Kbase when GPU needs to be + * reactivated, but only if @power_runtime_gpu_idle_callback was invoked + * previously. So both @power_runtime_gpu_idle_callback and this + * callback needs to be implemented at the same time. + * + * Kbase will invoke @power_on_callback first before invoking this + * callback if the GPU was powered down previously, otherwise directly. + * + * This callback is supposed to increment the runtime PM core reference + * count to 1, which implies that @power_on_callback shouldn't initiate + * the runtime resume. The runtime resume may not happen synchronously + * to avoid a potential deadlock due to the runtime suspend happening + * simultaneously from some other thread. + * + * Note: The Linux kernel must have CONFIG_PM_RUNTIME enabled to use + * this feature. + */ + void (*power_runtime_gpu_active_callback)(struct kbase_device *kbdev); }; /* struct kbase_gpu_clk_notifier_data - Data for clock rate change notifier. diff --git a/mali_kbase/mali_kbase_config_defaults.h b/mali_kbase/mali_kbase_config_defaults.h index 63c36e2..8d64184 100644 --- a/mali_kbase/mali_kbase_config_defaults.h +++ b/mali_kbase/mali_kbase_config_defaults.h @@ -177,6 +177,19 @@ enum { */ #define DEFAULT_RESET_TIMEOUT_MS (3000) /* 3s */ +/* Waiting timeout for status change acknowledgment, in clock cycles + * Based on 3000ms timeout at nominal 100MHz, as is required for Android - based + * on scaling from a 50MHz GPU system. + */ +#define DEFAULT_REF_TIMEOUT_FREQ_KHZ (100000) +#define CSF_FIRMWARE_TIMEOUT_CYCLES (300000000) + +/* A default timeout to be used when an invalid timeout selector is + * used to retrieve the timeout, on JM GPUs. CSF GPUs use the Firmware + * timeout as the default. + */ +#define JM_DEFAULT_TIMEOUT_CYCLES (150000000) + /** * Default timeslice that a context is scheduled in for, in nanoseconds. 
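Editor's note: the two optional callbacks documented in the hunk above move runtime-PM usage counting out of the power_on/power_off pair. A minimal sketch of how a platform integration might pair them, assuming the standard Linux runtime-PM helpers and an autosuspend delay configured elsewhere; the example_* names are hypothetical and not part of this patch:

        #include <linux/pm_runtime.h>
        /* struct kbase_device and struct kbase_pm_callback_conf come from
         * mali_kbase_defs.h / mali_kbase_config.h in this driver.
         */

        static void example_runtime_gpu_idle(struct kbase_device *kbdev)
        {
                /* Drop the usage count taken by the active callback and arm
                 * the autosuspend timer; per the kernel-doc above, the GPU is
                 * actually powered off later, when the timer expires.
                 */
                pm_runtime_mark_last_busy(kbdev->dev);
                pm_runtime_put_autosuspend(kbdev->dev);
        }

        static void example_runtime_gpu_active(struct kbase_device *kbdev)
        {
                /* Asynchronous get: the resume runs from the PM workqueue, so
                 * this cannot deadlock against a runtime suspend in flight.
                 */
                pm_runtime_get(kbdev->dev);
        }

        static struct kbase_pm_callback_conf example_pm_callbacks = {
                .power_runtime_gpu_idle_callback   = example_runtime_gpu_idle,
                .power_runtime_gpu_active_callback = example_runtime_gpu_active,
                /* power_on/power_off and the other callbacks are omitted. */
        };

As the kernel-doc states, the idle and active callbacks must be implemented together or not at all.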
* diff --git a/mali_kbase/mali_kbase_core_linux.c b/mali_kbase/mali_kbase_core_linux.c index e7fc41e..2472c7c 100644 --- a/mali_kbase/mali_kbase_core_linux.c +++ b/mali_kbase/mali_kbase_core_linux.c @@ -53,6 +53,7 @@ #include "mali_kbase_hwcnt_context.h" #include "mali_kbase_hwcnt_virtualizer.h" #include "mali_kbase_hwcnt_legacy.h" +#include "mali_kbase_kinstr_prfcnt.h" #include "mali_kbase_vinstr.h" #if MALI_USE_CSF #include "csf/mali_kbase_csf_firmware.h" @@ -71,6 +72,9 @@ #endif #include "backend/gpu/mali_kbase_pm_internal.h" #include "mali_kbase_dvfs_debugfs.h" +#if IS_ENABLED(CONFIG_DEBUG_FS) +#include "mali_kbase_pbha_debugfs.h" +#endif #include <linux/module.h> #include <linux/init.h> @@ -403,6 +407,22 @@ static int kbase_api_handshake_dummy(struct kbase_file *kfile, return -EPERM; } +static int kbase_api_kinstr_prfcnt_enum_info( + struct kbase_file *kfile, + struct kbase_ioctl_kinstr_prfcnt_enum_info *prfcnt_enum_info) +{ + return kbase_kinstr_prfcnt_enum_info(kfile->kbdev->kinstr_prfcnt_ctx, + prfcnt_enum_info); +} + +static int kbase_api_kinstr_prfcnt_setup( + struct kbase_file *kfile, + union kbase_ioctl_kinstr_prfcnt_setup *prfcnt_setup) +{ + return kbase_kinstr_prfcnt_setup(kfile->kbdev->kinstr_prfcnt_ctx, + prfcnt_setup); +} + static struct kbase_device *to_kbase_device(struct device *dev) { return dev_get_drvdata(dev); @@ -808,16 +828,13 @@ static int kbase_api_mem_alloc(struct kbase_context *kctx, u64 flags = alloc->in.flags; u64 gpu_va; - rcu_read_lock(); - /* Don't allow memory allocation until user space has set up the - * tracking page (which sets kctx->process_mm). Also catches when we've - * forked. + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. */ - if (rcu_dereference(kctx->process_mm) != current->mm) { - rcu_read_unlock(); + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + + if (!kbase_mem_allow_alloc(kctx)) return -EINVAL; - } - rcu_read_unlock(); if (flags & BASEP_MEM_FLAGS_KERNEL_ONLY) return -ENOMEM; @@ -849,7 +866,8 @@ static int kbase_api_mem_alloc(struct kbase_context *kctx, #endif reg = kbase_mem_alloc(kctx, alloc->in.va_pages, alloc->in.commit_pages, - alloc->in.extension, &flags, &gpu_va); + alloc->in.extension, &flags, &gpu_va, + mmu_sync_info); if (!reg) return -ENOMEM; @@ -1643,6 +1661,20 @@ static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct kbase_ioctl_set_flags, kfile); break; + + case KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO: + KBASE_HANDLE_IOCTL_INOUT( + KBASE_IOCTL_KINSTR_PRFCNT_ENUM_INFO, + kbase_api_kinstr_prfcnt_enum_info, + struct kbase_ioctl_kinstr_prfcnt_enum_info, kfile); + break; + + case KBASE_IOCTL_KINSTR_PRFCNT_SETUP: + KBASE_HANDLE_IOCTL_INOUT(KBASE_IOCTL_KINSTR_PRFCNT_SETUP, + kbase_api_kinstr_prfcnt_setup, + union kbase_ioctl_kinstr_prfcnt_setup, + kfile); + break; } kctx = kbase_file_get_kctx_if_setup_complete(kfile); @@ -3097,6 +3129,10 @@ static ssize_t kbase_show_gpuinfo(struct device *dev, .name = "Mali-G510" }, { .id = GPU_ID2_PRODUCT_TVAX >> GPU_ID_VERSION_PRODUCT_ID_SHIFT, .name = "Mali-G310" }, + { .id = GPU_ID2_PRODUCT_TTUX >> GPU_ID_VERSION_PRODUCT_ID_SHIFT, + .name = "Mali-TTUX" }, + { .id = GPU_ID2_PRODUCT_LTUX >> GPU_ID_VERSION_PRODUCT_ID_SHIFT, + .name = "Mali-LTUX" }, }; const char *product_name = "(Unknown Mali GPU)"; struct kbase_device *kbdev; @@ -4574,25 +4610,31 @@ MAKE_QUIRK_ACCESSORS(tiler); MAKE_QUIRK_ACCESSORS(mmu); MAKE_QUIRK_ACCESSORS(gpu); -static ssize_t kbase_device_debugfs_reset_write(struct file *file, 
- const char __user *ubuf, size_t count, loff_t *ppos) +/** + * kbase_device_debugfs_reset_write() - Reset the GPU + * + * @data: Pointer to the Kbase device. + * @wait_for_reset: Value written to the file. + * + * This function will perform the GPU reset, and if the value written to + * the file is 1 it will also wait for the reset to complete. + * + * Return: 0 in case of no error otherwise a negative value. + */ +static int kbase_device_debugfs_reset_write(void *data, u64 wait_for_reset) { - struct kbase_device *kbdev = file->private_data; - CSTD_UNUSED(ubuf); - CSTD_UNUSED(count); - CSTD_UNUSED(ppos); + struct kbase_device *kbdev = data; trigger_reset(kbdev); - return count; + if (wait_for_reset == 1) + return kbase_reset_gpu_wait(kbdev); + + return 0; } -static const struct file_operations fops_trigger_reset = { - .owner = THIS_MODULE, - .open = simple_open, - .write = kbase_device_debugfs_reset_write, - .llseek = default_llseek, -}; +DEFINE_SIMPLE_ATTRIBUTE(fops_trigger_reset, + NULL, &kbase_device_debugfs_reset_write, "%llu\n"); /** * debugfs_protected_debug_mode_read - "protected_debug_mode" debugfs read @@ -4692,7 +4734,7 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) kbdev->mali_debugfs_directory = debugfs_create_dir(kbdev->devname, NULL); - if (!kbdev->mali_debugfs_directory) { + if (IS_ERR_OR_NULL(kbdev->mali_debugfs_directory)) { dev_err(kbdev->dev, "Couldn't create mali debugfs directory: %s\n", kbdev->devname); @@ -4702,7 +4744,7 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) kbdev->debugfs_ctx_directory = debugfs_create_dir("ctx", kbdev->mali_debugfs_directory); - if (!kbdev->debugfs_ctx_directory) { + if (IS_ERR_OR_NULL(kbdev->debugfs_ctx_directory)) { dev_err(kbdev->dev, "Couldn't create mali debugfs ctx directory\n"); err = -ENOMEM; goto out; @@ -4710,7 +4752,7 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) kbdev->debugfs_instr_directory = debugfs_create_dir("instrumentation", kbdev->mali_debugfs_directory); - if (!kbdev->debugfs_instr_directory) { + if (IS_ERR_OR_NULL(kbdev->debugfs_instr_directory)) { dev_err(kbdev->dev, "Couldn't create mali debugfs instrumentation directory\n"); err = -ENOMEM; goto out; @@ -4718,7 +4760,7 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) debugfs_ctx_defaults_directory = debugfs_create_dir("defaults", kbdev->debugfs_ctx_directory); - if (!debugfs_ctx_defaults_directory) { + if (IS_ERR_OR_NULL(debugfs_ctx_defaults_directory)) { dev_err(kbdev->dev, "Couldn't create mali debugfs ctx defaults directory\n"); err = -ENOMEM; goto out; @@ -4735,6 +4777,8 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) #ifdef CONFIG_MALI_PRFCNT_SET_SELECT_VIA_DEBUG_FS kbase_instr_backend_debugfs_init(kbdev); #endif + kbase_pbha_debugfs_init(kbdev); + /* fops_* variables created by invocations of macro * MAKE_QUIRK_ACCESSORS() above. 
*/ @@ -5293,11 +5337,19 @@ static int kbase_device_resume(struct device *dev) static int kbase_device_runtime_suspend(struct device *dev) { struct kbase_device *kbdev = to_kbase_device(dev); + int ret = 0; if (!kbdev) return -ENODEV; dev_dbg(dev, "Callback %s\n", __func__); + KBASE_KTRACE_ADD(kbdev, PM_RUNTIME_SUSPEND_CALLBACK, NULL, 0); + +#if MALI_USE_CSF + ret = kbase_pm_handle_runtime_suspend(kbdev); + if (ret) + return ret; +#endif #ifdef CONFIG_MALI_MIDGARD_DVFS kbase_pm_metrics_stop(kbdev); @@ -5312,7 +5364,7 @@ static int kbase_device_runtime_suspend(struct device *dev) kbdev->pm.backend.callback_power_runtime_off(kbdev); dev_dbg(dev, "runtime suspend\n"); } - return 0; + return ret; } #endif /* KBASE_PM_RUNTIME */ @@ -5336,6 +5388,7 @@ static int kbase_device_runtime_resume(struct device *dev) return -ENODEV; dev_dbg(dev, "Callback %s\n", __func__); + KBASE_KTRACE_ADD(kbdev, PM_RUNTIME_RESUME_CALLBACK, NULL, 0); if (kbdev->pm.backend.callback_power_runtime_on) { ret = kbdev->pm.backend.callback_power_runtime_on(kbdev); dev_dbg(dev, "runtime resume\n"); diff --git a/mali_kbase/mali_kbase_defs.h b/mali_kbase/mali_kbase_defs.h index 146695c..5b1fdd3 100644 --- a/mali_kbase/mali_kbase_defs.h +++ b/mali_kbase/mali_kbase_defs.h @@ -71,10 +71,6 @@ #include <linux/regulator/consumer.h> #include <linux/memory_group_manager.h> -#if defined(CONFIG_PM_RUNTIME) || defined(CONFIG_PM) -#define KBASE_PM_RUNTIME 1 -#endif - #include "debug/mali_kbase_debug_ktrace_defs.h" /** Number of milliseconds before we time out on a GPU soft/hard reset */ @@ -111,12 +107,12 @@ /** * Maximum size in bytes of a MMU lock region, as a logarithm */ -#define KBASE_LOCK_REGION_MAX_SIZE_LOG2 (64) +#define KBASE_LOCK_REGION_MAX_SIZE_LOG2 (48) /* 256 TB */ /** * Minimum size in bytes of a MMU lock region, as a logarithm */ -#define KBASE_LOCK_REGION_MIN_SIZE_LOG2 (15) +#define KBASE_LOCK_REGION_MIN_SIZE_LOG2 (15) /* 32 kB */ /** * Maximum number of GPU memory region zones @@ -269,6 +265,21 @@ struct kbase_mmu_table { struct kbase_context *kctx; }; +/** + * struct kbase_reg_zone - Information about GPU memory region zones + * @base_pfn: Page Frame Number in GPU virtual address space for the start of + * the Zone + * @va_size_pages: Size of the Zone in pages + * + * Track information about a zone KBASE_REG_ZONE() and related macros. + * In future, this could also store the &rb_root that are currently in + * &kbase_context and &kbase_csf_device. + */ +struct kbase_reg_zone { + u64 base_pfn; + u64 va_size_pages; +}; + #if MALI_USE_CSF #include "csf/mali_kbase_csf_defs.h" #else @@ -363,6 +374,12 @@ struct kbase_clk_rate_trace_manager { * that some code paths keep shaders/the tiler powered whilst this is 0. * Use kbase_pm_is_active() instead to check for such cases. * @suspending: Flag indicating suspending/suspended + * @runtime_active: Flag to track if the GPU is in runtime suspended or active + * state. This ensures that runtime_put and runtime_get + * functions are called in pairs. For example if runtime_get + * has already been called from the power_on callback, then + * the call to it from runtime_gpu_active callback can be + * skipped. * @gpu_lost: Flag indicating gpu lost * This structure contains data for the power management framework. There * is one instance of this structure per device in the system. 
@@ -388,6 +405,9 @@ struct kbase_pm_device_data { struct mutex lock; int active_count; bool suspending; +#if MALI_USE_CSF + bool runtime_active; +#endif #ifdef CONFIG_MALI_ARBITER_SUPPORT atomic_t gpu_lost; #endif /* CONFIG_MALI_ARBITER_SUPPORT */ @@ -529,8 +549,11 @@ struct kbase_devfreq_opp { * @entry_set_ate: program the pte to be a valid address translation entry to * encode the physical address of the actual page being mapped. * @entry_set_pte: program the pte to be a valid entry to encode the physical - * address of the next lower level page table. + * address of the next lower level page table and also update + * the number of valid entries. * @entry_invalidate: clear out or invalidate the pte. + * @get_num_valid_entries: returns the number of valid entries for a specific pgd. + * @set_num_valid_entries: sets the number of valid entries for a specific pgd * @flags: bitmask of MMU mode flags. Refer to KBASE_MMU_MODE_ constants. */ struct kbase_mmu_mode { @@ -545,8 +568,11 @@ struct kbase_mmu_mode { int (*pte_is_valid)(u64 pte, int level); void (*entry_set_ate)(u64 *entry, struct tagged_addr phy, unsigned long flags, int level); - void (*entry_set_pte)(u64 *entry, phys_addr_t phy); + void (*entry_set_pte)(u64 *pgd, u64 vpfn, phys_addr_t phy); void (*entry_invalidate)(u64 *entry); + unsigned int (*get_num_valid_entries)(u64 *pgd); + void (*set_num_valid_entries)(u64 *pgd, + unsigned int num_of_valid_entries); unsigned long flags; }; @@ -722,6 +748,7 @@ struct kbase_process { * kbase_hwcnt_context_enable() with @hwcnt_gpu_ctx. * @hwcnt_gpu_virt: Virtualizer for GPU hardware counters. * @vinstr_ctx: vinstr context created per device. + * @kinstr_prfcnt_ctx: kinstr_prfcnt context created per device. * @timeline_flags: Bitmask defining which sets of timeline tracepoints * are enabled. If zero, there is no timeline client and * therefore timeline is disabled. @@ -738,6 +765,8 @@ struct kbase_process { * @reset_timeout_ms: Number of milliseconds to wait for the soft stop to * complete for the GPU jobs before proceeding with the * GPU reset. + * @lowest_gpu_freq_khz: Lowest frequency in KHz that the GPU can run at. Used + * to calculate suitable timeouts for wait operations. * @cache_clean_in_progress: Set when a cache clean has been started, and * cleared when it has finished. This prevents multiple * cache cleans being done simultaneously. @@ -752,8 +781,6 @@ struct kbase_process { * including any contexts that might be created for * hardware counters. * @kctx_list_lock: Lock protecting concurrent accesses to @kctx_list. - * @group_max_uid_in_devices: Max value of any queue group UID in any kernel - * context in the kbase device. * @devfreq_profile: Describes devfreq profile for the Mali GPU device, passed * to devfreq_add_device() to add devfreq feature to Mali * GPU device. @@ -891,6 +918,10 @@ struct kbase_process { * @l2_hash_override: Used to set L2 cache hash via device tree blob * @l2_hash_values_override: true if @l2_hash_values is valid. * @l2_hash_values: Used to set L2 asn_hash via device tree blob + * @sysc_alloc: Array containing values to be programmed into + * SYSC_ALLOC[0..7] GPU registers on L2 cache + * power down. These come from either DTB or + * via DebugFS (if it is available in kernel). * @process_root: rb_tree root node for maintaining a rb_tree of * kbase_process based on key tgid(thread group ID). 
* @dma_buf_root: rb_tree root node for maintaining a rb_tree of @@ -993,6 +1024,7 @@ struct kbase_device { struct kbase_hwcnt_context *hwcnt_gpu_ctx; struct kbase_hwcnt_virtualizer *hwcnt_gpu_virt; struct kbase_vinstr_context *vinstr_ctx; + struct kbase_kinstr_prfcnt_context *kinstr_prfcnt_ctx; atomic_t timeline_flags; struct kbase_timeline *timeline; @@ -1002,6 +1034,8 @@ struct kbase_device { #endif u32 reset_timeout_ms; + u64 lowest_gpu_freq_khz; + bool cache_clean_in_progress; bool cache_clean_queued; wait_queue_head_t cache_clean_wait; @@ -1010,7 +1044,6 @@ struct kbase_device { struct list_head kctx_list; struct mutex kctx_list_lock; - atomic_t group_max_uid_in_devices; #ifdef CONFIG_MALI_DEVFREQ struct devfreq_dev_profile devfreq_profile; @@ -1129,6 +1162,8 @@ struct kbase_device { bool l2_hash_values_override; u32 l2_hash_values[ASN_HASH_COUNT]; + u32 sysc_alloc[SYSC_ALLOC_COUNT]; + struct mutex fw_load_lock; #if MALI_USE_CSF /* CSF object for the GPU device. */ @@ -1396,21 +1431,6 @@ struct kbase_sub_alloc { }; /** - * struct kbase_reg_zone - Information about GPU memory region zones - * @base_pfn: Page Frame Number in GPU virtual address space for the start of - * the Zone - * @va_size_pages: Size of the Zone in pages - * - * Track information about a zone KBASE_REG_ZONE() and related macros. - * In future, this could also store the &rb_root that are currently in - * &kbase_context - */ -struct kbase_reg_zone { - u64 base_pfn; - u64 va_size_pages; -}; - -/** * struct kbase_context - Kernel base context * * @filp: Pointer to the struct file corresponding to device file @@ -1561,17 +1581,10 @@ struct kbase_reg_zone { * of RB-tree holding currently runnable atoms on the job slot * and the head item of the linked list of atoms blocked on * cross-slot dependencies. - * @atoms_pulled: Total number of atoms currently pulled from the context. - * @atoms_pulled_slot: Per slot count of the number of atoms currently pulled - * from the context. - * @atoms_pulled_slot_pri: Per slot & priority count of the number of atoms currently - * pulled from the context. hwaccess_lock shall be held when - * accessing it. - * @blocked_js: Indicates if the context is blocked from submitting atoms - * on a slot at a given priority. This is set to true, when - * the atom corresponding to context is soft/hard stopped or - * removed from the HEAD_NEXT register in response to - * soft/hard stop. + * @slot_tracking: Tracking and control of this context's use of all job + * slots + * @atoms_pulled_all_slots: Total number of atoms currently pulled from the + * context, across all slots. * @slots_pullable: Bitmask of slots, indicating the slots for which the * context has pullable atoms in the runnable tree. * @work: Work structure used for deferred ASID assignment. 
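Editor's note: the new @lowest_gpu_freq_khz field documented above works together with the cycle-based timeout macros added to mali_kbase_config_defaults.h earlier in this patch and with the kbase_get_timeout_ms() interface declared further down. A sketch of the scaling arithmetic only, under the assumption that DEFAULT_REF_TIMEOUT_FREQ_KHZ from this patch is in scope; the helper name is made up and this is not the driver's literal implementation:

        #include <linux/math64.h>

        /* A frequency in kHz is exactly the number of GPU cycles per
         * millisecond, so dividing a cycle budget by the lowest possible
         * frequency yields a timeout that can only err on the long side.
         */
        static unsigned int example_timeout_cycles_to_ms(u64 timeout_cycles,
                                                         u32 lowest_freq_khz)
        {
                u32 freq_khz = lowest_freq_khz;

                if (!freq_khz)
                        freq_khz = DEFAULT_REF_TIMEOUT_FREQ_KHZ; /* 100 MHz fallback */

                return (unsigned int)div_u64(timeout_cycles, freq_khz);
        }

With this arithmetic, CSF_FIRMWARE_TIMEOUT_CYCLES (300000000) at the 100000 kHz reference frequency gives back the 3000 ms Android requirement quoted in mali_kbase_config_defaults.h.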
@@ -1717,17 +1730,14 @@ struct kbase_context { struct kbase_jd_context jctx; struct jsctx_queue jsctx_queue [KBASE_JS_ATOM_SCHED_PRIO_COUNT][BASE_JM_MAX_NR_SLOTS]; + struct kbase_jsctx_slot_tracking slot_tracking[BASE_JM_MAX_NR_SLOTS]; + atomic_t atoms_pulled_all_slots; struct list_head completed_jobs; atomic_t work_count; struct timer_list soft_job_timeout; - atomic_t atoms_pulled; - atomic_t atoms_pulled_slot[BASE_JM_MAX_NR_SLOTS]; - int atoms_pulled_slot_pri[BASE_JM_MAX_NR_SLOTS][ - KBASE_JS_ATOM_SCHED_PRIO_COUNT]; int priority; - bool blocked_js[BASE_JM_MAX_NR_SLOTS][KBASE_JS_ATOM_SCHED_PRIO_COUNT]; s16 atoms_count[KBASE_JS_ATOM_SCHED_PRIO_COUNT]; u32 slots_pullable; u32 age_count; @@ -1888,6 +1898,13 @@ enum kbase_share_attr_bits { }; /** + * enum kbase_timeout_selector - The choice of which timeout to get scaled + * using current GPU frequency. + * @CSF_FIRMWARE_TIMEOUT: Response timeout from CSF firmware. + */ +enum kbase_timeout_selector { CSF_FIRMWARE_TIMEOUT }; + +/** * kbase_device_is_cpu_coherent - Returns if the device is CPU coherent. * @kbdev: kbase device * diff --git a/mali_kbase/mali_kbase_dma_fence.c b/mali_kbase/mali_kbase_dma_fence.c index 69ff8cc..bf2d9cc 100644 --- a/mali_kbase/mali_kbase_dma_fence.c +++ b/mali_kbase/mali_kbase_dma_fence.c @@ -249,8 +249,10 @@ kbase_dma_fence_add_reservation_callback(struct kbase_jd_atom *katom, #if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) err = reservation_object_get_fences_rcu( -#else +#elif (KERNEL_VERSION(5, 14, 0) > LINUX_VERSION_CODE) err = dma_resv_get_fences_rcu( +#else + err = dma_resv_get_fences( #endif resv, &excl_fence, diff --git a/mali_kbase/mali_kbase_dummy_job_wa.c b/mali_kbase/mali_kbase_dummy_job_wa.c index 1e91ba0..bdc5d6d 100644 --- a/mali_kbase/mali_kbase_dummy_job_wa.c +++ b/mali_kbase/mali_kbase_dummy_job_wa.c @@ -281,6 +281,11 @@ int kbase_dummy_job_wa_load(struct kbase_device *kbdev) int err; struct kbase_context *kctx; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + lockdep_assert_held(&kbdev->fw_load_lock); if (!wa_blob_load_needed(kbdev)) @@ -375,8 +380,8 @@ int kbase_dummy_job_wa_load(struct kbase_device *kbdev) nr_pages = PFN_UP(blob->size); flags = blob->map_flags | BASE_MEM_FLAG_MAP_FIXED; - va_region = kbase_mem_alloc(kctx, nr_pages, nr_pages, - 0, &flags, &gpu_va); + va_region = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, + &gpu_va, mmu_sync_info); if (!va_region) { dev_err(kbdev->dev, "Failed to allocate for blob\n"); diff --git a/mali_kbase/mali_kbase_gpuprops.c b/mali_kbase/mali_kbase_gpuprops.c index e4d52c9..967c08e 100644 --- a/mali_kbase/mali_kbase_gpuprops.c +++ b/mali_kbase/mali_kbase_gpuprops.c @@ -661,6 +661,19 @@ int kbase_gpuprops_update_l2_features(struct kbase_device *kbdev) dev_info(kbdev->dev, "Reflected L2_CONFIG is 0x%08x\n", regdump.l2_config); + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_ASN_HASH)) { + int idx; + const bool asn_he = regdump.l2_config & + L2_CONFIG_ASN_HASH_ENABLE_MASK; + if (!asn_he && kbdev->l2_hash_values_override) + dev_err(kbdev->dev, + "Failed to use requested ASN_HASH, fallback to default"); + for (idx = 0; idx < ASN_HASH_COUNT; idx++) + dev_info(kbdev->dev, + "%s ASN_HASH[%d] is [0x%08x]\n", + asn_he ? 
"Overridden" : "Default", idx, + regdump.l2_asn_hash[idx]); + } /* Update gpuprops with reflected L2_FEATURES */ gpu_props->raw_props.l2_features = regdump.l2_features; diff --git a/mali_kbase/mali_kbase_gpuprops_types.h b/mali_kbase/mali_kbase_gpuprops_types.h index 02705a0..67a4d7d 100644 --- a/mali_kbase/mali_kbase_gpuprops_types.h +++ b/mali_kbase/mali_kbase_gpuprops_types.h @@ -35,6 +35,7 @@ struct kbase_gpuprops_regdump { u32 gpu_id; u32 l2_features; u32 l2_config; + u32 l2_asn_hash[ASN_HASH_COUNT]; u32 core_features; u32 tiler_features; u32 mem_features; diff --git a/mali_kbase/mali_kbase_hw.c b/mali_kbase/mali_kbase_hw.c index 7ad583c..183fd18 100644 --- a/mali_kbase/mali_kbase_hw.c +++ b/mali_kbase/mali_kbase_hw.c @@ -81,6 +81,10 @@ void kbase_hw_set_features_mask(struct kbase_device *kbdev) case GPU_ID2_PRODUCT_TVAX: features = base_hw_features_tVAx; break; + case GPU_ID2_PRODUCT_TTUX: + case GPU_ID2_PRODUCT_LTUX: + features = base_hw_features_tTUx; + break; default: features = base_hw_features_generic; break; @@ -225,6 +229,15 @@ static const enum base_hw_issue *kbase_hw_get_issues_for_new_id( { GPU_ID2_PRODUCT_TVAX, { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tVAx_r0p0 }, { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TTUX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tTUx_r0p0 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_LTUX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tTUx_r0p0 }, + { U32_MAX, NULL } } }, + }; u32 gpu_id = kbdev->gpu_props.props.raw_props.gpu_id; @@ -380,6 +393,11 @@ int kbase_hw_set_issues_mask(struct kbase_device *kbdev) case GPU_ID2_PRODUCT_TVAX: issues = base_hw_issues_model_tVAx; break; + case GPU_ID2_PRODUCT_TTUX: + case GPU_ID2_PRODUCT_LTUX: + issues = base_hw_issues_model_tTUx; + break; + default: dev_err(kbdev->dev, "Unknown GPU ID %x", gpu_id); diff --git a/mali_kbase/mali_kbase_hwaccess_time.h b/mali_kbase/mali_kbase_hwaccess_time.h index 8a4ece4..27e2cb7 100644 --- a/mali_kbase/mali_kbase_hwaccess_time.h +++ b/mali_kbase/mali_kbase_hwaccess_time.h @@ -48,3 +48,25 @@ void kbase_backend_get_gpu_time_norequest(struct kbase_device *kbdev, struct timespec64 *ts); #endif /* _KBASE_BACKEND_TIME_H_ */ + +/** + * kbase_get_timeout_ms - Choose a timeout value to get a timeout scaled + * GPU frequency, using a choice from + * kbase_timeout_selector. + * + * @kbdev: KBase device pointer. + * @selector: Value from kbase_scaled_timeout_selector enum. + * + * Return: Timeout in milliseconds, as an unsigned integer. + */ +unsigned int kbase_get_timeout_ms(struct kbase_device *kbdev, + enum kbase_timeout_selector selector); + +/** + * kbase_backend_get_cycle_cnt - Reads the GPU cycle counter + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * + * Return: Snapshot of the GPU cycle count register. + */ +u64 kbase_backend_get_cycle_cnt(struct kbase_device *kbdev); diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf.c b/mali_kbase/mali_kbase_hwcnt_backend_csf.c index 58b5e72..7ba1671 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_csf.c +++ b/mali_kbase/mali_kbase_hwcnt_backend_csf.c @@ -157,19 +157,20 @@ struct kbase_hwcnt_backend_csf_info { * @shader_cnt: Shader Core block count. * @block_cnt: Total block count (sum of all other block counts). * @shader_avail_mask: Bitmap of all shader cores in the system. - * @offset_enable_mask: Offset of enable mask in the block. + * @enable_mask_offset: Offset in array elements of enable mask in each block + * starting from the beginning of block. 
* @headers_per_block: Header size per block. * @counters_per_block: Counters size per block. * @values_per_block: Total size per block. */ struct kbase_hwcnt_csf_physical_layout { - size_t fe_cnt; - size_t tiler_cnt; - size_t mmu_l2_cnt; - size_t shader_cnt; - size_t block_cnt; + u8 fe_cnt; + u8 tiler_cnt; + u8 mmu_l2_cnt; + u8 shader_cnt; + u8 block_cnt; u64 shader_avail_mask; - size_t offset_enable_mask; + size_t enable_mask_offset; size_t headers_per_block; size_t counters_per_block; size_t values_per_block; @@ -184,11 +185,13 @@ struct kbase_hwcnt_csf_physical_layout { * to accumulate up to. * @enable_state_waitq: Wait queue object used to notify the enable * changing flag is done. - * @to_user_buf: HWC sample buffer for client user. + * @to_user_buf: HWC sample buffer for client user, size + * metadata.dump_buf_bytes. * @accum_buf: HWC sample buffer used as an internal - * accumulator. + * accumulator, size metadata.dump_buf_bytes. * @old_sample_buf: HWC sample buffer to save the previous values - * for delta calculation. + * for delta calculation, size + * prfcnt_info.dump_bytes. * @ring_buf: Opaque pointer for ring buffer object. * @ring_buf_cpu_base: CPU base address of the allocated ring buffer. * @clk_enable_map: The enable map specifying enabled clock domains. @@ -213,8 +216,8 @@ struct kbase_hwcnt_backend_csf { enum kbase_hwcnt_backend_csf_enable_state enable_state; u32 insert_index_to_accumulate; wait_queue_head_t enable_state_waitq; - u32 *to_user_buf; - u32 *accum_buf; + u64 *to_user_buf; + u64 *accum_buf; u32 *old_sample_buf; struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf; void *ring_buf_cpu_base; @@ -333,34 +336,40 @@ static void kbasep_hwcnt_backend_csf_init_layout( const struct kbase_hwcnt_backend_csf_if_prfcnt_info *prfcnt_info, struct kbase_hwcnt_csf_physical_layout *phys_layout) { + u8 shader_core_cnt; + size_t values_per_block; + WARN_ON(!prfcnt_info); WARN_ON(!phys_layout); - phys_layout->fe_cnt = 1; - phys_layout->tiler_cnt = 1; - phys_layout->mmu_l2_cnt = prfcnt_info->l2_count; - phys_layout->shader_cnt = fls64(prfcnt_info->core_mask); - phys_layout->block_cnt = phys_layout->fe_cnt + phys_layout->tiler_cnt + - phys_layout->mmu_l2_cnt + - phys_layout->shader_cnt; - - phys_layout->shader_avail_mask = prfcnt_info->core_mask; - - phys_layout->headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; - phys_layout->values_per_block = - prfcnt_info->prfcnt_block_size / KBASE_HWCNT_VALUE_BYTES; - phys_layout->counters_per_block = - phys_layout->values_per_block - phys_layout->headers_per_block; - phys_layout->offset_enable_mask = KBASE_HWCNT_V5_PRFCNT_EN_HEADER; + shader_core_cnt = fls64(prfcnt_info->core_mask); + values_per_block = + prfcnt_info->prfcnt_block_size / KBASE_HWCNT_VALUE_HW_BYTES; + + *phys_layout = (struct kbase_hwcnt_csf_physical_layout){ + .fe_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT, + .tiler_cnt = KBASE_HWCNT_V5_TILER_BLOCK_COUNT, + .mmu_l2_cnt = prfcnt_info->l2_count, + .shader_cnt = shader_core_cnt, + .block_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT + + KBASE_HWCNT_V5_TILER_BLOCK_COUNT + + prfcnt_info->l2_count + shader_core_cnt, + .shader_avail_mask = prfcnt_info->core_mask, + .headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK, + .values_per_block = values_per_block, + .counters_per_block = + values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK, + .enable_mask_offset = KBASE_HWCNT_V5_PRFCNT_EN_HEADER, + }; } static void kbasep_hwcnt_backend_csf_reset_internal_buffers( struct kbase_hwcnt_backend_csf *backend_csf) { - memset(backend_csf->to_user_buf, 0, - 
backend_csf->info->prfcnt_info.dump_bytes); - memset(backend_csf->accum_buf, 0, - backend_csf->info->prfcnt_info.dump_bytes); + size_t user_buf_bytes = backend_csf->info->metadata->dump_buf_bytes; + + memset(backend_csf->to_user_buf, 0, user_buf_bytes); + memset(backend_csf->accum_buf, 0, user_buf_bytes); memset(backend_csf->old_sample_buf, 0, backend_csf->info->prfcnt_info.dump_bytes); } @@ -376,7 +385,7 @@ static void kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header( for (block_idx = 0; block_idx < phys_layout->block_cnt; block_idx++) { block_buf = sample + block_idx * phys_layout->values_per_block; - block_buf[phys_layout->offset_enable_mask] = 0; + block_buf[phys_layout->enable_mask_offset] = 0; } } @@ -400,33 +409,35 @@ static void kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header( static void kbasep_hwcnt_backend_csf_update_user_sample( struct kbase_hwcnt_backend_csf *backend_csf) { + size_t user_buf_bytes = backend_csf->info->metadata->dump_buf_bytes; + /* Copy the data into the sample and wait for the user to get it. */ memcpy(backend_csf->to_user_buf, backend_csf->accum_buf, - backend_csf->info->prfcnt_info.dump_bytes); + user_buf_bytes); /* After copied data into user sample, clear the accumulator values to * prepare for the next accumulator, such as the next request or * threshold. */ - memset(backend_csf->accum_buf, 0, - backend_csf->info->prfcnt_info.dump_bytes); + memset(backend_csf->accum_buf, 0, user_buf_bytes); } static void kbasep_hwcnt_backend_csf_accumulate_sample( const struct kbase_hwcnt_csf_physical_layout *phys_layout, - size_t dump_bytes, u32 *accum_buf, const u32 *old_sample_buf, + size_t dump_bytes, u64 *accum_buf, const u32 *old_sample_buf, const u32 *new_sample_buf, bool clearing_samples) { - size_t block_idx, ctr_idx; + size_t block_idx; const u32 *old_block = old_sample_buf; const u32 *new_block = new_sample_buf; - u32 *acc_block = accum_buf; + u64 *acc_block = accum_buf; + const size_t values_per_block = phys_layout->values_per_block; for (block_idx = 0; block_idx < phys_layout->block_cnt; block_idx++) { const u32 old_enable_mask = - old_block[phys_layout->offset_enable_mask]; + old_block[phys_layout->enable_mask_offset]; const u32 new_enable_mask = - new_block[phys_layout->offset_enable_mask]; + new_block[phys_layout->enable_mask_offset]; if (new_enable_mask == 0) { /* Hardware block was unavailable or we didn't turn on @@ -436,11 +447,14 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( /* Hardware block was available and it had some counters * enabled. We need to update the accumulation buffer. */ + size_t ctr_idx; /* Unconditionally copy the headers. 
*/ - memcpy(acc_block, new_block, - phys_layout->headers_per_block * - KBASE_HWCNT_VALUE_BYTES); + for (ctr_idx = 0; + ctr_idx < phys_layout->headers_per_block; + ctr_idx++) { + acc_block[ctr_idx] = new_block[ctr_idx]; + } /* Accumulate counter samples * @@ -470,8 +484,7 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( for (ctr_idx = phys_layout ->headers_per_block; - ctr_idx < - phys_layout->values_per_block; + ctr_idx < values_per_block; ctr_idx++) { acc_block[ctr_idx] += new_block[ctr_idx]; @@ -484,8 +497,7 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( for (ctr_idx = phys_layout ->headers_per_block; - ctr_idx < - phys_layout->values_per_block; + ctr_idx < values_per_block; ctr_idx++) { acc_block[ctr_idx] += new_block[ctr_idx] - @@ -494,23 +506,23 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( } } else { for (ctr_idx = phys_layout->headers_per_block; - ctr_idx < phys_layout->values_per_block; - ctr_idx++) { + ctr_idx < values_per_block; ctr_idx++) { acc_block[ctr_idx] += new_block[ctr_idx]; } } } - old_block += phys_layout->values_per_block; - new_block += phys_layout->values_per_block; - acc_block += phys_layout->values_per_block; + old_block += values_per_block; + new_block += values_per_block; + acc_block += values_per_block; } WARN_ON(old_block != - old_sample_buf + dump_bytes / KBASE_HWCNT_VALUE_BYTES); + old_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); WARN_ON(new_block != - new_sample_buf + dump_bytes / KBASE_HWCNT_VALUE_BYTES); - WARN_ON(acc_block != accum_buf + dump_bytes / KBASE_HWCNT_VALUE_BYTES); + new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); + WARN_ON(acc_block != + accum_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); (void)dump_bytes; } @@ -1218,7 +1230,7 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, &backend_csf->phys_layout); backend_csf->accum_buf = - kzalloc(csf_info->prfcnt_info.dump_bytes, GFP_KERNEL); + kzalloc(csf_info->metadata->dump_buf_bytes, GFP_KERNEL); if (!backend_csf->accum_buf) goto err_alloc_acc_buf; @@ -1228,7 +1240,7 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, goto err_alloc_pre_sample_buf; backend_csf->to_user_buf = - kzalloc(csf_info->prfcnt_info.dump_bytes, GFP_KERNEL); + kzalloc(csf_info->metadata->dump_buf_bytes, GFP_KERNEL); if (!backend_csf->to_user_buf) goto err_alloc_user_sample_buf; @@ -1237,6 +1249,7 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, &backend_csf->ring_buf_cpu_base, &backend_csf->ring_buf); if (errcode) goto err_ring_buf_alloc; + errcode = -ENOMEM; /* Zero all performance enable header to prepare for first enable. */ kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header(backend_csf); @@ -1787,17 +1800,17 @@ int kbase_hwcnt_backend_csf_metadata_init( gpu_info.clk_cnt = csf_info->prfcnt_info.clk_cnt; gpu_info.prfcnt_values_per_block = csf_info->prfcnt_info.prfcnt_block_size / - KBASE_HWCNT_VALUE_BYTES; + KBASE_HWCNT_VALUE_HW_BYTES; errcode = kbase_hwcnt_csf_metadata_create( &gpu_info, csf_info->counter_set, &csf_info->metadata); if (errcode) return errcode; /* - * Dump abstraction size should be exactly the same size and layout as - * the physical dump size, for backwards compatibility. + * Dump abstraction size should be exactly twice the size and layout as + * the physical dump size since 64-bit per value used in metadata. 
*/ - WARN_ON(csf_info->prfcnt_info.dump_bytes != + WARN_ON(csf_info->prfcnt_info.dump_bytes * 2 != csf_info->metadata->dump_buf_bytes); return 0; diff --git a/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c b/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c index 78a8dc0..124224d 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c +++ b/mali_kbase/mali_kbase_hwcnt_backend_csf_if_fw.c @@ -223,7 +223,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( u32 prfcnt_hw_size = 0; u32 prfcnt_fw_size = 0; u32 prfcnt_block_size = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK * - KBASE_HWCNT_VALUE_BYTES; + KBASE_HWCNT_VALUE_HW_BYTES; WARN_ON(!ctx); WARN_ON(!prfcnt_info); @@ -235,6 +235,16 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( prfcnt_fw_size = (prfcnt_size >> 16) << 8; fw_ctx->buf_bytes = prfcnt_hw_size + prfcnt_fw_size; + /* Read the block size if the GPU has the register PRFCNT_FEATURES + * which was introduced in architecture version 11.x.7. + */ + if ((kbdev->gpu_props.props.raw_props.gpu_id & GPU_ID2_PRODUCT_MODEL) >= + GPU_ID2_PRODUCT_TTUX) { + prfcnt_block_size = + PRFCNT_FEATURES_COUNTER_BLOCK_SIZE_GET(kbase_reg_read( + kbdev, GPU_CONTROL_REG(PRFCNT_FEATURES))) + << 8; + } prfcnt_info->dump_bytes = fw_ctx->buf_bytes; prfcnt_info->prfcnt_block_size = prfcnt_block_size; @@ -246,7 +256,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( prfcnt_info->clearing_samples = true; /* Block size must be multiple of counter size. */ - WARN_ON((prfcnt_info->prfcnt_block_size % KBASE_HWCNT_VALUE_BYTES) != + WARN_ON((prfcnt_info->prfcnt_block_size % KBASE_HWCNT_VALUE_HW_BYTES) != 0); /* Total size must be multiple of block size. */ WARN_ON((prfcnt_info->dump_bytes % prfcnt_info->prfcnt_block_size) != @@ -274,6 +284,11 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc( struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + WARN_ON(!ctx); WARN_ON(!cpu_dump_base); WARN_ON(!out_ring_buf); @@ -322,7 +337,8 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc( /* Update MMU table */ ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, gpu_va_base >> PAGE_SHIFT, phys, num_pages, - flags, MCU_AS_NR, KBASE_MEM_GROUP_CSF_FW); + flags, MCU_AS_NR, KBASE_MEM_GROUP_CSF_FW, + mmu_sync_info); if (ret) goto mmu_insert_failed; diff --git a/mali_kbase/mali_kbase_hwcnt_backend_jm.c b/mali_kbase/mali_kbase_hwcnt_backend_jm.c index 64001b1..56bb1b6 100644 --- a/mali_kbase/mali_kbase_hwcnt_backend_jm.c +++ b/mali_kbase/mali_kbase_hwcnt_backend_jm.c @@ -35,17 +35,47 @@ /** * struct kbase_hwcnt_backend_jm_info - Information used to create an instance * of a JM hardware counter backend. - * @kbdev: KBase device. - * @counter_set: The performance counter set to use. - * @metadata: Hardware counter metadata. - * @dump_bytes: Bytes of GPU memory required to perform a - * hardware counter dump. + * @kbdev: KBase device. + * @counter_set: The performance counter set to use. + * @metadata: Hardware counter metadata. + * @dump_bytes: Bytes of GPU memory required to perform a + * hardware counter dump. + * @hwcnt_gpu_info: Hardware counter block information. 
*/ struct kbase_hwcnt_backend_jm_info { struct kbase_device *kbdev; enum kbase_hwcnt_set counter_set; const struct kbase_hwcnt_metadata *metadata; size_t dump_bytes; + struct kbase_hwcnt_gpu_info hwcnt_gpu_info; +}; + +/** + * struct kbase_hwcnt_jm_physical_layout - HWC sample memory physical layout + * information. + * @fe_cnt: Front end block count. + * @tiler_cnt: Tiler block count. + * @mmu_l2_cnt: Memory system(MMU and L2 cache) block count. + * @shader_cnt: Shader Core block count. + * @block_cnt: Total block count (sum of all other block counts). + * @shader_avail_mask: Bitmap of all shader cores in the system. + * @enable_mask_offset: Offset in array elements of enable mask in each block + * starting from the beginning of block. + * @headers_per_block: Header size per block. + * @counters_per_block: Counters size per block. + * @values_per_block: Total size per block. + */ +struct kbase_hwcnt_jm_physical_layout { + u8 fe_cnt; + u8 tiler_cnt; + u8 mmu_l2_cnt; + u8 shader_cnt; + u8 block_cnt; + u64 shader_avail_mask; + size_t enable_mask_offset; + size_t headers_per_block; + size_t counters_per_block; + size_t values_per_block; }; /** @@ -56,11 +86,13 @@ struct kbase_hwcnt_backend_jm_info { * @gpu_dump_va: GPU hardware counter dump buffer virtual address. * @cpu_dump_va: CPU mapping of gpu_dump_va. * @vmap: Dump buffer vmap. + * @to_user_buf: HWC sample buffer for client user, size + * metadata.dump_buf_bytes. * @enabled: True if dumping has been enabled, else false. * @pm_core_mask: PM state sync-ed shaders core mask for the enabled * dumping. - * @curr_config: Current allocated hardware resources to correctly map the src - * raw dump buffer to the dst dump buffer. + * @curr_config: Current allocated hardware resources to correctly map the + * source raw dump buffer to the destination dump buffer. * @clk_enable_map: The enable map specifying enabled clock domains. * @cycle_count_elapsed: * Cycle count elapsed for a given sample period. @@ -71,6 +103,7 @@ struct kbase_hwcnt_backend_jm_info { * sample period. * @rate_listener: Clock rate listener callback state. * @ccswe_shader_cores: Shader cores cycle count software estimator. + * @phys_layout: Physical memory layout information of HWC sample buffer. 
*/ struct kbase_hwcnt_backend_jm { const struct kbase_hwcnt_backend_jm_info *info; @@ -78,6 +111,7 @@ struct kbase_hwcnt_backend_jm { u64 gpu_dump_va; void *cpu_dump_va; struct kbase_vmap_struct *vmap; + u64 *to_user_buf; bool enabled; u64 pm_core_mask; struct kbase_hwcnt_curr_config curr_config; @@ -86,6 +120,7 @@ struct kbase_hwcnt_backend_jm { u64 prev_cycle_count[BASE_MAX_NR_CLOCKS_REGULATORS]; struct kbase_clk_rate_listener rate_listener; struct kbase_ccswe ccswe_shader_cores; + struct kbase_hwcnt_jm_physical_layout phys_layout; }; /** @@ -127,6 +162,63 @@ kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev, return 0; } +static void kbasep_hwcnt_backend_jm_init_layout( + const struct kbase_hwcnt_gpu_info *gpu_info, + struct kbase_hwcnt_jm_physical_layout *phys_layout) +{ + u8 shader_core_cnt; + + WARN_ON(!gpu_info); + WARN_ON(!phys_layout); + + shader_core_cnt = fls64(gpu_info->core_mask); + + *phys_layout = (struct kbase_hwcnt_jm_physical_layout){ + .fe_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT, + .tiler_cnt = KBASE_HWCNT_V5_TILER_BLOCK_COUNT, + .mmu_l2_cnt = gpu_info->l2_count, + .shader_cnt = shader_core_cnt, + .block_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT + + KBASE_HWCNT_V5_TILER_BLOCK_COUNT + + gpu_info->l2_count + shader_core_cnt, + .shader_avail_mask = gpu_info->core_mask, + .headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK, + .values_per_block = gpu_info->prfcnt_values_per_block, + .counters_per_block = gpu_info->prfcnt_values_per_block - + KBASE_HWCNT_V5_HEADERS_PER_BLOCK, + .enable_mask_offset = KBASE_HWCNT_V5_PRFCNT_EN_HEADER, + }; +} + +static void kbasep_hwcnt_backend_jm_dump_sample( + const struct kbase_hwcnt_backend_jm *const backend_jm) +{ + size_t block_idx; + const u32 *new_sample_buf = backend_jm->cpu_dump_va; + const u32 *new_block = new_sample_buf; + u64 *dst_buf = backend_jm->to_user_buf; + u64 *dst_block = dst_buf; + const size_t values_per_block = + backend_jm->phys_layout.values_per_block; + const size_t dump_bytes = backend_jm->info->dump_bytes; + + for (block_idx = 0; block_idx < backend_jm->phys_layout.block_cnt; + block_idx++) { + size_t ctr_idx; + + for (ctr_idx = 0; ctr_idx < values_per_block; ctr_idx++) + dst_block[ctr_idx] = new_block[ctr_idx]; + + new_block += values_per_block; + dst_block += values_per_block; + } + + WARN_ON(new_block != + new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); + WARN_ON(dst_block != + dst_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); +} + /** * kbasep_hwcnt_backend_jm_on_freq_change() - On freq change callback * @@ -487,6 +579,9 @@ static int kbasep_hwcnt_backend_jm_dump_get( kbase_sync_mem_regions( backend_jm->kctx, backend_jm->vmap, KBASE_SYNC_TO_CPU); + /* Dump sample to the internal 64-bit user buffer. */ + kbasep_hwcnt_backend_jm_dump_sample(backend_jm); + kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) { if (!kbase_hwcnt_clk_enable_map_enabled( dst_enable_map->clk_enable_map, clk)) @@ -496,7 +591,7 @@ static int kbasep_hwcnt_backend_jm_dump_get( dst->clk_cnt_buf[clk] = backend_jm->cycle_count_elapsed[clk]; } - return kbase_hwcnt_jm_dump_get(dst, backend_jm->cpu_dump_va, + return kbase_hwcnt_jm_dump_get(dst, backend_jm->to_user_buf, dst_enable_map, backend_jm->pm_core_mask, &backend_jm->curr_config, accumulate); } @@ -519,6 +614,11 @@ static int kbasep_hwcnt_backend_jm_dump_alloc( u64 flags; u64 nr_pages; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + WARN_ON(!info); WARN_ON(!kctx); WARN_ON(!gpu_dump_va); @@ -531,7 +631,8 @@ static int kbasep_hwcnt_backend_jm_dump_alloc( nr_pages = PFN_UP(info->dump_bytes); - reg = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, gpu_dump_va); + reg = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, gpu_dump_va, + mmu_sync_info); if (!reg) return -ENOMEM; @@ -580,6 +681,8 @@ static void kbasep_hwcnt_backend_jm_destroy( kbase_destroy_context(kctx); } + kfree(backend->to_user_buf); + kfree(backend); } @@ -608,6 +711,8 @@ static int kbasep_hwcnt_backend_jm_create( goto alloc_error; backend->info = info; + kbasep_hwcnt_backend_jm_init_layout(&info->hwcnt_gpu_info, + &backend->phys_layout); backend->kctx = kbase_create_context(kbdev, true, BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED, 0, NULL); @@ -623,7 +728,12 @@ static int kbasep_hwcnt_backend_jm_create( backend->cpu_dump_va = kbase_phy_alloc_mapping_get(backend->kctx, backend->gpu_dump_va, &backend->vmap); - if (!backend->cpu_dump_va) + if (!backend->cpu_dump_va || !backend->vmap) + goto alloc_error; + + backend->to_user_buf = + kzalloc(info->metadata->dump_buf_bytes, GFP_KERNEL); + if (!backend->to_user_buf) goto alloc_error; kbase_ccswe_init(&backend->ccswe_shader_cores); @@ -710,19 +820,14 @@ static int kbasep_hwcnt_backend_jm_info_create( const struct kbase_hwcnt_backend_jm_info **out_info) { int errcode = -ENOMEM; - struct kbase_hwcnt_gpu_info hwcnt_gpu_info; struct kbase_hwcnt_backend_jm_info *info = NULL; WARN_ON(!kbdev); WARN_ON(!out_info); - errcode = kbasep_hwcnt_backend_jm_gpu_info_init(kbdev, &hwcnt_gpu_info); - if (errcode) - return errcode; - info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) - goto error; + return errcode; info->kbdev = kbdev; @@ -735,7 +840,12 @@ static int kbasep_hwcnt_backend_jm_info_create( info->counter_set = KBASE_HWCNT_SET_PRIMARY; #endif - errcode = kbase_hwcnt_jm_metadata_create(&hwcnt_gpu_info, + errcode = kbasep_hwcnt_backend_jm_gpu_info_init(kbdev, + &info->hwcnt_gpu_info); + if (errcode) + goto error; + + errcode = kbase_hwcnt_jm_metadata_create(&info->hwcnt_gpu_info, info->counter_set, &info->metadata, &info->dump_bytes); diff --git a/mali_kbase/mali_kbase_hwcnt_gpu.c b/mali_kbase/mali_kbase_hwcnt_gpu.c index 2975269..97a7511 100644 --- a/mali_kbase/mali_kbase_hwcnt_gpu.c +++ b/mali_kbase/mali_kbase_hwcnt_gpu.c @@ -223,7 +223,7 @@ kbasep_hwcnt_backend_jm_dump_bytes(const struct kbase_hwcnt_gpu_info *gpu_info) WARN_ON(!gpu_info); return (2 + gpu_info->l2_count + fls64(gpu_info->core_mask)) * - gpu_info->prfcnt_values_per_block * KBASE_HWCNT_VALUE_BYTES; + gpu_info->prfcnt_values_per_block * KBASE_HWCNT_VALUE_HW_BYTES; } int kbase_hwcnt_jm_metadata_create( @@ -253,10 +253,11 @@ int kbase_hwcnt_jm_metadata_create( return errcode; /* - * Dump abstraction size should be exactly the same size and layout as - * the physical dump size, for backwards compatibility. + * The physical dump size should be half of dump abstraction size in + * metadata since physical HW uses 32-bit per value but metadata + * specifies 64-bit per value. 
*/ - WARN_ON(dump_bytes != metadata->dump_buf_bytes); + WARN_ON(dump_bytes * 2 != metadata->dump_buf_bytes); *out_metadata = metadata; *out_dump_bytes = dump_bytes; @@ -302,127 +303,6 @@ void kbase_hwcnt_csf_metadata_destroy( kbase_hwcnt_metadata_destroy(metadata); } -int kbase_hwcnt_gpu_metadata_create_truncate_64( - const struct kbase_hwcnt_metadata **dst_md, - const struct kbase_hwcnt_metadata *src_md) -{ - struct kbase_hwcnt_description desc; - struct kbase_hwcnt_group_description group; - struct kbase_hwcnt_block_description - blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT]; - size_t prfcnt_values_per_block; - size_t blk; - - if (!dst_md || !src_md || !src_md->grp_metadata || - !src_md->grp_metadata[0].blk_metadata) - return -EINVAL; - - /* Only support 1 group count and KBASE_HWCNT_V5_BLOCK_TYPE_COUNT block - * count in the metadata. - */ - if ((kbase_hwcnt_metadata_group_count(src_md) != 1) || - (kbase_hwcnt_metadata_block_count(src_md, 0) != - KBASE_HWCNT_V5_BLOCK_TYPE_COUNT)) - return -EINVAL; - - /* Get the values count in the first block. */ - prfcnt_values_per_block = - kbase_hwcnt_metadata_block_values_count(src_md, 0, 0); - - /* check all blocks should have same values count. */ - for (blk = 0; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) { - size_t val_cnt = - kbase_hwcnt_metadata_block_values_count(src_md, 0, blk); - if (val_cnt != prfcnt_values_per_block) - return -EINVAL; - } - - /* Only support 64 and 128 entries per block. */ - if ((prfcnt_values_per_block != 64) && (prfcnt_values_per_block != 128)) - return -EINVAL; - - if (prfcnt_values_per_block == 64) { - /* If the values per block is 64, no need to truncate. */ - *dst_md = NULL; - return 0; - } - - /* Truncate from 128 to 64 entries per block to keep API backward - * compatibility. - */ - prfcnt_values_per_block = 64; - - for (blk = 0; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) { - blks[blk].type = - kbase_hwcnt_metadata_block_type(src_md, 0, blk); - blks[blk].inst_cnt = kbase_hwcnt_metadata_block_instance_count( - src_md, 0, blk); - blks[blk].hdr_cnt = kbase_hwcnt_metadata_block_headers_count( - src_md, 0, blk); - blks[blk].ctr_cnt = prfcnt_values_per_block - blks[blk].hdr_cnt; - } - - group.type = kbase_hwcnt_metadata_group_type(src_md, 0); - group.blk_cnt = KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; - group.blks = blks; - - desc.grp_cnt = kbase_hwcnt_metadata_group_count(src_md); - desc.avail_mask = src_md->avail_mask; - desc.clk_cnt = src_md->clk_cnt; - desc.grps = &group; - - return kbase_hwcnt_metadata_create(&desc, dst_md); -} - -void kbase_hwcnt_dump_buffer_copy_strict_narrow( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map) -{ - const struct kbase_hwcnt_metadata *metadata; - size_t grp, blk, blk_inst; - size_t clk; - - if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || - WARN_ON(dst == src) || WARN_ON(dst->metadata == src->metadata) || - WARN_ON(dst->metadata->grp_cnt != src->metadata->grp_cnt) || - WARN_ON(src->metadata->grp_cnt != 1) || - WARN_ON(dst->metadata->grp_metadata[0].blk_cnt != - src->metadata->grp_metadata[0].blk_cnt) || - WARN_ON(dst->metadata->grp_metadata[0].blk_cnt != 4) || - WARN_ON(dst->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt > - src->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt)) - return; - - /* Don't use src metadata since src buffer is bigger than dst buffer. 
*/ - metadata = dst->metadata; - - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( - dst, grp, blk, blk_inst); - const u32 *src_blk = kbase_hwcnt_dump_buffer_block_instance( - src, grp, blk, blk_inst); - const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( - dst_enable_map, grp, blk, blk_inst); - size_t val_cnt = kbase_hwcnt_metadata_block_values_count( - metadata, grp, blk); - /* Align upwards to include padding bytes */ - val_cnt = KBASE_HWCNT_ALIGN_UPWARDS( - val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / - KBASE_HWCNT_VALUE_BYTES)); - - kbase_hwcnt_dump_buffer_block_copy_strict(dst_blk, src_blk, - blk_em, val_cnt); - } - - kbase_hwcnt_metadata_for_each_clock(metadata, clk) { - bool clk_enabled = kbase_hwcnt_clk_enable_map_enabled( - dst_enable_map->clk_enable_map, clk); - - dst->clk_cnt_buf[clk] = clk_enabled ? src->clk_cnt_buf[clk] : 0; - } -} - static bool is_block_type_shader( const u64 grp_type, const u64 blk_type, @@ -462,28 +342,26 @@ static bool is_block_type_l2_cache( return is_l2_cache; } -int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, +int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, const struct kbase_hwcnt_enable_map *dst_enable_map, u64 pm_core_mask, const struct kbase_hwcnt_curr_config *curr_config, bool accumulate) { const struct kbase_hwcnt_metadata *metadata; - const u32 *dump_src; - size_t src_offset, grp, blk, blk_inst; + size_t grp, blk, blk_inst; + const u64 *dump_src = src; + size_t src_offset = 0; u64 core_mask = pm_core_mask; /* Variables to deal with the current configuration */ int l2_count = 0; - bool hw_res_available = true; if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata)) return -EINVAL; metadata = dst->metadata; - dump_src = (const u32 *)src; - src_offset = 0; kbase_hwcnt_metadata_for_each_block( metadata, grp, blk, blk_inst) { @@ -501,6 +379,7 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, const bool is_l2_cache = is_block_type_l2_cache( kbase_hwcnt_metadata_group_type(metadata, grp), blk_type); + bool hw_res_available = true; /* * If l2 blocks is greater than the current allocated number of @@ -525,14 +404,13 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, } /* - * Early out if no values in the dest block are enabled or if - * the resource target of the block is not available in the HW. + * Skip block if no values in the destination block are enabled. 
*/ if (kbase_hwcnt_enable_map_block_enabled( dst_enable_map, grp, blk, blk_inst)) { - u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); - const u32 *src_blk = dump_src + src_offset; + const u64 *src_blk = dump_src + src_offset; if ((!is_shader_core || (core_mask & 1)) && hw_res_available) { if (accumulate) { @@ -560,21 +438,20 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, return 0; } -int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, +int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate) { const struct kbase_hwcnt_metadata *metadata; - const u32 *dump_src; - size_t src_offset, grp, blk, blk_inst; + const u64 *dump_src = src; + size_t src_offset = 0; + size_t grp, blk, blk_inst; if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata)) return -EINVAL; metadata = dst->metadata; - dump_src = (const u32 *)src; - src_offset = 0; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count( @@ -583,12 +460,14 @@ int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk); - /* Early out if no values in the dest block are enabled */ + /* + * Skip block if no values in the destination block are enabled. + */ if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) { - u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); - const u32 *src_blk = dump_src + src_offset; + const u64 *src_blk = dump_src + src_offset; if (accumulate) { kbase_hwcnt_dump_buffer_block_accumulate( @@ -606,48 +485,6 @@ int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, } /** - * kbasep_hwcnt_backend_gpu_block_map_to_physical() - Convert from a block - * enable map abstraction to - * a physical block enable - * map. - * @lo: Low 64 bits of block enable map abstraction. - * @hi: High 64 bits of block enable map abstraction. - * - * The abstraction uses 128 bits to enable 128 block values, whereas the - * physical uses just 32 bits, as bit n enables values [n*4, n*4+3]. - * Therefore, this conversion is lossy. - * - * Return: 32-bit physical block enable map. 
- */ -static inline u32 kbasep_hwcnt_backend_gpu_block_map_to_physical( - u64 lo, - u64 hi) -{ - u32 phys = 0; - u64 dwords[2] = {lo, hi}; - size_t dword_idx; - - for (dword_idx = 0; dword_idx < 2; dword_idx++) { - const u64 dword = dwords[dword_idx]; - u16 packed = 0; - - size_t hword_bit; - - for (hword_bit = 0; hword_bit < 16; hword_bit++) { - const size_t dword_bit = hword_bit * 4; - const u16 mask = - ((dword >> (dword_bit + 0)) & 0x1) | - ((dword >> (dword_bit + 1)) & 0x1) | - ((dword >> (dword_bit + 2)) & 0x1) | - ((dword >> (dword_bit + 3)) & 0x1); - packed |= (mask << hword_bit); - } - phys |= ((u32)packed) << (16 * dword_idx); - } - return phys; -} - -/** * kbasep_hwcnt_backend_gpu_block_map_from_physical() - Convert from a physical * block enable map to a * block enable map @@ -746,14 +583,13 @@ void kbase_hwcnt_gpu_enable_map_to_physical( } } - dst->fe_bm = - kbasep_hwcnt_backend_gpu_block_map_to_physical(fe_bm, 0); + dst->fe_bm = kbase_hwcnt_backend_gpu_block_map_to_physical(fe_bm, 0); dst->shader_bm = - kbasep_hwcnt_backend_gpu_block_map_to_physical(shader_bm, 0); + kbase_hwcnt_backend_gpu_block_map_to_physical(shader_bm, 0); dst->tiler_bm = - kbasep_hwcnt_backend_gpu_block_map_to_physical(tiler_bm, 0); + kbase_hwcnt_backend_gpu_block_map_to_physical(tiler_bm, 0); dst->mmu_l2_bm = - kbasep_hwcnt_backend_gpu_block_map_to_physical(mmu_l2_bm, 0); + kbase_hwcnt_backend_gpu_block_map_to_physical(mmu_l2_bm, 0); } void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, @@ -857,12 +693,12 @@ void kbase_hwcnt_gpu_patch_dump_headers( kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp); - u32 *buf_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *buf_blk = kbase_hwcnt_dump_buffer_block_instance( buf, grp, blk, blk_inst); const u64 *blk_map = kbase_hwcnt_enable_map_block_instance( enable_map, grp, blk, blk_inst); const u32 prfcnt_en = - kbasep_hwcnt_backend_gpu_block_map_to_physical( + kbase_hwcnt_backend_gpu_block_map_to_physical( blk_map[0], 0); if ((enum kbase_hwcnt_gpu_group_type)grp_type == diff --git a/mali_kbase/mali_kbase_hwcnt_gpu.h b/mali_kbase/mali_kbase_hwcnt_gpu.h index 50ae80d..648f85f 100644 --- a/mali_kbase/mali_kbase_hwcnt_gpu.h +++ b/mali_kbase/mali_kbase_hwcnt_gpu.h @@ -29,15 +29,25 @@ struct kbase_hwcnt_metadata; struct kbase_hwcnt_enable_map; struct kbase_hwcnt_dump_buffer; +/* Hardware counter version 5 definitions, V5 is the only supported version. */ #define KBASE_HWCNT_V5_BLOCK_TYPE_COUNT 4 #define KBASE_HWCNT_V5_HEADERS_PER_BLOCK 4 #define KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK 60 #define KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK \ (KBASE_HWCNT_V5_HEADERS_PER_BLOCK + \ KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK) -/** Index of the PRFCNT_EN header into a V5 counter block */ + +/* FrontEnd block count in V5 GPU hardware counter. */ +#define KBASE_HWCNT_V5_FE_BLOCK_COUNT 1 +/* Tiler block count in V5 GPU hardware counter. */ +#define KBASE_HWCNT_V5_TILER_BLOCK_COUNT 1 + +/* Index of the PRFCNT_EN header into a V5 counter block */ #define KBASE_HWCNT_V5_PRFCNT_EN_HEADER 2 +/* Number of bytes for each counter value in hardware. */ +#define KBASE_HWCNT_VALUE_HW_BYTES (sizeof(u32)) + /** * enum kbase_hwcnt_gpu_group_type - GPU hardware counter group types, used to * identify metadata groups. 
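For a sense of scale under these definitions: a default V5 block is 4 headers plus 60 counters, i.e. 64 values. With KBASE_HWCNT_VALUE_HW_BYTES equal to 4, one block in the raw hardware layout occupies 64 * 4 = 256 bytes, while the driver-internal dump buffer, which this patch widens to 64-bit values, needs 64 * 8 = 512 bytes for the same block; the narrowed 32-bit buffer introduced later in the patch is correspondingly half the size of the internal one.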
@@ -84,11 +94,13 @@ enum kbase_hwcnt_gpu_v5_block_type { * @KBASE_HWCNT_SET_PRIMARY: The Primary set of counters * @KBASE_HWCNT_SET_SECONDARY: The Secondary set of counters * @KBASE_HWCNT_SET_TERTIARY: The Tertiary set of counters + * @KBASE_HWCNT_SET_UNDEFINED: Undefined set of counters */ enum kbase_hwcnt_set { KBASE_HWCNT_SET_PRIMARY, KBASE_HWCNT_SET_SECONDARY, KBASE_HWCNT_SET_TERTIARY, + KBASE_HWCNT_SET_UNDEFINED = 255, }; /** @@ -225,61 +237,19 @@ void kbase_hwcnt_csf_metadata_destroy( const struct kbase_hwcnt_metadata *metadata); /** - * kbase_hwcnt_gpu_metadata_create_truncate_64() - Create HWC metadata with HWC - * block entries truncated - * to 64. - * - * @dst_md: Non-NULL pointer to where created metadata is stored on success. - * @src_md: Non-NULL pointer to the HWC metadata used as the source to create - * dst_md. - * - * If the total block entries in src_md is 64, metadata dst_md returns NULL - * since no need to truncate. - * if the total block entries in src_md is 128, then a new metadata with block - * entries truncated to 64 will be created for dst_md, which keeps the interface - * to user clients backward compatible. - * If the total block entries in src_md is other values, function returns error - * since it's not supported. - * - * Return: 0 on success, else error code. - */ -int kbase_hwcnt_gpu_metadata_create_truncate_64( - const struct kbase_hwcnt_metadata **dst_md, - const struct kbase_hwcnt_metadata *src_md); - -/** - * kbase_hwcnt_dump_buffer_copy_strict_narrow() - Copy all enabled values from - * src to dst. - * - * @dst: Non-NULL pointer to dst dump buffer. - * @src: Non-NULL pointer to src dump buffer. - * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. - * - * After the operation, all non-enabled values (including padding bytes) will be - * zero. - * - * The dst and src have different metadata, and the dst metadata is narrower - * than src metadata. - */ -void kbase_hwcnt_dump_buffer_copy_strict_narrow( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map); - -/** * kbase_hwcnt_jm_dump_get() - Copy or accumulate enabled counters from the raw * dump buffer in src into the dump buffer * abstraction in dst. - * @dst: Non-NULL pointer to dst dump buffer. - * @src: Non-NULL pointer to src raw dump buffer, of same length - * as returned in out_dump_bytes parameter of - * kbase_hwcnt_jm_metadata_create. + * @dst: Non-NULL pointer to destination dump buffer. + * @src: Non-NULL pointer to source raw dump buffer, of same length + * as dump_buf_bytes in the metadata of destination dump + * buffer. * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. * @pm_core_mask: PM state synchronized shaders core mask with the dump. * @curr_config: Current allocated hardware resources to correctly map the - * src raw dump buffer to the dst dump buffer. - * @accumulate: True if counters in src should be accumulated into dst, - * rather than copied. + * source raw dump buffer to the destination dump buffer. + * @accumulate: True if counters in source should be accumulated into + * destination, rather than copied. * * The dst and dst_enable_map MUST have been created from the same metadata as * returned from the call to kbase_hwcnt_jm_metadata_create as was used to get @@ -287,7 +257,7 @@ void kbase_hwcnt_dump_buffer_copy_strict_narrow( * * Return: 0 on success, else error code. 
*/ -int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, +int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, const struct kbase_hwcnt_enable_map *dst_enable_map, const u64 pm_core_mask, const struct kbase_hwcnt_curr_config *curr_config, @@ -297,13 +267,12 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, * kbase_hwcnt_csf_dump_get() - Copy or accumulate enabled counters from the raw * dump buffer in src into the dump buffer * abstraction in dst. - * @dst: Non-NULL pointer to dst dump buffer. - * @src: Non-NULL pointer to src raw dump buffer, of same length - * as returned in out_dump_bytes parameter of - * kbase_hwcnt_csf_metadata_create. + * @dst: Non-NULL pointer to destination dump buffer. + * @src: Non-NULL pointer to source raw dump buffer, of same length + * as dump_buf_bytes in the metadata of dst dump buffer. * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. - * @accumulate: True if counters in src should be accumulated into dst, - * rather than copied. + * @accumulate: True if counters in src should be accumulated into + * destination, rather than copied. * * The dst and dst_enable_map MUST have been created from the same metadata as * returned from the call to kbase_hwcnt_csf_metadata_create as was used to get @@ -311,15 +280,54 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, * * Return: 0 on success, else error code. */ -int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, void *src, +int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate); /** + * kbase_hwcnt_backend_gpu_block_map_to_physical() - Convert from a block + * enable map abstraction to + * a physical block enable + * map. + * @lo: Low 64 bits of block enable map abstraction. + * @hi: High 64 bits of block enable map abstraction. + * + * The abstraction uses 128 bits to enable 128 block values, whereas the + * physical uses just 32 bits, as bit n enables values [n*4, n*4+3]. + * Therefore, this conversion is lossy. + * + * Return: 32-bit physical block enable map. + */ +static inline u32 kbase_hwcnt_backend_gpu_block_map_to_physical(u64 lo, u64 hi) +{ + u32 phys = 0; + u64 dwords[2] = { lo, hi }; + size_t dword_idx; + + for (dword_idx = 0; dword_idx < 2; dword_idx++) { + const u64 dword = dwords[dword_idx]; + u16 packed = 0; + + size_t hword_bit; + + for (hword_bit = 0; hword_bit < 16; hword_bit++) { + const size_t dword_bit = hword_bit * 4; + const u16 mask = ((dword >> (dword_bit + 0)) & 0x1) | + ((dword >> (dword_bit + 1)) & 0x1) | + ((dword >> (dword_bit + 2)) & 0x1) | + ((dword >> (dword_bit + 3)) & 0x1); + packed |= (mask << hword_bit); + } + phys |= ((u32)packed) << (16 * dword_idx); + } + return phys; +} + +/** * kbase_hwcnt_gpu_enable_map_to_physical() - Convert an enable map abstraction * into a physical enable map. - * @dst: Non-NULL pointer to dst physical enable map. - * @src: Non-NULL pointer to src enable map abstraction. + * @dst: Non-NULL pointer to destination physical enable map. + * @src: Non-NULL pointer to source enable map abstraction. * * The src must have been created from a metadata returned from a call to * kbase_hwcnt_jm_metadata_create or kbase_hwcnt_csf_metadata_create. @@ -336,8 +344,8 @@ void kbase_hwcnt_gpu_enable_map_to_physical( * kbase_hwcnt_gpu_set_to_physical() - Map counter set selection to physical * SET_SELECT value. 
* - * @dst: Non-NULL pointer to dst physical SET_SELECT value. - * @src: Non-NULL pointer to src counter set selection. + * @dst: Non-NULL pointer to destination physical SET_SELECT value. + * @src: Non-NULL pointer to source counter set selection. */ void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, enum kbase_hwcnt_set src); @@ -345,8 +353,8 @@ void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, /** * kbase_hwcnt_gpu_enable_map_from_physical() - Convert a physical enable map to * an enable map abstraction. - * @dst: Non-NULL pointer to dst enable map abstraction. - * @src: Non-NULL pointer to src physical enable map. + * @dst: Non-NULL pointer to destination enable map abstraction. + * @src: Non-NULL pointer to source physical enable map. * * The dst must have been created from a metadata returned from a call to * kbase_hwcnt_jm_metadata_create or kbase_hwcnt_csf_metadata_create. diff --git a/mali_kbase/mali_kbase_hwcnt_gpu_narrow.c b/mali_kbase/mali_kbase_hwcnt_gpu_narrow.c new file mode 100644 index 0000000..e2caa1c --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_gpu_narrow.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_hwcnt_gpu_narrow.h" + +#include <linux/bug.h> +#include <linux/err.h> +#include <linux/slab.h> + +int kbase_hwcnt_gpu_metadata_narrow_create( + const struct kbase_hwcnt_metadata_narrow **dst_md_narrow, + const struct kbase_hwcnt_metadata *src_md) +{ + struct kbase_hwcnt_description desc; + struct kbase_hwcnt_group_description group; + struct kbase_hwcnt_block_description + blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT]; + size_t prfcnt_values_per_block; + size_t blk; + int err; + struct kbase_hwcnt_metadata_narrow *metadata_narrow; + + if (!dst_md_narrow || !src_md || !src_md->grp_metadata || + !src_md->grp_metadata[0].blk_metadata) + return -EINVAL; + + /* Only support 1 group count and KBASE_HWCNT_V5_BLOCK_TYPE_COUNT block + * count in the metadata. + */ + if ((kbase_hwcnt_metadata_group_count(src_md) != 1) || + (kbase_hwcnt_metadata_block_count(src_md, 0) != + KBASE_HWCNT_V5_BLOCK_TYPE_COUNT)) + return -EINVAL; + + /* Get the values count in the first block. */ + prfcnt_values_per_block = + kbase_hwcnt_metadata_block_values_count(src_md, 0, 0); + + /* check all blocks should have same values count. */ + for (blk = 1; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) { + size_t val_cnt = + kbase_hwcnt_metadata_block_values_count(src_md, 0, blk); + if (val_cnt != prfcnt_values_per_block) + return -EINVAL; + } + + /* Only support 64 and 128 entries per block. 
*/ + if ((prfcnt_values_per_block != 64) && (prfcnt_values_per_block != 128)) + return -EINVAL; + + metadata_narrow = kmalloc(sizeof(*metadata_narrow), GFP_KERNEL); + if (!metadata_narrow) + return -ENOMEM; + + /* Narrow to 64 entries per block to keep API backward compatibility. */ + prfcnt_values_per_block = 64; + + for (blk = 0; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) { + size_t blk_hdr_cnt = kbase_hwcnt_metadata_block_headers_count( + src_md, 0, blk); + blks[blk] = (struct kbase_hwcnt_block_description){ + .type = kbase_hwcnt_metadata_block_type(src_md, 0, blk), + .inst_cnt = kbase_hwcnt_metadata_block_instance_count( + src_md, 0, blk), + .hdr_cnt = blk_hdr_cnt, + .ctr_cnt = prfcnt_values_per_block - blk_hdr_cnt, + }; + } + + group = (struct kbase_hwcnt_group_description){ + .type = kbase_hwcnt_metadata_group_type(src_md, 0), + .blk_cnt = KBASE_HWCNT_V5_BLOCK_TYPE_COUNT, + .blks = blks, + }; + + desc = (struct kbase_hwcnt_description){ + .grp_cnt = kbase_hwcnt_metadata_group_count(src_md), + .avail_mask = src_md->avail_mask, + .clk_cnt = src_md->clk_cnt, + .grps = &group, + }; + + err = kbase_hwcnt_metadata_create(&desc, &metadata_narrow->metadata); + if (!err) { + /* Narrow down the buffer size to half as the narrowed metadata + * only supports 32-bit but the created metadata uses 64-bit for + * block entry. + */ + metadata_narrow->dump_buf_bytes = + metadata_narrow->metadata->dump_buf_bytes >> 1; + *dst_md_narrow = metadata_narrow; + } else { + kfree(metadata_narrow); + } + + return err; +} + +void kbase_hwcnt_gpu_metadata_narrow_destroy( + const struct kbase_hwcnt_metadata_narrow *md_narrow) +{ + if (!md_narrow) + return; + + kbase_hwcnt_metadata_destroy(md_narrow->metadata); + kfree(md_narrow); +} + +int kbase_hwcnt_dump_buffer_narrow_alloc( + const struct kbase_hwcnt_metadata_narrow *md_narrow, + struct kbase_hwcnt_dump_buffer_narrow *dump_buf) +{ + size_t dump_buf_bytes; + size_t clk_cnt_buf_bytes; + u8 *buf; + + if (!md_narrow || !dump_buf) + return -EINVAL; + + dump_buf_bytes = md_narrow->dump_buf_bytes; + clk_cnt_buf_bytes = + sizeof(*dump_buf->clk_cnt_buf) * md_narrow->metadata->clk_cnt; + + /* Make a single allocation for both dump_buf and clk_cnt_buf. */ + buf = kmalloc(dump_buf_bytes + clk_cnt_buf_bytes, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + *dump_buf = (struct kbase_hwcnt_dump_buffer_narrow){ + .md_narrow = md_narrow, + .dump_buf = (u32 *)buf, + .clk_cnt_buf = (u64 *)(buf + dump_buf_bytes), + }; + + return 0; +} + +void kbase_hwcnt_dump_buffer_narrow_free( + struct kbase_hwcnt_dump_buffer_narrow *dump_buf_narrow) +{ + if (!dump_buf_narrow) + return; + + kfree(dump_buf_narrow->dump_buf); + *dump_buf_narrow = (struct kbase_hwcnt_dump_buffer_narrow){ 0 }; +} + +int kbase_hwcnt_dump_buffer_narrow_array_alloc( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t n, + struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs) +{ + struct kbase_hwcnt_dump_buffer_narrow *buffers; + size_t buf_idx; + unsigned int order; + unsigned long addr; + size_t dump_buf_bytes; + size_t clk_cnt_buf_bytes; + size_t total_dump_buf_size; + + if (!md_narrow || !dump_bufs) + return -EINVAL; + + dump_buf_bytes = md_narrow->dump_buf_bytes; + clk_cnt_buf_bytes = sizeof(*dump_bufs->bufs->clk_cnt_buf) * + md_narrow->metadata->clk_cnt; + + /* Allocate memory for the dump buffer struct array */ + buffers = kmalloc_array(n, sizeof(*buffers), GFP_KERNEL); + if (!buffers) + return -ENOMEM; + + /* Allocate pages for the actual dump buffers, as they tend to be fairly + * large. 
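To make the single-allocation layout used below concrete (illustrative numbers only): with n = 4 narrow buffers, dump_buf_bytes = 1024 and one clock domain (clk_cnt_buf_bytes = 8), the allocation requested is get_order(4 * 1032) bytes, i.e. order 1 (8 KiB) on a 4 KiB-page system. Buffer 2's dump_buf then starts at page_addr + 2 * 1024 = page_addr + 2048, and its clk_cnt_buf at page_addr + 4 * 1024 + 2 * 8 = page_addr + 4112, because all dump buffers are packed first and the per-buffer cycle-count arrays follow them.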
+ */ + order = get_order((dump_buf_bytes + clk_cnt_buf_bytes) * n); + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + + if (!addr) { + kfree(buffers); + return -ENOMEM; + } + + *dump_bufs = (struct kbase_hwcnt_dump_buffer_narrow_array){ + .page_addr = addr, + .page_order = order, + .buf_cnt = n, + .bufs = buffers, + }; + + total_dump_buf_size = dump_buf_bytes * n; + /* Set the buffer of each dump buf */ + for (buf_idx = 0; buf_idx < n; buf_idx++) { + const size_t dump_buf_offset = dump_buf_bytes * buf_idx; + const size_t clk_cnt_buf_offset = + total_dump_buf_size + (clk_cnt_buf_bytes * buf_idx); + + buffers[buf_idx] = (struct kbase_hwcnt_dump_buffer_narrow){ + .md_narrow = md_narrow, + .dump_buf = (u32 *)(addr + dump_buf_offset), + .clk_cnt_buf = (u64 *)(addr + clk_cnt_buf_offset), + }; + } + + return 0; +} + +void kbase_hwcnt_dump_buffer_narrow_array_free( + struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs) +{ + if (!dump_bufs) + return; + + kfree(dump_bufs->bufs); + free_pages(dump_bufs->page_addr, dump_bufs->page_order); + memset(dump_bufs, 0, sizeof(*dump_bufs)); +} + +void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, + const u64 *src_blk, + const u64 *blk_em, + size_t val_cnt) +{ + size_t val; + + for (val = 0; val < val_cnt; val++) { + bool val_enabled = + kbase_hwcnt_enable_map_block_value_enabled(blk_em, val); + u32 src_val = + (src_blk[val] > U32_MAX) ? U32_MAX : (u32)src_blk[val]; + + dst_blk[val] = val_enabled ? src_val : 0; + } +} + +void kbase_hwcnt_dump_buffer_copy_strict_narrow( + struct kbase_hwcnt_dump_buffer_narrow *dst_narrow, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) +{ + const struct kbase_hwcnt_metadata_narrow *metadata_narrow; + size_t grp; + size_t clk; + + if (WARN_ON(!dst_narrow) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || + WARN_ON(dst_narrow->md_narrow->metadata == src->metadata) || + WARN_ON(dst_narrow->md_narrow->metadata->grp_cnt != + src->metadata->grp_cnt) || + WARN_ON(src->metadata->grp_cnt != 1) || + WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_cnt != + src->metadata->grp_metadata[0].blk_cnt) || + WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_cnt != + KBASE_HWCNT_V5_BLOCK_TYPE_COUNT) || + WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0] + .blk_metadata[0] + .ctr_cnt > + src->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt)) + return; + + /* Don't use src metadata since src buffer is bigger than dst buffer. */ + metadata_narrow = dst_narrow->md_narrow; + + for (grp = 0; + grp < kbase_hwcnt_metadata_narrow_group_count(metadata_narrow); + grp++) { + size_t blk; + size_t blk_cnt = kbase_hwcnt_metadata_narrow_block_count( + metadata_narrow, grp); + + for (blk = 0; blk < blk_cnt; blk++) { + size_t blk_inst; + size_t blk_inst_cnt = + kbase_hwcnt_metadata_narrow_block_instance_count( + metadata_narrow, grp, blk); + + for (blk_inst = 0; blk_inst < blk_inst_cnt; + blk_inst++) { + /* The narrowed down buffer is only 32-bit. 
*/ + u32 *dst_blk = + kbase_hwcnt_dump_buffer_narrow_block_instance( + dst_narrow, grp, blk, blk_inst); + const u64 *src_blk = + kbase_hwcnt_dump_buffer_block_instance( + src, grp, blk, blk_inst); + const u64 *blk_em = + kbase_hwcnt_enable_map_block_instance( + dst_enable_map, grp, blk, + blk_inst); + size_t val_cnt = + kbase_hwcnt_metadata_narrow_block_values_count( + metadata_narrow, grp, blk); + /* Align upwards to include padding bytes */ + val_cnt = KBASE_HWCNT_ALIGN_UPWARDS( + val_cnt, + (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / + KBASE_HWCNT_VALUE_BYTES)); + + kbase_hwcnt_dump_buffer_block_copy_strict_narrow( + dst_blk, src_blk, blk_em, val_cnt); + } + } + } + + for (clk = 0; clk < metadata_narrow->metadata->clk_cnt; clk++) { + bool clk_enabled = kbase_hwcnt_clk_enable_map_enabled( + dst_enable_map->clk_enable_map, clk); + + dst_narrow->clk_cnt_buf[clk] = + clk_enabled ? src->clk_cnt_buf[clk] : 0; + } +} diff --git a/mali_kbase/mali_kbase_hwcnt_gpu_narrow.h b/mali_kbase/mali_kbase_hwcnt_gpu_narrow.h new file mode 100644 index 0000000..af6fa19 --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_gpu_narrow.h @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _KBASE_HWCNT_GPU_NARROW_H_ +#define _KBASE_HWCNT_GPU_NARROW_H_ + +#include "mali_kbase_hwcnt_types.h" +#include <linux/types.h> + +struct kbase_device; +struct kbase_hwcnt_metadata; +struct kbase_hwcnt_enable_map; +struct kbase_hwcnt_dump_buffer; + +/** + * struct kbase_hwcnt_metadata_narrow - Narrow metadata describing the physical + * layout of narrow dump buffers. + * For backward compatibility, the narrow + * metadata only supports 64 counters per + * block and 32-bit per block entry. + * @metadata: Non-NULL pointer to the metadata before narrow down to + * 32-bit per block entry, it has 64 counters per block and + * 64-bit per value. + * @dump_buf_bytes: The size in bytes after narrow 64-bit to 32-bit per block + * entry. + */ +struct kbase_hwcnt_metadata_narrow { + const struct kbase_hwcnt_metadata *metadata; + size_t dump_buf_bytes; +}; + +/** + * struct kbase_hwcnt_dump_buffer_narrow - Hardware counter narrow dump buffer. + * @md_narrow: Non-NULL pointer to narrow metadata used to identify, and to + * describe the layout of the narrow dump buffer. + * @dump_buf: Non-NULL pointer to an array of u32 values, the array size + * is md_narrow->dump_buf_bytes. + * @clk_cnt_buf: A pointer to an array of u64 values for cycle count elapsed + * for each clock domain. + */ +struct kbase_hwcnt_dump_buffer_narrow { + const struct kbase_hwcnt_metadata_narrow *md_narrow; + u32 *dump_buf; + u64 *clk_cnt_buf; +}; + +/** + * struct kbase_hwcnt_dump_buffer_narrow_array - Hardware counter narrow dump + * buffer array. 
+ * @page_addr: Address of first allocated page. A single allocation is used for + * all narrow dump buffers in the array. + * @page_order: The allocation order of the pages, the order is on a logarithmic + * scale. + * @buf_cnt: The number of allocated dump buffers. + * @bufs: Non-NULL pointer to the array of narrow dump buffer descriptors. + */ +struct kbase_hwcnt_dump_buffer_narrow_array { + unsigned long page_addr; + unsigned int page_order; + size_t buf_cnt; + struct kbase_hwcnt_dump_buffer_narrow *bufs; +}; + +/** + * kbase_hwcnt_metadata_narrow_group_count() - Get the number of groups from + * narrow metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * + * Return: Number of hardware counter groups described by narrow metadata. + */ +static inline size_t kbase_hwcnt_metadata_narrow_group_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow) +{ + return kbase_hwcnt_metadata_group_count(md_narrow->metadata); +} + +/** + * kbase_hwcnt_metadata_narrow_group_type() - Get the arbitrary type of a group + * from narrow metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * + * Return: Type of the group grp. + */ +static inline u64 kbase_hwcnt_metadata_narrow_group_type( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp) +{ + return kbase_hwcnt_metadata_group_type(md_narrow->metadata, grp); +} + +/** + * kbase_hwcnt_metadata_narrow_block_count() - Get the number of blocks in a + * group from narrow metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * + * Return: Number of blocks in group grp. + */ +static inline size_t kbase_hwcnt_metadata_narrow_block_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp) +{ + return kbase_hwcnt_metadata_block_count(md_narrow->metadata, grp); +} + +/** + * kbase_hwcnt_metadata_narrow_block_instance_count() - Get the number of + * instances of a block + * from narrow metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * @blk: Index of the block in the group. + * + * Return: Number of instances of block blk in group grp. + */ +static inline size_t kbase_hwcnt_metadata_narrow_block_instance_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, + size_t blk) +{ + return kbase_hwcnt_metadata_block_instance_count(md_narrow->metadata, + grp, blk); +} + +/** + * kbase_hwcnt_metadata_narrow_block_headers_count() - Get the number of counter + * headers from narrow + * metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * @blk: Index of the block in the group. + * + * Return: Number of counter headers in each instance of block blk in group grp. + */ +static inline size_t kbase_hwcnt_metadata_narrow_block_headers_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, + size_t blk) +{ + return kbase_hwcnt_metadata_block_headers_count(md_narrow->metadata, + grp, blk); +} + +/** + * kbase_hwcnt_metadata_narrow_block_counters_count() - Get the number of + * counters from narrow + * metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * @blk: Index of the block in the group. + * + * Return: Number of counters in each instance of block blk in group grp. 
+ */ +static inline size_t kbase_hwcnt_metadata_narrow_block_counters_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, + size_t blk) +{ + return kbase_hwcnt_metadata_block_counters_count(md_narrow->metadata, + grp, blk); +} + +/** + * kbase_hwcnt_metadata_narrow_block_values_count() - Get the number of values + * from narrow metadata. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @grp: Index of the group in the narrow metadata. + * @blk: Index of the block in the group. + * + * Return: Number of headers plus counters in each instance of block blk + * in group grp. + */ +static inline size_t kbase_hwcnt_metadata_narrow_block_values_count( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, + size_t blk) +{ + return kbase_hwcnt_metadata_narrow_block_counters_count(md_narrow, grp, + blk) + + kbase_hwcnt_metadata_narrow_block_headers_count(md_narrow, grp, + blk); +} + +/** + * kbase_hwcnt_dump_buffer_narrow_block_instance() - Get the pointer to a + * narrowed block instance's + * dump buffer. + * @buf: Non-NULL pointer to narrow dump buffer. + * @grp: Index of the group in the narrow metadata. + * @blk: Index of the block in the group. + * @blk_inst: Index of the block instance in the block. + * + * Return: u32* to the dump buffer for the block instance. + */ +static inline u32 *kbase_hwcnt_dump_buffer_narrow_block_instance( + const struct kbase_hwcnt_dump_buffer_narrow *buf, size_t grp, + size_t blk, size_t blk_inst) +{ + return buf->dump_buf + + buf->md_narrow->metadata->grp_metadata[grp].dump_buf_index + + buf->md_narrow->metadata->grp_metadata[grp] + .blk_metadata[blk] + .dump_buf_index + + (buf->md_narrow->metadata->grp_metadata[grp] + .blk_metadata[blk] + .dump_buf_stride * + blk_inst); +} + +/** + * kbase_hwcnt_gpu_metadata_narrow_create() - Create HWC metadata with HWC + * entries per block truncated to + * 64 entries and block entry size + * narrowed down to 32-bit. + * + * @dst_md_narrow: Non-NULL pointer to where created narrow metadata is stored + * on success. + * @src_md: Non-NULL pointer to the HWC metadata used as the source to + * create dst_md_narrow. + * + * For backward compatibility of the interface to user clients, a new metadata + * with entries per block truncated to 64 and block entry size narrowed down + * to 32-bit will be created for dst_md_narrow. + * The total entries per block in src_md must be 64 or 128, if it's other + * values, function returns error since it's not supported. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_gpu_metadata_narrow_create( + const struct kbase_hwcnt_metadata_narrow **dst_md_narrow, + const struct kbase_hwcnt_metadata *src_md); + +/** + * kbase_hwcnt_gpu_metadata_narrow_destroy() - Destroy a hardware counter narrow + * metadata object. + * @md_narrow: Pointer to hardware counter narrow metadata. + */ +void kbase_hwcnt_gpu_metadata_narrow_destroy( + const struct kbase_hwcnt_metadata_narrow *md_narrow); + +/** + * kbase_hwcnt_dump_buffer_narrow_alloc() - Allocate a narrow dump buffer. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @dump_buf: Non-NULL pointer to narrow dump buffer to be initialised. Will be + * initialised to undefined values, so must be used as a copy + * destination, or cleared before use. + * + * Return: 0 on success, else error code. 
+ */ +int kbase_hwcnt_dump_buffer_narrow_alloc( + const struct kbase_hwcnt_metadata_narrow *md_narrow, + struct kbase_hwcnt_dump_buffer_narrow *dump_buf); + +/** + * kbase_hwcnt_dump_buffer_narrow_free() - Free a narrow dump buffer. + * @dump_buf: Dump buffer to be freed. + * + * Can be safely called on an all-zeroed narrow dump buffer structure, or on an + * already freed narrow dump buffer. + */ +void kbase_hwcnt_dump_buffer_narrow_free( + struct kbase_hwcnt_dump_buffer_narrow *dump_buf); + +/** + * kbase_hwcnt_dump_buffer_narrow_array_alloc() - Allocate an array of narrow + * dump buffers. + * @md_narrow: Non-NULL pointer to narrow metadata. + * @n: Number of narrow dump buffers to allocate + * @dump_bufs: Non-NULL pointer to a kbase_hwcnt_dump_buffer_narrow_array + * object to be initialised. + * + * A single zeroed contiguous page allocation will be used for all of the + * buffers inside the object, where: + * dump_bufs->bufs[n].dump_buf == page_addr + n * md_narrow.dump_buf_bytes + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_dump_buffer_narrow_array_alloc( + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t n, + struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs); + +/** + * kbase_hwcnt_dump_buffer_narrow_array_free() - Free a narrow dump buffer + * array. + * @dump_bufs: Narrow Dump buffer array to be freed. + * + * Can be safely called on an all-zeroed narrow dump buffer array structure, or + * on an already freed narrow dump buffer array. + */ +void kbase_hwcnt_dump_buffer_narrow_array_free( + struct kbase_hwcnt_dump_buffer_narrow_array *dump_bufs); + +/** + * kbase_hwcnt_dump_buffer_block_copy_strict_narrow() - Copy all enabled block + * values from source to + * destination. + * @dst_blk: Non-NULL pointer to destination block obtained from a call to + * kbase_hwcnt_dump_buffer_narrow_block_instance. + * @src_blk: Non-NULL pointer to source block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @blk_em: Non-NULL pointer to the block bitfield(s) obtained from a call to + * kbase_hwcnt_enable_map_block_instance. + * @val_cnt: Number of values in the block. + * + * After the copy, any disabled values in destination will be zero, the enabled + * values in destination will be saturated at U32_MAX if the corresponding + * source value is bigger than U32_MAX, or copy the value from source if the + * corresponding source value is less than or equal to U32_MAX. + */ +void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, + const u64 *src_blk, + const u64 *blk_em, + size_t val_cnt); + +/** + * kbase_hwcnt_dump_buffer_copy_strict_narrow() - Copy all enabled values to a + * narrow dump buffer. + * @dst_narrow: Non-NULL pointer to destination dump buffer. + * @src: Non-NULL pointer to source dump buffer. + * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. + * + * After the operation, all non-enabled values (including padding bytes) will be + * zero. Slower than the non-strict variant. + * + * The enabled values in dst_narrow will be saturated at U32_MAX if the + * corresponding source value is bigger than U32_MAX, or copy the value from + * source if the corresponding source value is less than or equal to U32_MAX. 
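A minimal usage sketch of the narrow API added by this patch (error handling trimmed; "metadata", "dump_buf" and "enable_map" are assumed to be an existing 64-bit metadata/dump-buffer/enable-map triple, and the driver headers above are assumed to be included, as in the legacy client further down):

static int narrow_copy_example(const struct kbase_hwcnt_metadata *metadata,
			       const struct kbase_hwcnt_dump_buffer *dump_buf,
			       const struct kbase_hwcnt_enable_map *enable_map)
{
	const struct kbase_hwcnt_metadata_narrow *md_narrow;
	struct kbase_hwcnt_dump_buffer_narrow narrow_buf;
	int err;

	err = kbase_hwcnt_gpu_metadata_narrow_create(&md_narrow, metadata);
	if (err)
		return err;

	err = kbase_hwcnt_dump_buffer_narrow_alloc(md_narrow, &narrow_buf);
	if (err) {
		kbase_hwcnt_gpu_metadata_narrow_destroy(md_narrow);
		return err;
	}

	/* 64-bit values are clamped to U32_MAX on the way into the 32-bit
	 * user-visible layout; non-enabled values are zeroed.
	 */
	kbase_hwcnt_dump_buffer_copy_strict_narrow(&narrow_buf, dump_buf,
						   enable_map);

	/* ... hand narrow_buf.dump_buf (md_narrow->dump_buf_bytes bytes) to
	 * user space here ...
	 */

	kbase_hwcnt_dump_buffer_narrow_free(&narrow_buf);
	kbase_hwcnt_gpu_metadata_narrow_destroy(md_narrow);
	return 0;
}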
+ */ +void kbase_hwcnt_dump_buffer_copy_strict_narrow( + struct kbase_hwcnt_dump_buffer_narrow *dst_narrow, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); + +#endif /* _KBASE_HWCNT_GPU_NARROW_H_ */ diff --git a/mali_kbase/mali_kbase_hwcnt_legacy.c b/mali_kbase/mali_kbase_hwcnt_legacy.c index 0687253..5ca4c51 100644 --- a/mali_kbase/mali_kbase_hwcnt_legacy.c +++ b/mali_kbase/mali_kbase_hwcnt_legacy.c @@ -23,6 +23,7 @@ #include "mali_kbase_hwcnt_virtualizer.h" #include "mali_kbase_hwcnt_types.h" #include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_hwcnt_gpu_narrow.h" #include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> #include <linux/slab.h> @@ -32,14 +33,22 @@ * struct kbase_hwcnt_legacy_client - Legacy hardware counter client. * @user_dump_buf: Pointer to a non-NULL user buffer, where dumps are returned. * @enable_map: Counter enable map. - * @dump_buf: Dump buffer used to manipulate dumps before copied to user. + * @dump_buf: Dump buffer used to manipulate dumps from virtualizer. * @hvcli: Hardware counter virtualizer client. + * @dump_buf_user: Narrow dump buffer used to manipulate dumps before they are + * copied to user. + * @metadata_user: For compatibility with the user driver interface, this + * contains a narrowed version of the hardware counter metadata + * which is limited to 64 entries per block and 32-bit for each + * entry. */ struct kbase_hwcnt_legacy_client { void __user *user_dump_buf; struct kbase_hwcnt_enable_map enable_map; struct kbase_hwcnt_dump_buffer dump_buf; struct kbase_hwcnt_virtualizer_client *hvcli; + struct kbase_hwcnt_dump_buffer_narrow dump_buf_user; + const struct kbase_hwcnt_metadata_narrow *metadata_user; }; int kbase_hwcnt_legacy_client_create( @@ -61,6 +70,16 @@ int kbase_hwcnt_legacy_client_create( if (!hlcli) return -ENOMEM; + errcode = kbase_hwcnt_gpu_metadata_narrow_create(&hlcli->metadata_user, + metadata); + if (errcode) + goto error; + + errcode = kbase_hwcnt_dump_buffer_narrow_alloc(hlcli->metadata_user, + &hlcli->dump_buf_user); + if (errcode) + goto error; + hlcli->user_dump_buf = (void __user *)(uintptr_t)enable->dump_buffer; errcode = kbase_hwcnt_enable_map_alloc(metadata, &hlcli->enable_map); @@ -99,6 +118,8 @@ void kbase_hwcnt_legacy_client_destroy(struct kbase_hwcnt_legacy_client *hlcli) kbase_hwcnt_virtualizer_client_destroy(hlcli->hvcli); kbase_hwcnt_dump_buffer_free(&hlcli->dump_buf); kbase_hwcnt_enable_map_free(&hlcli->enable_map); + kbase_hwcnt_dump_buffer_narrow_free(&hlcli->dump_buf_user); + kbase_hwcnt_gpu_metadata_narrow_destroy(hlcli->metadata_user); kfree(hlcli); } @@ -123,13 +144,20 @@ int kbase_hwcnt_legacy_client_dump(struct kbase_hwcnt_legacy_client *hlcli) kbase_hwcnt_gpu_patch_dump_headers( &hlcli->dump_buf, &hlcli->enable_map); - /* Zero all non-enabled counters (current values are undefined) */ - kbase_hwcnt_dump_buffer_zero_non_enabled( - &hlcli->dump_buf, &hlcli->enable_map); + /* Copy the dump buffer to the userspace visible buffer. The strict + * variant will explicitly zero any non-enabled counters to ensure + * nothing except exactly what the user asked for is made visible. + * + * A narrow copy is required since virtualizer has a bigger buffer + * but user only needs part of it. 
+ */ + kbase_hwcnt_dump_buffer_copy_strict_narrow( + &hlcli->dump_buf_user, &hlcli->dump_buf, &hlcli->enable_map); /* Copy into the user's buffer */ - errcode = copy_to_user(hlcli->user_dump_buf, hlcli->dump_buf.dump_buf, - hlcli->dump_buf.metadata->dump_buf_bytes); + errcode = copy_to_user(hlcli->user_dump_buf, + hlcli->dump_buf_user.dump_buf, + hlcli->dump_buf_user.md_narrow->dump_buf_bytes); /* Non-zero errcode implies user buf was invalid or too small */ if (errcode) return -EFAULT; diff --git a/mali_kbase/mali_kbase_hwcnt_types.c b/mali_kbase/mali_kbase_hwcnt_types.c index 492f572..d925ed7 100644 --- a/mali_kbase/mali_kbase_hwcnt_types.c +++ b/mali_kbase/mali_kbase_hwcnt_types.c @@ -32,7 +32,7 @@ int kbase_hwcnt_metadata_create( struct kbase_hwcnt_group_metadata *grp_mds; size_t grp; size_t enable_map_count; /* Number of u64 bitfields (inc padding) */ - size_t dump_buf_count; /* Number of u32 values (inc padding) */ + size_t dump_buf_count; /* Number of u64 values (inc padding) */ size_t avail_mask_bits; /* Number of availability mask bits */ size_t size; @@ -220,7 +220,7 @@ int kbase_hwcnt_dump_buffer_alloc( return -ENOMEM; dump_buf->metadata = metadata; - dump_buf->dump_buf = (u32 *)buf; + dump_buf->dump_buf = (u64 *)buf; dump_buf->clk_cnt_buf = (u64 *)(buf + dump_buf_bytes); return 0; @@ -282,7 +282,7 @@ int kbase_hwcnt_dump_buffer_array_alloc( (dump_buf_bytes * n) + (clk_cnt_buf_bytes * buf_idx); buffers[buf_idx].metadata = metadata; - buffers[buf_idx].dump_buf = (u32 *)(addr + dump_buf_offset); + buffers[buf_idx].dump_buf = (u64 *)(addr + dump_buf_offset); buffers[buf_idx].clk_cnt_buf = (u64 *)(addr + clk_cnt_buf_offset); } @@ -316,7 +316,7 @@ void kbase_hwcnt_dump_buffer_zero( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk; + u64 *dst_blk; size_t val_cnt; if (!kbase_hwcnt_enable_map_block_enabled( @@ -362,7 +362,7 @@ void kbase_hwcnt_dump_buffer_zero_non_enabled( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( dst_enable_map, grp, blk, blk_inst); @@ -406,8 +406,8 @@ void kbase_hwcnt_dump_buffer_copy( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk; - const u32 *src_blk; + u64 *dst_blk; + const u64 *src_blk; size_t val_cnt; if (!kbase_hwcnt_enable_map_block_enabled( @@ -451,9 +451,9 @@ void kbase_hwcnt_dump_buffer_copy_strict( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); - const u32 *src_blk = kbase_hwcnt_dump_buffer_block_instance( + const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance( src, grp, blk, blk_inst); const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( dst_enable_map, grp, blk, blk_inst); @@ -497,8 +497,8 @@ void kbase_hwcnt_dump_buffer_accumulate( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u32 *dst_blk; - const u32 *src_blk; + u64 *dst_blk; + const u64 *src_blk; size_t hdr_cnt; size_t ctr_cnt; @@ -546,9 +546,9 @@ void kbase_hwcnt_dump_buffer_accumulate_strict( metadata = dst->metadata; kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - 
u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( dst, grp, blk, blk_inst); - const u32 *src_blk = kbase_hwcnt_dump_buffer_block_instance( + const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance( src, grp, blk, blk_inst); const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( dst_enable_map, grp, blk, blk_inst); diff --git a/mali_kbase/mali_kbase_hwcnt_types.h b/mali_kbase/mali_kbase_hwcnt_types.h index 6b7985b..f04c0ec 100644 --- a/mali_kbase/mali_kbase_hwcnt_types.h +++ b/mali_kbase/mali_kbase_hwcnt_types.h @@ -61,7 +61,7 @@ * An array of u64 bitfields, where each bit either enables exactly one * block value, or is unused (padding). * Dump Buffer: - * An array of u32 values, where each u32 corresponds either to one block + * An array of u64 values, where each u64 corresponds either to one block * value, or is unused (padding). * Availability Mask: * A bitfield, where each bit corresponds to whether a block instance is @@ -81,6 +81,7 @@ #define _KBASE_HWCNT_TYPES_H_ #include <linux/bitops.h> +#include <linux/bug.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/types.h> @@ -91,8 +92,11 @@ /* Number of bits in each bitfield */ #define KBASE_HWCNT_BITFIELD_BITS (KBASE_HWCNT_BITFIELD_BYTES * BITS_PER_BYTE) -/* Number of bytes for each counter value */ -#define KBASE_HWCNT_VALUE_BYTES (sizeof(u32)) +/* Number of bytes for each counter value. + * Use 64-bit per counter in driver to avoid HW 32-bit register values + * overflow after a long time accumulation. + */ +#define KBASE_HWCNT_VALUE_BYTES (sizeof(u64)) /* Number of bits in an availability mask (i.e. max total number of block * instances supported in a Hardware Counter System) @@ -119,8 +123,8 @@ * contiguous, Hardware Counter Blocks. * @type: The arbitrary identifier used to identify the type of the block. * @inst_cnt: The number of Instances of the block. - * @hdr_cnt: The number of 32-bit Block Headers in the block. - * @ctr_cnt: The number of 32-bit Block Counters in the block. + * @hdr_cnt: The number of 64-bit Block Headers in the block. + * @ctr_cnt: The number of 64-bit Block Counters in the block. */ struct kbase_hwcnt_block_description { u64 type; @@ -165,17 +169,17 @@ struct kbase_hwcnt_description { * @type: The arbitrary identifier used to identify the type of the * block. * @inst_cnt: The number of Instances of the block. - * @hdr_cnt: The number of 32-bit Block Headers in the block. - * @ctr_cnt: The number of 32-bit Block Counters in the block. + * @hdr_cnt: The number of 64-bit Block Headers in the block. + * @ctr_cnt: The number of 64-bit Block Counters in the block. * @enable_map_index: Index in u64s into the parent's Enable Map where the * Enable Map bitfields of the Block Instances described by * this metadata start. * @enable_map_stride: Stride in u64s between the Enable Maps of each of the * Block Instances described by this metadata. - * @dump_buf_index: Index in u32s into the parent's Dump Buffer where the + * @dump_buf_index: Index in u64s into the parent's Dump Buffer where the * Dump Buffers of the Block Instances described by this * metadata start. - * @dump_buf_stride: Stride in u32s between the Dump Buffers of each of the + * @dump_buf_stride: Stride in u64s between the Dump Buffers of each of the * Block Instances described by this metadata. 
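The widening of KBASE_HWCNT_VALUE_BYTES above is easy to motivate with rough numbers: a u32 counter wraps at 4294967295, so a counter that increments roughly once per cycle on a GPU clocked around 1 GHz wraps after only about 4.3 seconds of accumulation, while the same counter held in a u64 would take centuries to wrap. The hardware still produces 32-bit values (KBASE_HWCNT_VALUE_HW_BYTES), but the in-driver representation no longer has to saturate.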
* @avail_mask_index: Index in bits into the parent's Availability Mask where * the Availability Masks of the Block Instances described @@ -208,7 +212,7 @@ struct kbase_hwcnt_block_metadata { * @enable_map_index: Index in u64s into the parent's Enable Map where the * Enable Maps of the blocks within the group described by * this metadata start. - * @dump_buf_index: Index in u32s into the parent's Dump Buffer where the + * @dump_buf_index: Index in u64s into the parent's Dump Buffer where the * Dump Buffers of the blocks within the group described by * metadata start. * @avail_mask_index: Index in bits into the parent's Availability Mask where @@ -225,7 +229,7 @@ struct kbase_hwcnt_group_metadata { }; /** - * struct kbase_hwcnt_metadata - Metadata describing the physical layout + * struct kbase_hwcnt_metadata - Metadata describing the memory layout * of Dump Buffers and Enable Maps within a * Hardware Counter System. * @grp_cnt: The number of Hardware Counter Groups. @@ -264,18 +268,17 @@ struct kbase_hwcnt_enable_map { }; /** - * struct kbase_hwcnt_dump_buffer - Hardware Counter Dump Buffer. Array of u32 - * values. + * struct kbase_hwcnt_dump_buffer - Hardware Counter Dump Buffer. * @metadata: Non-NULL pointer to metadata used to identify, and to describe * the layout of the Dump Buffer. - * @dump_buf: Non-NULL pointer of size metadata->dump_buf_bytes to an array - * of u32 values. + * @dump_buf: Non-NULL pointer to an array of u64 values, the array size is + * metadata->dump_buf_bytes. * @clk_cnt_buf: A pointer to an array of u64 values for cycle count elapsed * for each clock domain. */ struct kbase_hwcnt_dump_buffer { const struct kbase_hwcnt_metadata *metadata; - u32 *dump_buf; + u64 *dump_buf; u64 *clk_cnt_buf; }; @@ -283,7 +286,8 @@ struct kbase_hwcnt_dump_buffer { * struct kbase_hwcnt_dump_buffer_array - Hardware Counter Dump Buffer array. * @page_addr: Address of allocated pages. A single allocation is used for all * Dump Buffers in the array. - * @page_order: The allocation order of the pages. + * @page_order: The allocation order of the pages, the order is on a logarithmic + * scale. * @buf_cnt: The number of allocated Dump Buffers. * @bufs: Non-NULL pointer to the array of Dump Buffers. */ @@ -319,8 +323,14 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of hardware counter groups described by metadata. */ -#define kbase_hwcnt_metadata_group_count(metadata) \ - ((metadata)->grp_cnt) +static inline size_t +kbase_hwcnt_metadata_group_count(const struct kbase_hwcnt_metadata *metadata) +{ + if (WARN_ON(!metadata)) + return 0; + + return metadata->grp_cnt; +} /** * kbase_hwcnt_metadata_group_type() - Get the arbitrary type of a group. @@ -329,8 +339,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Type of the group grp. */ -#define kbase_hwcnt_metadata_group_type(metadata, grp) \ - ((metadata)->grp_metadata[(grp)].type) +static inline u64 +kbase_hwcnt_metadata_group_type(const struct kbase_hwcnt_metadata *metadata, + size_t grp) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt)) + return 0; + + return metadata->grp_metadata[grp].type; +} /** * kbase_hwcnt_metadata_block_count() - Get the number of blocks in a group. @@ -339,8 +356,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of blocks in group grp. 
*/ -#define kbase_hwcnt_metadata_block_count(metadata, grp) \ - ((metadata)->grp_metadata[(grp)].blk_cnt) +static inline size_t +kbase_hwcnt_metadata_block_count(const struct kbase_hwcnt_metadata *metadata, + size_t grp) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_cnt; +} /** * kbase_hwcnt_metadata_block_type() - Get the arbitrary type of a block. @@ -350,8 +374,16 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Type of the block blk in group grp. */ -#define kbase_hwcnt_metadata_block_type(metadata, grp, blk) \ - ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].type) +static inline u64 +kbase_hwcnt_metadata_block_type(const struct kbase_hwcnt_metadata *metadata, + size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_metadata[blk].type; +} /** * kbase_hwcnt_metadata_block_instance_count() - Get the number of instances of @@ -362,8 +394,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of instances of block blk in group grp. */ -#define kbase_hwcnt_metadata_block_instance_count(metadata, grp, blk) \ - ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].inst_cnt) +static inline size_t kbase_hwcnt_metadata_block_instance_count( + const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_metadata[blk].inst_cnt; +} /** * kbase_hwcnt_metadata_block_headers_count() - Get the number of counter @@ -374,8 +413,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of counter headers in each instance of block blk in group grp. */ -#define kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk) \ - ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].hdr_cnt) +static inline size_t kbase_hwcnt_metadata_block_headers_count( + const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_metadata[blk].hdr_cnt; +} /** * kbase_hwcnt_metadata_block_counters_count() - Get the number of counters. @@ -385,8 +431,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of counters in each instance of block blk in group grp. */ -#define kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk) \ - ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].ctr_cnt) +static inline size_t kbase_hwcnt_metadata_block_counters_count( + const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_metadata[blk].ctr_cnt; +} /** * kbase_hwcnt_metadata_block_enable_map_stride() - Get the enable map stride. @@ -396,8 +449,15 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: enable map stride in each instance of block blk in group grp. 
*/ -#define kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk) \ - ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].enable_map_stride) +static inline size_t kbase_hwcnt_metadata_block_enable_map_stride( + const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return metadata->grp_metadata[grp].blk_metadata[blk].enable_map_stride; +} /** * kbase_hwcnt_metadata_block_values_count() - Get the number of values. @@ -408,9 +468,16 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * Return: Number of headers plus counters in each instance of block blk * in group grp. */ -#define kbase_hwcnt_metadata_block_values_count(metadata, grp, blk) \ - (kbase_hwcnt_metadata_block_counters_count((metadata), (grp), (blk)) \ - + kbase_hwcnt_metadata_block_headers_count((metadata), (grp), (blk))) +static inline size_t kbase_hwcnt_metadata_block_values_count( + const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +{ + if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || + WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) + return 0; + + return kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk) + + kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk); +} /** * kbase_hwcnt_metadata_for_each_block() - Iterate over each block instance in @@ -496,19 +563,28 @@ void kbase_hwcnt_enable_map_free(struct kbase_hwcnt_enable_map *enable_map); /** * kbase_hwcnt_enable_map_block_instance() - Get the pointer to a block * instance's enable map. - * @map: Non-NULL pointer to (const) enable map. + * @map: Non-NULL pointer to enable map. * @grp: Index of the group in the metadata. * @blk: Index of the block in the group. * @blk_inst: Index of the block instance in the block. * - * Return: (const) u64* to the bitfield(s) used as the enable map for the + * Return: u64* to the bitfield(s) used as the enable map for the * block instance. */ -#define kbase_hwcnt_enable_map_block_instance(map, grp, blk, blk_inst) \ - ((map)->hwcnt_enable_map + \ - (map)->metadata->grp_metadata[(grp)].enable_map_index + \ - (map)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].enable_map_index + \ - (map)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].enable_map_stride * (blk_inst)) +static inline u64 * +kbase_hwcnt_enable_map_block_instance(const struct kbase_hwcnt_enable_map *map, + size_t grp, size_t blk, size_t blk_inst) +{ + return map->hwcnt_enable_map + + map->metadata->grp_metadata[grp].enable_map_index + + map->metadata->grp_metadata[grp] + .blk_metadata[blk] + .enable_map_index + + (map->metadata->grp_metadata[grp] + .blk_metadata[blk] + .enable_map_stride * + blk_inst); +} /** * kbase_hwcnt_bitfield_count() - Calculate the number of u64 bitfields required @@ -827,18 +903,24 @@ void kbase_hwcnt_dump_buffer_array_free( /** * kbase_hwcnt_dump_buffer_block_instance() - Get the pointer to a block * instance's dump buffer. - * @buf: Non-NULL pointer to (const) dump buffer. + * @buf: Non-NULL pointer to dump buffer. * @grp: Index of the group in the metadata. * @blk: Index of the block in the group. * @blk_inst: Index of the block instance in the block. * - * Return: (const) u32* to the dump buffer for the block instance. + * Return: u64* to the dump buffer for the block instance. 
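As a worked example of the index arithmetic in the accessor below (illustrative numbers): if the group's dump_buf_index is 0, a block's dump_buf_index within that group is 192 and its dump_buf_stride is 64, then instance 3 of that block begins 192 + 64 * 3 = 384 u64 values into dump_buf.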
*/ -#define kbase_hwcnt_dump_buffer_block_instance(buf, grp, blk, blk_inst) \ - ((buf)->dump_buf + \ - (buf)->metadata->grp_metadata[(grp)].dump_buf_index + \ - (buf)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].dump_buf_index + \ - (buf)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].dump_buf_stride * (blk_inst)) +static inline u64 *kbase_hwcnt_dump_buffer_block_instance( + const struct kbase_hwcnt_dump_buffer *buf, size_t grp, size_t blk, + size_t blk_inst) +{ + return buf->dump_buf + buf->metadata->grp_metadata[grp].dump_buf_index + + buf->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_index + + (buf->metadata->grp_metadata[grp] + .blk_metadata[blk] + .dump_buf_stride * + blk_inst); +} /** * kbase_hwcnt_dump_buffer_zero() - Zero all enabled values in dst. @@ -859,9 +941,8 @@ void kbase_hwcnt_dump_buffer_zero( * kbase_hwcnt_dump_buffer_block_instance. * @val_cnt: Number of values in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_zero( - u32 *dst_blk, - size_t val_cnt) +static inline void kbase_hwcnt_dump_buffer_block_zero(u64 *dst_blk, + size_t val_cnt) { memset(dst_blk, 0, (val_cnt * KBASE_HWCNT_VALUE_BYTES)); } @@ -904,10 +985,9 @@ void kbase_hwcnt_dump_buffer_zero_non_enabled( * kbase_hwcnt_enable_map_block_instance. * @val_cnt: Number of values in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_zero_non_enabled( - u32 *dst_blk, - const u64 *blk_em, - size_t val_cnt) +static inline void +kbase_hwcnt_dump_buffer_block_zero_non_enabled(u64 *dst_blk, const u64 *blk_em, + size_t val_cnt) { size_t val; @@ -941,10 +1021,9 @@ void kbase_hwcnt_dump_buffer_copy( * kbase_hwcnt_dump_buffer_block_instance. * @val_cnt: Number of values in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_copy( - u32 *dst_blk, - const u32 *src_blk, - size_t val_cnt) +static inline void kbase_hwcnt_dump_buffer_block_copy(u64 *dst_blk, + const u64 *src_blk, + size_t val_cnt) { /* Copy all the counters in the block instance. * Values of non-enabled counters are undefined. @@ -987,11 +1066,10 @@ void kbase_hwcnt_dump_buffer_copy_strict( * * After the copy, any disabled values in dst will be zero. */ -static inline void kbase_hwcnt_dump_buffer_block_copy_strict( - u32 *dst_blk, - const u32 *src_blk, - const u64 *blk_em, - size_t val_cnt) +static inline void kbase_hwcnt_dump_buffer_block_copy_strict(u64 *dst_blk, + const u64 *src_blk, + const u64 *blk_em, + size_t val_cnt) { size_t val; @@ -1032,11 +1110,10 @@ void kbase_hwcnt_dump_buffer_accumulate( * @hdr_cnt: Number of headers in the block. * @ctr_cnt: Number of counters in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_accumulate( - u32 *dst_blk, - const u32 *src_blk, - size_t hdr_cnt, - size_t ctr_cnt) +static inline void kbase_hwcnt_dump_buffer_block_accumulate(u64 *dst_blk, + const u64 *src_blk, + size_t hdr_cnt, + size_t ctr_cnt) { size_t ctr; /* Copy all the headers in the block instance. @@ -1047,21 +1124,8 @@ static inline void kbase_hwcnt_dump_buffer_block_accumulate( /* Accumulate all the counters in the block instance. * Values of non-enabled counters are undefined. 
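A concrete example of how the accumulation below changes with 64-bit storage: if dst already holds 0xFFFF0000 (4294901760) for a counter and the new sample in src is 0x00020000 (131072), the removed u32 path clamps the result to U32_MAX (4294967295), whereas the plain u64 addition that replaces it stores the exact sum 4295032832.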
*/ - for (ctr = hdr_cnt; ctr < ctr_cnt + hdr_cnt; ctr++) { - u32 *dst_ctr = dst_blk + ctr; - const u32 *src_ctr = src_blk + ctr; - - const u32 src_counter = *src_ctr; - const u32 dst_counter = *dst_ctr; - - /* Saturating add */ - u32 accumulated = src_counter + dst_counter; - - if (accumulated < src_counter) - accumulated = U32_MAX; - - *dst_ctr = accumulated; - } + for (ctr = hdr_cnt; ctr < ctr_cnt + hdr_cnt; ctr++) + dst_blk[ctr] += src_blk[ctr]; } /** @@ -1103,10 +1167,7 @@ void kbase_hwcnt_dump_buffer_accumulate_strict( * @ctr_cnt: Number of counters in the block. */ static inline void kbase_hwcnt_dump_buffer_block_accumulate_strict( - u32 *dst_blk, - const u32 *src_blk, - const u64 *blk_em, - size_t hdr_cnt, + u64 *dst_blk, const u64 *src_blk, const u64 *blk_em, size_t hdr_cnt, size_t ctr_cnt) { size_t ctr; @@ -1118,25 +1179,16 @@ static inline void kbase_hwcnt_dump_buffer_block_accumulate_strict( bool ctr_enabled = kbase_hwcnt_enable_map_block_value_enabled( blk_em, ctr); - u32 *dst_ctr = dst_blk + ctr; - const u32 *src_ctr = src_blk + ctr; - - const u32 src_counter = *src_ctr; - const u32 dst_counter = *dst_ctr; - - /* Saturating add */ - u32 accumulated = src_counter + dst_counter; - - if (accumulated < src_counter) - accumulated = U32_MAX; - - *dst_ctr = ctr_enabled ? accumulated : 0; + if (ctr_enabled) + dst_blk[ctr] += src_blk[ctr]; + else + dst_blk[ctr] = 0; } } -/* - * Iterate over each clock domain in the metadata. - * +/** + * kbase_hwcnt_metadata_for_each_clock() - Iterate over each clock domain in the + * metadata. * @md: Non-NULL pointer to metadata. * @clk: size_t variable used as clock iterator. */ diff --git a/mali_kbase/mali_kbase_jd.c b/mali_kbase/mali_kbase_jd.c index 2b071dd..c892455 100644 --- a/mali_kbase/mali_kbase_jd.c +++ b/mali_kbase/mali_kbase_jd.c @@ -76,6 +76,7 @@ static void jd_mark_atom_complete(struct kbase_jd_atom *katom) kbase_kinstr_jm_atom_complete(katom); dev_dbg(katom->kctx->kbdev->dev, "Atom %pK status to completed\n", (void *)katom); + KBASE_TLSTREAM_TL_JD_ATOM_COMPLETE(katom->kctx->kbdev, katom); } /* Runs an atom, either by handing to the JS or by immediately running it in the case of soft-jobs @@ -139,7 +140,13 @@ void kbase_jd_dep_clear_locked(struct kbase_jd_atom *katom) /* katom dep complete, attempt to run it */ bool resched = false; + KBASE_TLSTREAM_TL_RUN_ATOM_START( + katom->kctx->kbdev, katom, + kbase_jd_atom_id(katom->kctx, katom)); resched = jd_run_atom(katom); + KBASE_TLSTREAM_TL_RUN_ATOM_END(katom->kctx->kbdev, katom, + kbase_jd_atom_id(katom->kctx, + katom)); if (katom->status == KBASE_JD_ATOM_STATE_COMPLETED) { /* The atom has already finished */ @@ -715,6 +722,8 @@ bool jd_done_nolock(struct kbase_jd_atom *katom, bool need_to_try_schedule_context = false; int i; + KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_START(kctx->kbdev, katom); + INIT_LIST_HEAD(&completed_jobs); INIT_LIST_HEAD(&runnable_jobs); @@ -736,6 +745,7 @@ bool jd_done_nolock(struct kbase_jd_atom *katom, } jd_mark_atom_complete(katom); + list_add_tail(&katom->jd_item, &completed_jobs); while (!list_empty(&completed_jobs)) { @@ -767,7 +777,13 @@ bool jd_done_nolock(struct kbase_jd_atom *katom, if (node->status != KBASE_JD_ATOM_STATE_COMPLETED && !kbase_ctx_flag(kctx, KCTX_DYING)) { + KBASE_TLSTREAM_TL_RUN_ATOM_START( + kctx->kbdev, node, + kbase_jd_atom_id(kctx, node)); need_to_try_schedule_context |= jd_run_atom(node); + KBASE_TLSTREAM_TL_RUN_ATOM_END( + kctx->kbdev, node, + kbase_jd_atom_id(kctx, node)); } else { node->event_code = katom->event_code; @@ -811,7 +827,7 @@ 
bool jd_done_nolock(struct kbase_jd_atom *katom, */ wake_up(&kctx->jctx.zero_jobs_wait); } - + KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_END(kctx->kbdev, katom); return need_to_try_schedule_context; } @@ -984,7 +1000,6 @@ static bool jd_submit_atom(struct kbase_context *const kctx, * dependencies. */ jd_trace_atom_submit(kctx, katom, NULL); - return jd_done_nolock(katom, NULL); } } @@ -1049,7 +1064,6 @@ static bool jd_submit_atom(struct kbase_context *const kctx, if (err >= 0) kbase_finish_soft_job(katom); } - return jd_done_nolock(katom, NULL); } @@ -1378,10 +1392,10 @@ while (false) } mutex_lock(&jctx->lock); } - + KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_START(kbdev, katom); need_to_try_schedule_context |= jd_submit_atom(kctx, &user_atom, &user_jc_incr, katom); - + KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_END(kbdev, katom); /* Register a completed job as a disjoint event when the GPU is in a disjoint state * (ie. being reset). */ @@ -1479,7 +1493,6 @@ void kbase_jd_done_worker(struct work_struct *data) kbasep_js_remove_job(kbdev, kctx, katom); mutex_unlock(&js_kctx_info->ctx.jsctx_mutex); mutex_unlock(&js_devdata->queue_mutex); - katom->atom_flags &= ~KBASE_KATOM_FLAG_HOLDING_CTX_REF; /* jd_done_nolock() requires the jsctx_mutex lock to be dropped */ jd_done_nolock(katom, &kctx->completed_jobs); @@ -1498,22 +1511,23 @@ void kbase_jd_done_worker(struct work_struct *data) * drop our reference. But do not call kbase_jm_idle_ctx(), as * the context is active and fast-starting is allowed. * - * If an atom has been fast-started then kctx->atoms_pulled will - * be non-zero but KCTX_ACTIVE will still be false (as the - * previous pm reference has been inherited). Do NOT drop our - * reference, as it has been re-used, and leave the context as - * active. + * If an atom has been fast-started then + * kbase_jsctx_atoms_pulled(kctx) will return non-zero but + * KCTX_ACTIVE will still be false (as the previous pm + * reference has been inherited). Do NOT drop our reference, as + * it has been re-used, and leave the context as active. * - * If no new atoms have been started then KCTX_ACTIVE will still - * be false and atoms_pulled will be zero, so drop the reference - * and call kbase_jm_idle_ctx(). + * If no new atoms have been started then KCTX_ACTIVE will + * still be false and kbase_jsctx_atoms_pulled(kctx) will + * return zero, so drop the reference and call + * kbase_jm_idle_ctx(). * * As the checks are done under both the queue_mutex and * hwaccess_lock is should be impossible for this to race * with the scheduler code. */ if (kbase_ctx_flag(kctx, KCTX_ACTIVE) || - !atomic_read(&kctx->atoms_pulled)) { + !kbase_jsctx_atoms_pulled(kctx)) { /* Calling kbase_jm_idle_ctx() here will ensure that * atoms are not fast-started when we drop the * hwaccess_lock. 
This is not performed if diff --git a/mali_kbase/mali_kbase_jm.c b/mali_kbase/mali_kbase_jm.c index 6995050..898606b 100644 --- a/mali_kbase/mali_kbase_jm.c +++ b/mali_kbase/mali_kbase_jm.c @@ -132,6 +132,9 @@ struct kbase_jd_atom *kbase_jm_return_atom_to_js(struct kbase_device *kbdev, dev_dbg(kbdev->dev, "Atom %pK is returning with event code 0x%x\n", (void *)katom, katom->event_code); + KBASE_KTRACE_ADD_JM(kbdev, JM_RETURN_ATOM_TO_JS, katom->kctx, katom, + katom->jc, katom->event_code); + if (katom->event_code != BASE_JD_EVENT_STOPPED && katom->event_code != BASE_JD_EVENT_REMOVED_FROM_NEXT) { return kbase_js_complete_atom(katom, NULL); diff --git a/mali_kbase/mali_kbase_jm.h b/mali_kbase/mali_kbase_jm.h index c6b28f3..eeafcb6 100644 --- a/mali_kbase/mali_kbase_jm.h +++ b/mali_kbase/mali_kbase_jm.h @@ -84,7 +84,7 @@ void kbase_jm_try_kick_all(struct kbase_device *kbdev); * by kbase_js_use_ctx(). * * The context should have no atoms currently pulled from it - * (kctx->atoms_pulled == 0). + * (kbase_jsctx_atoms_pulled(kctx) == 0). * * Caller must hold the hwaccess_lock */ diff --git a/mali_kbase/mali_kbase_js.c b/mali_kbase/mali_kbase_js.c index 3682486..799c7e5 100644 --- a/mali_kbase/mali_kbase_js.c +++ b/mali_kbase/mali_kbase_js.c @@ -372,8 +372,6 @@ jsctx_rb_pull(struct kbase_context *kctx, struct kbase_jd_atom *katom) rb_erase(&katom->runnable_tree_node, &rb->runnable_tree); } -#define LESS_THAN_WRAP(a, b) ((s32)(a - b) < 0) - static void jsctx_tree_add(struct kbase_context *kctx, struct kbase_jd_atom *katom) { @@ -393,7 +391,7 @@ jsctx_tree_add(struct kbase_context *kctx, struct kbase_jd_atom *katom) struct kbase_jd_atom, runnable_tree_node); parent = *new; - if (LESS_THAN_WRAP(katom->age, entry->age)) + if (kbase_jd_atom_is_younger(katom, entry)) new = &((*new)->rb_left); else new = &((*new)->rb_right); @@ -421,6 +419,9 @@ jsctx_rb_unpull(struct kbase_context *kctx, struct kbase_jd_atom *katom) { lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + KBASE_KTRACE_ADD_JM(kctx->kbdev, JS_UNPULL_JOB, kctx, katom, katom->jc, + 0u); + jsctx_tree_add(kctx, katom); } @@ -434,6 +435,67 @@ static bool kbase_js_ctx_list_add_unpullable_nolock(struct kbase_device *kbdev, struct kbase_context *kctx, int js); +typedef bool(katom_ordering_func)(const struct kbase_jd_atom *, + const struct kbase_jd_atom *); + +bool kbase_js_atom_runs_before(struct kbase_device *kbdev, + const struct kbase_jd_atom *katom_a, + const struct kbase_jd_atom *katom_b, + const kbase_atom_ordering_flag_t order_flags) +{ + struct kbase_context *kctx_a = katom_a->kctx; + struct kbase_context *kctx_b = katom_b->kctx; + katom_ordering_func *samectxatomprio_ordering_func = + kbase_jd_atom_is_younger; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (order_flags & KBASE_ATOM_ORDERING_FLAG_SEQNR) + samectxatomprio_ordering_func = kbase_jd_atom_is_earlier; + + /* It only makes sense to make this test for atoms on the same slot */ + WARN_ON(katom_a->slot_nr != katom_b->slot_nr); + + if (kbdev->js_ctx_scheduling_mode == + KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE) { + /* In local priority mode, querying either way around for "a + * should run before b" and "b should run before a" should + * always be false when they're from different contexts + */ + if (kctx_a != kctx_b) + return false; + } else { + /* In system priority mode, ordering is done first strictly by + * context priority, even when katom_b might be lower priority + * than katom_a. 
This is due to scheduling of contexts in order + * of highest priority first, regardless of whether the atoms + * for a particular slot from such contexts have the highest + * priority or not. + */ + if (kctx_a != kctx_b) { + if (kctx_a->priority < kctx_b->priority) + return true; + if (kctx_a->priority > kctx_b->priority) + return false; + } + } + + /* For same contexts/contexts with the same context priority (in system + * priority mode), ordering is next done by atom priority + */ + if (katom_a->sched_priority < katom_b->sched_priority) + return true; + if (katom_a->sched_priority > katom_b->sched_priority) + return false; + /* For atoms of same priority on the same kctx, they are + * ordered by seq_nr/age (dependent on caller) + */ + if (kctx_a == kctx_b && samectxatomprio_ordering_func(katom_a, katom_b)) + return true; + + return false; +} + /* * Functions private to KBase ('Protected' functions) */ @@ -475,6 +537,7 @@ int kbasep_js_devdata_init(struct kbase_device * const kbdev) jsdd->hard_stop_ticks_dumping = DEFAULT_JS_HARD_STOP_TICKS_DUMPING; jsdd->gpu_reset_ticks_ss = DEFAULT_JS_RESET_TICKS_SS; jsdd->gpu_reset_ticks_cl = DEFAULT_JS_RESET_TICKS_CL; + jsdd->gpu_reset_ticks_dumping = DEFAULT_JS_RESET_TICKS_DUMPING; jsdd->ctx_timeslice_ns = DEFAULT_JS_CTX_TIMESLICE_NS; atomic_set(&jsdd->soft_job_timeout_ms, DEFAULT_JS_SOFT_JOB_TIMEOUT); @@ -662,6 +725,147 @@ void kbasep_js_kctx_term(struct kbase_context *kctx) } } +/* + * Priority blocking management functions + */ + +/* Should not normally use directly - use kbase_jsctx_slot_atom_pulled_dec() instead */ +static void kbase_jsctx_slot_prio_blocked_clear(struct kbase_context *kctx, + int js, int sched_prio) +{ + struct kbase_jsctx_slot_tracking *slot_tracking = + &kctx->slot_tracking[js]; + + lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + + slot_tracking->blocked &= ~(((kbase_js_prio_bitmap_t)1) << sched_prio); + KBASE_KTRACE_ADD_JM_SLOT_INFO(kctx->kbdev, JS_SLOT_PRIO_UNBLOCKED, kctx, + NULL, 0, js, (unsigned int)sched_prio); +} + +static int kbase_jsctx_slot_atoms_pulled(struct kbase_context *kctx, int js) +{ + return atomic_read(&kctx->slot_tracking[js].atoms_pulled); +} + +/* + * A priority level on a slot is blocked when: + * - that priority level is blocked + * - or, any higher priority level is blocked + */ +static bool kbase_jsctx_slot_prio_is_blocked(struct kbase_context *kctx, int js, + int sched_prio) +{ + struct kbase_jsctx_slot_tracking *slot_tracking = + &kctx->slot_tracking[js]; + kbase_js_prio_bitmap_t prio_bit, higher_prios_mask; + + lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + + /* done in two separate shifts to prevent future undefined behavior + * should the number of priority levels == (bit width of the type) + */ + prio_bit = (((kbase_js_prio_bitmap_t)1) << sched_prio); + /* all bits of sched_prio or higher, with sched_prio = 0 being the + * highest priority + */ + higher_prios_mask = (prio_bit << 1) - 1u; + return (slot_tracking->blocked & higher_prios_mask) != 0u; +} + +/** + * kbase_jsctx_slot_atom_pulled_inc - Increase counts of atoms that have being + * pulled for a slot from a ctx, based on + * this atom + * @kctx: kbase context + * @katom: atom pulled + * + * Manages counts of atoms pulled (including per-priority-level counts), for + * later determining when a ctx can become unblocked on a slot. + * + * Once a slot has been blocked at @katom's priority level, it should not be + * pulled from, hence this function should not be called in that case. 
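[Illustrative note] The blocked-level test used by kbase_jsctx_slot_prio_is_blocked() reduces to a single mask comparison: build a mask covering the bit for sched_prio and every numerically lower (i.e. higher-priority) bit, then test it against the slot's blocked bitmap. A standalone sketch of the same arithmetic, with an assumed 32-bit bitmap type:

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t prio_bitmap_t; /* stand-in for kbase_js_prio_bitmap_t */

/* Priority 0 is highest; a level is blocked if it or any higher level is. */
static bool prio_is_blocked(prio_bitmap_t blocked, unsigned int sched_prio)
{
	prio_bitmap_t prio_bit = (prio_bitmap_t)1 << sched_prio;
	/* All bits from bit 0 up to and including sched_prio. */
	prio_bitmap_t this_and_higher = (prio_bit << 1) - 1u;

	return (blocked & this_and_higher) != 0u;
}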
+ * + * The return value is to aid tracking of when @kctx becomes runnable. + * + * Return: new total count of atoms pulled from all slots on @kctx + */ +static int kbase_jsctx_slot_atom_pulled_inc(struct kbase_context *kctx, + const struct kbase_jd_atom *katom) +{ + int js = katom->slot_nr; + int sched_prio = katom->sched_priority; + struct kbase_jsctx_slot_tracking *slot_tracking = + &kctx->slot_tracking[js]; + int nr_atoms_pulled; + + lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + + WARN(kbase_jsctx_slot_prio_is_blocked(kctx, js, sched_prio), + "Should not have pulled atoms for slot %d from a context that is blocked at priority %d or higher", + js, sched_prio); + + nr_atoms_pulled = atomic_inc_return(&kctx->atoms_pulled_all_slots); + atomic_inc(&slot_tracking->atoms_pulled); + slot_tracking->atoms_pulled_pri[sched_prio]++; + + return nr_atoms_pulled; +} + +/** + * kbase_jsctx_slot_atom_pulled_dec- Decrease counts of atoms that have being + * pulled for a slot from a ctx, and + * re-evaluate whether a context is blocked + * on this slot + * @kctx: kbase context + * @katom: atom that has just been removed from a job slot + * + * @kctx can become unblocked on a slot for a priority level when it no longer + * has any pulled atoms at that priority level on that slot, and all higher + * (numerically lower) priority levels are also unblocked @kctx on that + * slot. The latter condition is to retain priority ordering within @kctx. + * + * Return: true if the slot was previously blocked but has now become unblocked + * at @katom's priority level, false otherwise. + */ +static bool kbase_jsctx_slot_atom_pulled_dec(struct kbase_context *kctx, + const struct kbase_jd_atom *katom) +{ + int js = katom->slot_nr; + int sched_prio = katom->sched_priority; + int atoms_pulled_pri; + struct kbase_jsctx_slot_tracking *slot_tracking = + &kctx->slot_tracking[js]; + bool slot_prio_became_unblocked = false; + + lockdep_assert_held(&kctx->kbdev->hwaccess_lock); + + atomic_dec(&kctx->atoms_pulled_all_slots); + atomic_dec(&slot_tracking->atoms_pulled); + + atoms_pulled_pri = --(slot_tracking->atoms_pulled_pri[sched_prio]); + + /* We can safely clear this priority level's blocked status even if + * higher priority levels are still blocked: a subsequent query to + * kbase_jsctx_slot_prio_is_blocked() will still return true + */ + if (!atoms_pulled_pri && + kbase_jsctx_slot_prio_is_blocked(kctx, js, sched_prio)) { + kbase_jsctx_slot_prio_blocked_clear(kctx, js, sched_prio); + + if (!kbase_jsctx_slot_prio_is_blocked(kctx, js, sched_prio)) + slot_prio_became_unblocked = true; + } + + if (slot_prio_became_unblocked) + KBASE_KTRACE_ADD_JM_SLOT_INFO(kctx->kbdev, + JS_SLOT_PRIO_AND_HIGHER_UNBLOCKED, + kctx, katom, katom->jc, js, + (unsigned int)sched_prio); + + return slot_prio_became_unblocked; +} + /** * kbase_js_ctx_list_add_pullable_nolock - Variant of * kbase_jd_ctx_list_add_pullable() @@ -694,7 +898,7 @@ static bool kbase_js_ctx_list_add_pullable_nolock(struct kbase_device *kbdev, if (!kctx->slots_pullable) { kbdev->js_data.nr_contexts_pullable++; ret = true; - if (!atomic_read(&kctx->atoms_pulled)) { + if (!kbase_jsctx_atoms_pulled(kctx)) { WARN_ON(kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_set(kctx, KCTX_RUNNABLE_REF); atomic_inc(&kbdev->js_data.nr_contexts_runnable); @@ -736,7 +940,7 @@ static bool kbase_js_ctx_list_add_pullable_head_nolock( if (!kctx->slots_pullable) { kbdev->js_data.nr_contexts_pullable++; ret = true; - if (!atomic_read(&kctx->atoms_pulled)) { + if 
(!kbase_jsctx_atoms_pulled(kctx)) { WARN_ON(kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_set(kctx, KCTX_RUNNABLE_REF); atomic_inc(&kbdev->js_data.nr_contexts_runnable); @@ -809,7 +1013,7 @@ static bool kbase_js_ctx_list_add_unpullable_nolock(struct kbase_device *kbdev, if (kctx->slots_pullable == (1 << js)) { kbdev->js_data.nr_contexts_pullable--; ret = true; - if (!atomic_read(&kctx->atoms_pulled)) { + if (!kbase_jsctx_atoms_pulled(kctx)) { WARN_ON(!kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_clear(kctx, KCTX_RUNNABLE_REF); atomic_dec(&kbdev->js_data.nr_contexts_runnable); @@ -851,7 +1055,7 @@ static bool kbase_js_ctx_list_remove_nolock(struct kbase_device *kbdev, if (kctx->slots_pullable == (1 << js)) { kbdev->js_data.nr_contexts_pullable--; ret = true; - if (!atomic_read(&kctx->atoms_pulled)) { + if (!kbase_jsctx_atoms_pulled(kctx)) { WARN_ON(!kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_clear(kctx, KCTX_RUNNABLE_REF); atomic_dec(&kbdev->js_data.nr_contexts_runnable); @@ -958,9 +1162,12 @@ static bool kbase_js_ctx_pullable(struct kbase_context *kctx, int js, (void *)kctx, js); return false; /* No pullable atoms */ } - if (kctx->blocked_js[js][katom->sched_priority]) { + if (kbase_jsctx_slot_prio_is_blocked(kctx, js, katom->sched_priority)) { + KBASE_KTRACE_ADD_JM_SLOT_INFO( + kctx->kbdev, JS_SLOT_PRIO_IS_BLOCKED, kctx, katom, + katom->jc, js, (unsigned int)katom->sched_priority); dev_dbg(kbdev->dev, - "JS: kctx %pK is blocked from submitting atoms at priority %d (s:%d)\n", + "JS: kctx %pK is blocked from submitting atoms at priority %d and lower (s:%d)\n", (void *)kctx, katom->sched_priority, js); return false; } @@ -2493,9 +2700,9 @@ struct kbase_jd_atom *kbase_js_pull(struct kbase_context *kctx, int js) (void *)kctx, js); return NULL; } - if (kctx->blocked_js[js][katom->sched_priority]) { + if (kbase_jsctx_slot_prio_is_blocked(kctx, js, katom->sched_priority)) { dev_dbg(kbdev->dev, - "JS: kctx %pK is blocked from submitting atoms at priority %d (s:%d)\n", + "JS: kctx %pK is blocked from submitting atoms at priority %d and lower (s:%d)\n", (void *)kctx, katom->sched_priority, js); return NULL; } @@ -2509,7 +2716,7 @@ struct kbase_jd_atom *kbase_js_pull(struct kbase_context *kctx, int js) * not allow multiple runs of fail-dep atoms from the same context to be * present on the same slot */ - if (katom->pre_dep && atomic_read(&kctx->atoms_pulled_slot[js])) { + if (katom->pre_dep && kbase_jsctx_slot_atoms_pulled(kctx, js)) { struct kbase_jd_atom *prev_atom = kbase_backend_inspect_tail(kbdev, js); @@ -2535,23 +2742,21 @@ struct kbase_jd_atom *kbase_js_pull(struct kbase_context *kctx, int js) } } + KBASE_KTRACE_ADD_JM_SLOT_INFO(kbdev, JS_PULL_JOB, kctx, katom, + katom->jc, js, katom->sched_priority); kbase_ctx_flag_set(kctx, KCTX_PULLED); kbase_ctx_flag_set(kctx, (KCTX_PULLED_SINCE_ACTIVE_JS0 << js)); - pulled = atomic_inc_return(&kctx->atoms_pulled); + pulled = kbase_jsctx_slot_atom_pulled_inc(kctx, katom); if (pulled == 1 && !kctx->slots_pullable) { WARN_ON(kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_set(kctx, KCTX_RUNNABLE_REF); atomic_inc(&kbdev->js_data.nr_contexts_runnable); } - atomic_inc(&kctx->atoms_pulled_slot[katom->slot_nr]); - kctx->atoms_pulled_slot_pri[katom->slot_nr][katom->sched_priority]++; jsctx_rb_pull(kctx, katom); kbase_ctx_sched_retain_ctx_refcount(kctx); - katom->atom_flags |= KBASE_KATOM_FLAG_HOLDING_CTX_REF; - katom->ticks = 0; dev_dbg(kbdev->dev, "JS: successfully pulled atom %pK from kctx %pK (s:%d)\n", @@ -2773,15 
+2978,18 @@ static void js_return_worker(struct work_struct *data) struct kbasep_js_kctx_info *js_kctx_info = &kctx->jctx.sched_info; struct kbasep_js_atom_retained_state retained_state; int js = katom->slot_nr; - int prio = katom->sched_priority; + bool slot_became_unblocked; bool timer_sync = false; bool context_idle = false; unsigned long flags; base_jd_core_req core_req = katom->core_req; + u64 cache_jc = katom->jc; dev_dbg(kbdev->dev, "%s for atom %pK with event code 0x%x\n", __func__, (void *)katom, katom->event_code); + KBASE_KTRACE_ADD_JM(kbdev, JS_RETURN_WORKER, kctx, katom, katom->jc, 0); + if (katom->event_code != BASE_JD_EVENT_END_RP_DONE) KBASE_TLSTREAM_TL_EVENT_ATOM_SOFTSTOP_EX(kbdev, katom); @@ -2792,37 +3000,27 @@ static void js_return_worker(struct work_struct *data) mutex_lock(&js_devdata->queue_mutex); mutex_lock(&js_kctx_info->ctx.jsctx_mutex); - atomic_dec(&kctx->atoms_pulled); - atomic_dec(&kctx->atoms_pulled_slot[js]); - if (katom->event_code != BASE_JD_EVENT_END_RP_DONE) atomic_dec(&katom->blocked); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kctx->atoms_pulled_slot_pri[js][katom->sched_priority]--; + slot_became_unblocked = kbase_jsctx_slot_atom_pulled_dec(kctx, katom); - if (!atomic_read(&kctx->atoms_pulled_slot[js]) && - jsctx_rb_none_to_pull(kctx, js)) + if (!kbase_jsctx_slot_atoms_pulled(kctx, js) && + jsctx_rb_none_to_pull(kctx, js)) timer_sync |= kbase_js_ctx_list_remove_nolock(kbdev, kctx, js); - /* If this slot has been blocked due to soft-stopped atoms, and all - * atoms have now been processed, then unblock the slot + /* If the context is now unblocked on this slot after soft-stopped + * atoms, then only mark it as pullable on this slot if it is not + * idle */ - if (!kctx->atoms_pulled_slot_pri[js][prio] && - kctx->blocked_js[js][prio]) { - kctx->blocked_js[js][prio] = false; + if (slot_became_unblocked && kbase_jsctx_atoms_pulled(kctx) && + kbase_js_ctx_pullable(kctx, js, true)) + timer_sync |= + kbase_js_ctx_list_add_pullable_nolock(kbdev, kctx, js); - /* Only mark the slot as pullable if the context is not idle - - * that case is handled below - */ - if (atomic_read(&kctx->atoms_pulled) && - kbase_js_ctx_pullable(kctx, js, true)) - timer_sync |= kbase_js_ctx_list_add_pullable_nolock( - kbdev, kctx, js); - } - - if (!atomic_read(&kctx->atoms_pulled)) { + if (!kbase_jsctx_atoms_pulled(kctx)) { dev_dbg(kbdev->dev, "No atoms currently pulled from context %pK\n", (void *)kctx); @@ -2890,7 +3088,6 @@ static void js_return_worker(struct work_struct *data) mutex_unlock(&kctx->jctx.lock); } - katom->atom_flags &= ~KBASE_KATOM_FLAG_HOLDING_CTX_REF; dev_dbg(kbdev->dev, "JS: retained state %s finished", kbasep_js_has_atom_finished(&retained_state) ? 
"has" : "hasn't"); @@ -2904,6 +3101,9 @@ static void js_return_worker(struct work_struct *data) kbase_backend_complete_wq_post_sched(kbdev, core_req); + KBASE_KTRACE_ADD_JM(kbdev, JS_RETURN_WORKER_END, kctx, NULL, cache_jc, + 0); + dev_dbg(kbdev->dev, "Leaving %s for atom %pK\n", __func__, (void *)katom); } @@ -3113,15 +3313,16 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, spin_lock_irqsave(&kbdev->hwaccess_lock, flags); if (katom->atom_flags & KBASE_KATOM_FLAG_JSCTX_IN_TREE) { + bool slot_became_unblocked; + dev_dbg(kbdev->dev, "Atom %pK is in runnable_tree\n", (void *)katom); - context_idle = !atomic_dec_return(&kctx->atoms_pulled); - atomic_dec(&kctx->atoms_pulled_slot[atom_slot]); - kctx->atoms_pulled_slot_pri[atom_slot][prio]--; + slot_became_unblocked = + kbase_jsctx_slot_atom_pulled_dec(kctx, katom); + context_idle = !kbase_jsctx_atoms_pulled(kctx); - if (!atomic_read(&kctx->atoms_pulled) && - !kctx->slots_pullable) { + if (!kbase_jsctx_atoms_pulled(kctx) && !kctx->slots_pullable) { WARN_ON(!kbase_ctx_flag(kctx, KCTX_RUNNABLE_REF)); kbase_ctx_flag_clear(kctx, KCTX_RUNNABLE_REF); atomic_dec(&kbdev->js_data.nr_contexts_runnable); @@ -3129,15 +3330,14 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, } /* If this slot has been blocked due to soft-stopped atoms, and - * all atoms have now been processed, then unblock the slot + * all atoms have now been processed at this priority level and + * higher, then unblock the slot */ - if (!kctx->atoms_pulled_slot_pri[atom_slot][prio] - && kctx->blocked_js[atom_slot][prio]) { + if (slot_became_unblocked) { dev_dbg(kbdev->dev, - "kctx %pK is no longer blocked from submitting on slot %d at priority %d\n", + "kctx %pK is no longer blocked from submitting on slot %d at priority %d or higher\n", (void *)kctx, atom_slot, prio); - kctx->blocked_js[atom_slot][prio] = false; if (kbase_js_ctx_pullable(kctx, atom_slot, true)) timer_sync |= kbase_js_ctx_list_add_pullable_nolock( @@ -3146,8 +3346,8 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, } WARN_ON(!(katom->atom_flags & KBASE_KATOM_FLAG_JSCTX_IN_TREE)); - if (!atomic_read(&kctx->atoms_pulled_slot[atom_slot]) && - jsctx_rb_none_to_pull(kctx, atom_slot)) { + if (!kbase_jsctx_slot_atoms_pulled(kctx, atom_slot) && + jsctx_rb_none_to_pull(kctx, atom_slot)) { if (!list_empty( &kctx->jctx.sched_info.ctx.ctx_list_entry[atom_slot])) timer_sync |= kbase_js_ctx_list_remove_nolock( @@ -3160,8 +3360,8 @@ bool kbase_js_complete_atom_wq(struct kbase_context *kctx, * re-enable submission so that context can be scheduled again. 
*/ if (!kbasep_js_is_submit_allowed(js_devdata, kctx) && - !atomic_read(&kctx->atoms_pulled) && - !kbase_ctx_flag(kctx, KCTX_DYING)) { + !kbase_jsctx_atoms_pulled(kctx) && + !kbase_ctx_flag(kctx, KCTX_DYING)) { int js; kbasep_js_set_submit_allowed(js_devdata, kctx); @@ -3297,7 +3497,9 @@ struct kbase_jd_atom *kbase_js_complete_atom(struct kbase_jd_atom *katom, trace_sysgraph_gpu(SGR_COMPLETE, kctx->id, kbase_jd_atom_id(katom->kctx, katom), katom->slot_nr); + KBASE_TLSTREAM_TL_JD_DONE_START(kbdev, katom); kbase_jd_done(katom, katom->slot_nr, end_timestamp, 0); + KBASE_TLSTREAM_TL_JD_DONE_END(kbdev, katom); /* Unblock cross dependency if present */ if (x_dep && (katom->event_code == BASE_JD_EVENT_DONE || @@ -3405,6 +3607,8 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) bool ctx_waiting[BASE_JM_MAX_NR_SLOTS]; int js; + KBASE_TLSTREAM_TL_JS_SCHED_START(kbdev, 0); + dev_dbg(kbdev->dev, "%s kbdev %pK mask 0x%x\n", __func__, (void *)kbdev, (unsigned int)js_mask); @@ -3460,6 +3664,8 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) &kctx->jctx.sched_info.ctx.jsctx_mutex); mutex_unlock(&js_devdata->queue_mutex); up(&js_devdata->schedule_sem); + KBASE_TLSTREAM_TL_JS_SCHED_END(kbdev, + 0); return; } kbase_ctx_flag_set(kctx, KCTX_ACTIVE); @@ -3604,6 +3810,7 @@ void kbase_js_sched(struct kbase_device *kbdev, int js_mask) mutex_unlock(&js_devdata->queue_mutex); up(&js_devdata->schedule_sem); + KBASE_TLSTREAM_TL_JS_SCHED_END(kbdev, 0); } void kbase_js_zap_context(struct kbase_context *kctx) diff --git a/mali_kbase/mali_kbase_kinstr_prfcnt.c b/mali_kbase/mali_kbase_kinstr_prfcnt.c new file mode 100644 index 0000000..ce996ca --- /dev/null +++ b/mali_kbase/mali_kbase_kinstr_prfcnt.c @@ -0,0 +1,1184 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + */ + +#include "mali_kbase_kinstr_prfcnt.h" +#include "mali_kbase_hwcnt_virtualizer.h" +#include "mali_kbase_hwcnt_types.h" +#include <uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h> +#include "mali_kbase_hwcnt_gpu.h" +#include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> +#include "mali_malisw.h" +#include "mali_kbase_debug.h" + +#include <linux/anon_inodes.h> +#include <linux/fcntl.h> +#include <linux/fs.h> +#include <linux/hrtimer.h> +#include <linux/log2.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/poll.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +/* The minimum allowed interval between dumps, in nanoseconds + * (equivalent to 10KHz) + */ +#define DUMP_INTERVAL_MIN_NS (100 * NSEC_PER_USEC) + +/* The minimum allowed interval between dumps, in microseconds + * (equivalent to 10KHz) + */ +#define DUMP_INTERVAL_MIN_US (DUMP_INTERVAL_MIN_NS / 1000) + +/* The maximum allowed buffers per client */ +#define MAX_BUFFER_COUNT 32 + +/** + * struct kbase_kinstr_prfcnt_context - IOCTL interface for userspace hardware + * counters. + * @hvirt: Hardware counter virtualizer used by kinstr_prfcnt. + * @info_item_count: Number of metadata elements. + * @metadata: Hardware counter metadata provided by virtualizer. + * @lock: Lock protecting kinstr_prfcnt state. + * @suspend_count: Suspend reference count. If non-zero, timer and worker + * are prevented from being re-scheduled. + * @client_count: Number of kinstr_prfcnt clients. + * @clients: List of kinstr_prfcnt clients. + * @dump_timer: Timer that enqueues dump_work to a workqueue. + * @dump_work: Worker for performing periodic counter dumps. + */ +struct kbase_kinstr_prfcnt_context { + struct kbase_hwcnt_virtualizer *hvirt; + u32 info_item_count; + const struct kbase_hwcnt_metadata *metadata; + struct mutex lock; + size_t suspend_count; + size_t client_count; + struct list_head clients; + struct hrtimer dump_timer; + struct work_struct dump_work; +}; + +/** + * struct kbase_kinstr_prfcnt_sample - Buffer and descriptor for sample data. + * @sample_meta: Pointer to samle metadata. + * @dump_buf: Dump buffer containing sample data. + */ +struct kbase_kinstr_prfcnt_sample { + u64 *sample_meta; + struct kbase_hwcnt_dump_buffer dump_buf; +}; + +/** + * struct kbase_kinstr_prfcnt_sample_array - Array of sample data. + * @page_addr: Address of allocated pages. A single allocation is used + * for all Dump Buffers in the array. + * @page_order: The allocation order of the pages. + * @sample_count: Number of allocated samples. + * @samples: Non-NULL pointer to the array of Dump Buffers. + */ +struct kbase_kinstr_prfcnt_sample_array { + u64 page_addr; + unsigned int page_order; + size_t sample_count; + struct kbase_kinstr_prfcnt_sample *samples; +}; + +/** + * struct kbase_kinstr_prfcnt_client_config - Client session configuration. + * @prfcnt_mode: Sampling mode: either manual or periodic. + * @counter_set: Set of performance counter blocks. + * @buffer_count: Number of buffers used to store samples. + * @period_us: Sampling period, in microseconds, or 0 if manual mode. + * @phys_em: Enable map used by the GPU. + */ +struct kbase_kinstr_prfcnt_client_config { + u8 prfcnt_mode; + u8 counter_set; + u16 buffer_count; + u64 period_us; + struct kbase_hwcnt_physical_enable_map phys_em; +}; + +/** + * struct kbase_kinstr_prfcnt_client - A kinstr_prfcnt client attached + * to a kinstr_prfcnt context. + * @kinstr_ctx: kinstr_prfcnt context client is attached to. + * @hvcli: Hardware counter virtualizer client. 
+ * @node: Node used to attach this client to list in kinstr_prfcnt + * context. + * @next_dump_time_ns: Time in ns when this client's next periodic dump must + * occur. If 0, not a periodic client. + * @dump_interval_ns: Interval between periodic dumps. If 0, not a periodic + * client. + * @config: Configuration of the client session. + * @enable_map: Counters enable map. + * @tmp_buf: Temporary buffer to use before handing over dump to + * client. + * @sample_arr: Array of dump buffers allocated by this client. + * @dump_bufs_meta: Metadata of dump buffers. + * @meta_idx: Index of metadata being accessed by userspace. + * @read_idx: Index of buffer read by userspace. + * @write_idx: Index of buffer being written by dump worker. + * @waitq: Client's notification queue. + * @sample_size: Size of the data required for one sample, in bytes. + * @sample_count: Number of samples the client is able to capture. + */ +struct kbase_kinstr_prfcnt_client { + struct kbase_kinstr_prfcnt_context *kinstr_ctx; + struct kbase_hwcnt_virtualizer_client *hvcli; + struct list_head node; + u64 next_dump_time_ns; + u32 dump_interval_ns; + struct kbase_kinstr_prfcnt_client_config config; + struct kbase_hwcnt_enable_map enable_map; + struct kbase_hwcnt_dump_buffer tmp_buf; + struct kbase_kinstr_prfcnt_sample_array sample_arr; + struct kbase_hwcnt_reader_metadata *dump_bufs_meta; + atomic_t meta_idx; + atomic_t read_idx; + atomic_t write_idx; + wait_queue_head_t waitq; + size_t sample_size; + size_t sample_count; +}; + +static struct prfcnt_enum_item kinstr_prfcnt_supported_requests[] = { + { + /* Request description for MODE request */ + .hdr = { + .item_type = PRFCNT_ENUM_TYPE_REQUEST, + .item_version = PRFCNT_READER_API_VERSION, + }, + .u.request = { + .request_item_type = PRFCNT_REQUEST_MODE, + .versions_mask = 0x1, + }, + }, + { + /* Request description for ENABLE request */ + .hdr = { + .item_type = PRFCNT_ENUM_TYPE_REQUEST, + .item_version = PRFCNT_READER_API_VERSION, + }, + .u.request = { + .request_item_type = PRFCNT_REQUEST_ENABLE, + .versions_mask = 0x1, + }, + }, +}; + +/** + * kbasep_kinstr_prfcnt_hwcnt_reader_buffer_ready() - Check if client has ready + * buffers. + * @cli: Non-NULL pointer to kinstr_prfcnt client. + * + * Return: Non-zero if client has at least one dumping buffer filled that was + * not notified to user yet. + */ +static int kbasep_kinstr_prfcnt_hwcnt_reader_buffer_ready( + struct kbase_kinstr_prfcnt_client *cli) +{ + WARN_ON(!cli); + return atomic_read(&cli->write_idx) != atomic_read(&cli->meta_idx); +} + +/** + * kbasep_kinstr_prfcnt_hwcnt_reader_poll() - hwcnt reader's poll. + * @filp: Non-NULL pointer to file structure. + * @wait: Non-NULL pointer to poll table. + * + * Return: POLLIN if data can be read without blocking, 0 if data can not be + * read without blocking, else error code. + */ +static unsigned int kbasep_kinstr_prfcnt_hwcnt_reader_poll(struct file *filp, + poll_table *wait) +{ + struct kbase_kinstr_prfcnt_client *cli; + + if (!filp || !wait) + return -EINVAL; + + cli = filp->private_data; + + if (!cli) + return -EINVAL; + + poll_wait(filp, &cli->waitq, wait); + + if (kbasep_kinstr_prfcnt_hwcnt_reader_buffer_ready(cli)) + return POLLIN; + + return 0; +} + +/** + * kbasep_kinstr_prfcnt_hwcnt_reader_ioctl() - hwcnt reader's ioctl. + * @filp: Non-NULL pointer to file structure. + * @cmd: User command. + * @arg: Command's argument. + * + * Return: 0 on success, else error code. 
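[Illustrative note] The poll path above only reports readiness when the dump worker has produced a sample that userspace has not yet consumed, i.e. the write index has advanced past the metadata index. A simplified sketch of that ring-index comparison; the driver uses atomic counters, plain integers are used here for brevity:

#include <stdbool.h>
#include <stdint.h>

struct sample_ring {
	uint32_t write_idx; /* advanced by the dump worker                */
	uint32_t meta_idx;  /* advanced as userspace fetches metadata     */
	uint32_t read_idx;  /* advanced when userspace releases a buffer  */
};

/* At least one completed sample has not yet been handed to userspace. */
static bool sample_ready(const struct sample_ring *r)
{
	return r->write_idx != r->meta_idx;
}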
+ */ +static long kbasep_kinstr_prfcnt_hwcnt_reader_ioctl(struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + long rcode; + struct kbase_kinstr_prfcnt_client *cli; + + if (!filp || (_IOC_TYPE(cmd) != KBASE_HWCNT_READER)) + return -EINVAL; + + cli = filp->private_data; + + if (!cli) + return -EINVAL; + + switch (_IOC_NR(cmd)) { + default: + pr_warn("Unknown HWCNT ioctl 0x%x nr:%d", cmd, _IOC_NR(cmd)); + rcode = -EINVAL; + break; + } + + return rcode; +} + +/** + * kbasep_kinstr_prfcnt_hwcnt_reader_mmap() - hwcnt reader's mmap. + * @filp: Non-NULL pointer to file structure. + * @vma: Non-NULL pointer to vma structure. + * + * Return: 0 on success, else error code. + */ +static int kbasep_kinstr_prfcnt_hwcnt_reader_mmap(struct file *filp, + struct vm_area_struct *vma) +{ + struct kbase_kinstr_prfcnt_client *cli; + unsigned long vm_size, size, addr, pfn, offset; + + if (!filp || !vma) + return -EINVAL; + + cli = filp->private_data; + + if (!cli) + return -EINVAL; + + vm_size = vma->vm_end - vma->vm_start; + + /* The mapping is allowed to span the entirety of the page allocation, + * not just the chunk where the dump buffers are allocated. + * This accommodates the corner case where the combined size of the + * dump buffers is smaller than a single page. + * This does not pose a security risk as the pages are zeroed on + * allocation, and anything out of bounds of the dump buffers is never + * written to. + */ + size = (1ull << cli->sample_arr.page_order) * PAGE_SIZE; + + if (vma->vm_pgoff > (size >> PAGE_SHIFT)) + return -EINVAL; + + offset = vma->vm_pgoff << PAGE_SHIFT; + + if (vm_size > size - offset) + return -EINVAL; + + addr = __pa(cli->sample_arr.page_addr + offset); + pfn = addr >> PAGE_SHIFT; + + return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, + vma->vm_page_prot); +} + +static void kbasep_kinstr_prfcnt_sample_array_free( + struct kbase_kinstr_prfcnt_sample_array *sample_arr) +{ + if (!sample_arr) + return; + + kfree((void *)sample_arr->samples); + kfree((void *)(size_t)sample_arr->page_addr); + memset(sample_arr, 0, sizeof(*sample_arr)); +} + +/** + * kbasep_kinstr_prfcnt_client_destroy() - Destroy a kinstr_prfcnt client. + * @cli: kinstr_prfcnt client. Must not be attached to a kinstr_prfcnt context. + */ +static void +kbasep_kinstr_prfcnt_client_destroy(struct kbase_kinstr_prfcnt_client *cli) +{ + if (!cli) + return; + + kbase_hwcnt_virtualizer_client_destroy(cli->hvcli); + kfree(cli->dump_bufs_meta); + kbasep_kinstr_prfcnt_sample_array_free(&cli->sample_arr); + kbase_hwcnt_dump_buffer_free(&cli->tmp_buf); + kbase_hwcnt_enable_map_free(&cli->enable_map); + kfree(cli); +} + +/** + * kbasep_kinstr_prfcnt_hwcnt_reader_release() - hwcnt reader's release. + * @inode: Non-NULL pointer to inode structure. + * @filp: Non-NULL pointer to file structure. + * + * Return: 0 always. 
+ */ +static int kbasep_kinstr_prfcnt_hwcnt_reader_release(struct inode *inode, + struct file *filp) +{ + struct kbase_kinstr_prfcnt_client *cli = filp->private_data; + + mutex_lock(&cli->kinstr_ctx->lock); + + WARN_ON(cli->kinstr_ctx->client_count == 0); + if (cli->kinstr_ctx->client_count > 0) + cli->kinstr_ctx->client_count--; + list_del(&cli->node); + + mutex_unlock(&cli->kinstr_ctx->lock); + + kbasep_kinstr_prfcnt_client_destroy(cli); + + return 0; +} + +/* kinstr_prfcnt client file operations */ +static const struct file_operations kinstr_prfcnt_client_fops = { + .owner = THIS_MODULE, + .poll = kbasep_kinstr_prfcnt_hwcnt_reader_poll, + .unlocked_ioctl = kbasep_kinstr_prfcnt_hwcnt_reader_ioctl, + .compat_ioctl = kbasep_kinstr_prfcnt_hwcnt_reader_ioctl, + .mmap = kbasep_kinstr_prfcnt_hwcnt_reader_mmap, + .release = kbasep_kinstr_prfcnt_hwcnt_reader_release, +}; + +static size_t kbasep_kinstr_prfcnt_get_sample_size( + const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_dump_buffer *dump_buf) +{ + size_t dump_buf_bytes; + size_t clk_cnt_buf_bytes; + size_t sample_meta_bytes; + size_t block_count = 0; + size_t grp, blk, blk_inst; + + if (!metadata) + return 0; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + block_count++; + + /* Reserve one for last sentinel item. */ + block_count++; + + sample_meta_bytes = sizeof(struct prfcnt_metadata) * block_count; + dump_buf_bytes = metadata->dump_buf_bytes; + clk_cnt_buf_bytes = sizeof(*dump_buf->clk_cnt_buf) * metadata->clk_cnt; + + return (sample_meta_bytes + dump_buf_bytes + clk_cnt_buf_bytes); +} + +/** + * kbasep_kinstr_prfcnt_dump_worker()- Dump worker, that dumps all periodic + * clients that need to be dumped, then + * reschedules itself. + * @work: Work structure. + */ +static void kbasep_kinstr_prfcnt_dump_worker(struct work_struct *work) +{ + /* Do nothing. */ +} + +/** + * kbasep_kinstr_prfcnt_dump_timer() - Dump timer that schedules the dump worker for + * execution as soon as possible. + * @timer: Timer structure. 
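[Illustrative note] kbasep_kinstr_prfcnt_get_sample_size() above sizes one sample as a metadata record per block instance plus a trailing sentinel record, followed by the raw dump buffer and one cycle counter per clock domain. A standalone restatement of that arithmetic; the record type is a placeholder for prfcnt_metadata:

#include <stddef.h>
#include <stdint.h>

struct meta_record { uint64_t hdr[4]; }; /* stand-in for prfcnt_metadata */

static size_t sample_size(size_t block_count, size_t dump_buf_bytes,
			  size_t clk_cnt)
{
	/* One metadata record per block instance plus a sentinel record. */
	size_t meta_bytes = sizeof(struct meta_record) * (block_count + 1);
	size_t clk_bytes = sizeof(uint64_t) * clk_cnt;

	return meta_bytes + dump_buf_bytes + clk_bytes;
}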
+ */ +static enum hrtimer_restart +kbasep_kinstr_prfcnt_dump_timer(struct hrtimer *timer) +{ + return HRTIMER_NORESTART; +} + +int kbase_kinstr_prfcnt_init(struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_kinstr_prfcnt_context **out_kinstr_ctx) +{ + struct kbase_kinstr_prfcnt_context *kinstr_ctx; + const struct kbase_hwcnt_metadata *metadata; + + if (!hvirt || !out_kinstr_ctx) + return -EINVAL; + + metadata = kbase_hwcnt_virtualizer_metadata(hvirt); + + if (!metadata) + return -EINVAL; + + kinstr_ctx = kzalloc(sizeof(*kinstr_ctx), GFP_KERNEL); + + if (!kinstr_ctx) + return -ENOMEM; + + kinstr_ctx->hvirt = hvirt; + kinstr_ctx->metadata = metadata; + + mutex_init(&kinstr_ctx->lock); + INIT_LIST_HEAD(&kinstr_ctx->clients); + hrtimer_init(&kinstr_ctx->dump_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + kinstr_ctx->dump_timer.function = kbasep_kinstr_prfcnt_dump_timer; + INIT_WORK(&kinstr_ctx->dump_work, kbasep_kinstr_prfcnt_dump_worker); + + *out_kinstr_ctx = kinstr_ctx; + return 0; +} + +void kbase_kinstr_prfcnt_term(struct kbase_kinstr_prfcnt_context *kinstr_ctx) +{ + if (!kinstr_ctx) + return; + + cancel_work_sync(&kinstr_ctx->dump_work); + + /* Non-zero client count implies client leak */ + if (WARN_ON(kinstr_ctx->client_count > 0)) { + struct kbase_kinstr_prfcnt_client *pos, *n; + + list_for_each_entry_safe(pos, n, &kinstr_ctx->clients, node) { + list_del(&pos->node); + kinstr_ctx->client_count--; + kbasep_kinstr_prfcnt_client_destroy(pos); + } + } + + WARN_ON(kinstr_ctx->client_count > 0); + kfree(kinstr_ctx); +} + +void kbase_kinstr_prfcnt_suspend(struct kbase_kinstr_prfcnt_context *kinstr_ctx) +{ + if (WARN_ON(!kinstr_ctx)) + return; + + mutex_lock(&kinstr_ctx->lock); + + if (!WARN_ON(kinstr_ctx->suspend_count == SIZE_MAX)) + kinstr_ctx->suspend_count++; + + mutex_unlock(&kinstr_ctx->lock); + + /* Always sync cancel the timer and then the worker, regardless of the + * new suspend count. + * + * This ensures concurrent calls to kbase_kinstr_prfcnt_suspend() always block + * until kinstr_prfcnt is fully suspended. + * + * The timer is canceled before the worker, as the timer + * unconditionally re-enqueues the worker, but the worker checks the + * suspend_count that we just incremented before rescheduling the timer. + * + * Therefore if we cancel the worker first, the timer might re-enqueue + * the worker before we cancel the timer, but the opposite is not + * possible. + */ + hrtimer_cancel(&kinstr_ctx->dump_timer); + cancel_work_sync(&kinstr_ctx->dump_work); +} + +void kbase_kinstr_prfcnt_resume(struct kbase_kinstr_prfcnt_context *kinstr_ctx) +{ + if (WARN_ON(!kinstr_ctx)) + return; + + mutex_lock(&kinstr_ctx->lock); + + if (!WARN_ON(kinstr_ctx->suspend_count == 0)) { + kinstr_ctx->suspend_count--; + + /* Last resume, so re-enqueue the worker if we have any periodic + * clients. 
+ */ + if (kinstr_ctx->suspend_count == 0) { + struct kbase_kinstr_prfcnt_client *pos; + bool has_periodic_clients = false; + + list_for_each_entry(pos, &kinstr_ctx->clients, node) { + if (pos->dump_interval_ns != 0) { + has_periodic_clients = true; + break; + } + } + + if (has_periodic_clients) + kbase_hwcnt_virtualizer_queue_work( + kinstr_ctx->hvirt, + &kinstr_ctx->dump_work); + } + } + + mutex_unlock(&kinstr_ctx->lock); +} + +static int kbasep_kinstr_prfcnt_sample_array_alloc( + const struct kbase_hwcnt_metadata *metadata, size_t n, + struct kbase_kinstr_prfcnt_sample_array *sample_arr) +{ + struct kbase_kinstr_prfcnt_sample *samples; + size_t sample_idx; + u64 addr; + unsigned int order; + size_t dump_buf_bytes; + size_t clk_cnt_buf_bytes; + size_t sample_meta_bytes; + size_t block_count = 0; + size_t sample_size; + size_t grp, blk, blk_inst; + + if (!metadata || !sample_arr) + return -EINVAL; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + block_count++; + + /* Reserve one for last sentinel item. */ + block_count++; + + sample_meta_bytes = sizeof(struct prfcnt_metadata) * block_count; + dump_buf_bytes = metadata->dump_buf_bytes; + clk_cnt_buf_bytes = + sizeof(*samples->dump_buf.clk_cnt_buf) * metadata->clk_cnt; + sample_size = sample_meta_bytes + dump_buf_bytes + clk_cnt_buf_bytes; + + samples = kmalloc_array(n, sizeof(*samples), GFP_KERNEL); + + if (!samples) + return -ENOMEM; + + order = get_order(sample_size * n); + addr = (u64)(uintptr_t)kzalloc(sample_size * n, GFP_KERNEL); + + if (!addr) { + kfree((void *)samples); + return -ENOMEM; + } + + sample_arr->page_addr = addr; + sample_arr->page_order = order; + sample_arr->sample_count = n; + sample_arr->samples = samples; + + for (sample_idx = 0; sample_idx < n; sample_idx++) { + const size_t sample_meta_offset = sample_size * sample_idx; + const size_t dump_buf_offset = + sample_meta_offset + sample_meta_bytes; + const size_t clk_cnt_buf_offset = + dump_buf_offset + dump_buf_bytes; + + /* Internal layout in a sample buffer: [sample metadata, dump_buf, clk_cnt_buf]. */ + samples[sample_idx].dump_buf.metadata = metadata; + samples[sample_idx].sample_meta = + (u64 *)(uintptr_t)(addr + sample_meta_offset); + samples[sample_idx].dump_buf.dump_buf = + (u64 *)(uintptr_t)(addr + dump_buf_offset); + samples[sample_idx].dump_buf.clk_cnt_buf = + (u64 *)(uintptr_t)(addr + clk_cnt_buf_offset); + } + + return 0; +} + +static bool prfcnt_mode_supported(u8 mode) +{ + return (mode == PRFCNT_MODE_MANUAL) || (mode == PRFCNT_MODE_PERIODIC); +} + +static void +kbasep_kinstr_prfcnt_block_enable_to_physical(uint32_t *phys_em, + const uint64_t *enable_mask) +{ + *phys_em |= kbase_hwcnt_backend_gpu_block_map_to_physical( + enable_mask[0], enable_mask[1]); +} + +/** + * kbasep_kinstr_prfcnt_parse_request_enable - Parse an enable request + * @req_enable: Performance counters enable request to parse. + * @config: Client object the session configuration should be written to. + * + * This function parses a performance counters enable request. + * This type of request specifies a bitmask of HW counters to enable + * for one performance counters block type. In addition to that, + * a performance counters enable request may also set "global" + * configuration properties that affect the whole session, like the + * performance counters set, which shall be compatible with the same value + * set by other performance request items. + * + * Return: 0 on success, else error code. 
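[Illustrative note] Each sample in the array allocated above is laid out back to back inside one allocation as [sample metadata | dump buffer | clock counter buffer]. The sketch below shows how the three pointers for sample i are derived from the base address and the per-region sizes; the struct is illustrative only:

#include <stddef.h>
#include <stdint.h>

struct sample_ptrs {
	uint64_t *meta;
	uint64_t *dump_buf;
	uint64_t *clk_cnt_buf;
};

static struct sample_ptrs sample_layout(uintptr_t base, size_t sample_size,
					size_t meta_bytes,
					size_t dump_buf_bytes, size_t idx)
{
	uintptr_t meta_off = sample_size * idx;
	struct sample_ptrs p = {
		.meta        = (uint64_t *)(base + meta_off),
		.dump_buf    = (uint64_t *)(base + meta_off + meta_bytes),
		.clk_cnt_buf = (uint64_t *)(base + meta_off + meta_bytes +
					    dump_buf_bytes),
	};
	return p;
}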
+ */ +static int kbasep_kinstr_prfcnt_parse_request_enable( + const struct prfcnt_request_enable *req_enable, + struct kbase_kinstr_prfcnt_client_config *config) +{ + int err = 0; + u8 req_set = KBASE_HWCNT_SET_UNDEFINED, default_set; + + switch (req_enable->set) { + case PRFCNT_SET_PRIMARY: + req_set = KBASE_HWCNT_SET_PRIMARY; + break; + case PRFCNT_SET_SECONDARY: + req_set = KBASE_HWCNT_SET_SECONDARY; + break; + case PRFCNT_SET_TERTIARY: + req_set = KBASE_HWCNT_SET_TERTIARY; + break; + default: + err = -EINVAL; + break; + } + + /* The performance counter set is a "global" property that affects + * the whole session. Either this is the first request that sets + * the value, or it shall be identical to all previous requests. + */ + if (!err) { + if (config->counter_set == KBASE_HWCNT_SET_UNDEFINED) + config->counter_set = req_set; + else if (config->counter_set != req_set) + err = -EINVAL; + } + + /* Temporarily, the requested set cannot be different from the default + * set because it's the only one to be supported. This will change in + * the future. + */ +#if defined(CONFIG_MALI_PRFCNT_SET_SECONDARY) + default_set = KBASE_HWCNT_SET_SECONDARY; +#elif defined(CONFIG_MALI_PRFCNT_SET_TERTIARY) + default_set = KBASE_HWCNT_SET_TERTIARY; +#else + /* Default to primary */ + default_set = KBASE_HWCNT_SET_PRIMARY; +#endif + + if (req_set != default_set) + err = -EINVAL; + + if (err < 0) + return err; + + /* Enable the performance counters based on the bitmask provided + * by the user space client. + * It is possible to receive multiple requests for the same counter + * block, in which case the bitmask will be a logical OR of all the + * bitmasks given by the client. + */ + switch (req_enable->block_type) { + case PRFCNT_BLOCK_TYPE_FE: + kbasep_kinstr_prfcnt_block_enable_to_physical( + &config->phys_em.fe_bm, req_enable->enable_mask); + break; + case PRFCNT_BLOCK_TYPE_TILER: + kbasep_kinstr_prfcnt_block_enable_to_physical( + &config->phys_em.tiler_bm, req_enable->enable_mask); + break; + case PRFCNT_BLOCK_TYPE_MEMORY: + kbasep_kinstr_prfcnt_block_enable_to_physical( + &config->phys_em.mmu_l2_bm, req_enable->enable_mask); + break; + case PRFCNT_BLOCK_TYPE_SHADER_CORE: + kbasep_kinstr_prfcnt_block_enable_to_physical( + &config->phys_em.shader_bm, req_enable->enable_mask); + break; + default: + err = -EINVAL; + break; + } + + return err; +} + +/** + * kbasep_kinstr_prfcnt_parse_setup - Parse session setup + * @kinstr_ctx: Pointer to the kinstr_prfcnt context. + * @setup: Session setup information to parse. + * @config: Client object the session configuration should be written to. + * + * This function parses the list of "request" items sent by the user space + * client, and writes the configuration for the new client to be created + * for the session. + * + * Return: 0 on success, else error code. 
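[Illustrative note] The enable-request parser above enforces two rules: a session-wide property (such as the counter set) is fixed by the first request that specifies it and every later request must repeat the same value, while per-block enable bitmasks from repeated requests are OR-ed together. A minimal sketch of both rules with illustrative names and a single block type:

#include <stdint.h>

#define SET_UNDEFINED 0xffu

struct session_cfg {
	uint8_t counter_set;   /* starts as SET_UNDEFINED           */
	uint64_t fe_enable_bm; /* accumulated front-end enable mask */
};

/* Returns 0 on success, -1 if the request conflicts with an earlier one. */
static int apply_enable_request(struct session_cfg *cfg, uint8_t req_set,
				uint64_t req_fe_mask)
{
	if (cfg->counter_set == SET_UNDEFINED)
		cfg->counter_set = req_set;   /* first request decides */
	else if (cfg->counter_set != req_set)
		return -1;                    /* conflicting value     */

	cfg->fe_enable_bm |= req_fe_mask;     /* masks accumulate      */
	return 0;
}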
+ */ +static int kbasep_kinstr_prfcnt_parse_setup( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + union kbase_ioctl_kinstr_prfcnt_setup *setup, + struct kbase_kinstr_prfcnt_client_config *config) +{ + uint32_t i; + struct prfcnt_request_item *req_arr; + int err = 0; + + if (!setup->in.requests_ptr || (setup->in.request_item_count == 0) || + (setup->in.request_item_size == 0)) { + return -EINVAL; + } + + req_arr = + (struct prfcnt_request_item *)(uintptr_t)setup->in.requests_ptr; + + if (req_arr[setup->in.request_item_count - 1].hdr.item_type != + FLEX_LIST_TYPE_NONE) { + return -EINVAL; + } + + if (req_arr[setup->in.request_item_count - 1].hdr.item_version != 0) + return -EINVAL; + + /* The session configuration can only feature one value for some + * properties (like capture mode and block counter set), but the client + * may potential issue multiple requests and try to set more than one + * value for those properties. While issuing multiple requests for the + * same property is allowed by the protocol, asking for different values + * is illegal. Leaving these properties as undefined is illegal, too. + */ + config->prfcnt_mode = PRFCNT_MODE_RESERVED; + config->counter_set = KBASE_HWCNT_SET_UNDEFINED; + + for (i = 0; i < setup->in.request_item_count - 1; i++) { + if (req_arr[i].hdr.item_version > PRFCNT_READER_API_VERSION) { + err = -EINVAL; + break; + } + + switch (req_arr[i].hdr.item_type) { + /* Capture mode is initialized as undefined. + * The first request of this type sets the capture mode. + * The protocol allows the client to send redundant requests, + * but only if they replicate the same value that has already + * been set by the first request. + */ + case PRFCNT_REQUEST_TYPE_MODE: + if (!prfcnt_mode_supported(req_arr[i].u.req_mode.mode)) + err = -EINVAL; + else if (config->prfcnt_mode == PRFCNT_MODE_RESERVED) + config->prfcnt_mode = + req_arr[i].u.req_mode.mode; + else if (req_arr[i].u.req_mode.mode != + config->prfcnt_mode) + err = -EINVAL; + + if (err < 0) + break; + + if (config->prfcnt_mode == PRFCNT_MODE_PERIODIC) { + config->period_us = + req_arr[i] + .u.req_mode.mode_config.periodic + .period_us; + + if ((config->period_us != 0) && + (config->period_us < + DUMP_INTERVAL_MIN_US)) { + config->period_us = + DUMP_INTERVAL_MIN_US; + } + } + break; + + case PRFCNT_REQUEST_TYPE_ENABLE: + err = kbasep_kinstr_prfcnt_parse_request_enable( + &req_arr[i].u.req_enable, config); + break; + + default: + err = -EINVAL; + break; + } + + if (err < 0) + break; + } + + /* Verify that properties (like capture mode and block counter set) + * have been defined by the user space client. + */ + if (config->prfcnt_mode == PRFCNT_MODE_RESERVED) + err = -EINVAL; + + if (config->counter_set == KBASE_HWCNT_SET_UNDEFINED) + err = -EINVAL; + + return err; +} + +/** + * kbasep_kinstr_prfcnt_client_create() - Create a kinstr_prfcnt client. + * Does not attach to the kinstr_prfcnt + * context. + * @kinstr_ctx: Non-NULL pointer to kinstr_prfcnt context. + * @setup: Non-NULL pointer to hardware counter ioctl setup structure. + * @out_vcli: Non-NULL pointer to where created client will be stored on + * success. + * + * Return: 0 on success, else error code. 
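[Illustrative note] The setup parser above expects the request array to end with a sentinel item and clamps any non-zero periodic interval up to the minimum supported period. A compact sketch of that walk; the item types and the minimum period constant are placeholders for the uapi definitions:

#include <stdint.h>

#define ITEM_TYPE_NONE 0u    /* sentinel terminating the request list    */
#define ITEM_TYPE_MODE 1u    /* request item carrying the sampling mode  */
#define MIN_PERIOD_US  100u  /* stand-in for the minimum sampling period */

struct request_item {
	uint32_t item_type;
	uint64_t period_us;  /* used only when item_type == ITEM_TYPE_MODE */
};

/* Validate the trailing sentinel and clamp a periodic interval upwards. */
static int parse_requests(const struct request_item *arr, uint32_t count,
			  uint64_t *period_out)
{
	uint32_t i;

	if (count == 0 || arr[count - 1].item_type != ITEM_TYPE_NONE)
		return -1;                /* list must end with a sentinel */

	for (i = 0; i < count - 1; i++) { /* the sentinel itself is skipped */
		if (arr[i].item_type != ITEM_TYPE_MODE)
			continue;
		*period_out = arr[i].period_us;
		if (*period_out != 0 && *period_out < MIN_PERIOD_US)
			*period_out = MIN_PERIOD_US; /* clamp too-fast dumps */
	}
	return 0;
}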
+ */ +static int kbasep_kinstr_prfcnt_client_create( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + union kbase_ioctl_kinstr_prfcnt_setup *setup, + struct kbase_kinstr_prfcnt_client **out_vcli) +{ + int err; + struct kbase_kinstr_prfcnt_client *cli; + struct kbase_hwcnt_physical_enable_map phys_em; + + WARN_ON(!kinstr_ctx); + WARN_ON(!setup); + + cli = kzalloc(sizeof(*cli), GFP_KERNEL); + + if (!cli) + return -ENOMEM; + + cli->kinstr_ctx = kinstr_ctx; + err = kbasep_kinstr_prfcnt_parse_setup(kinstr_ctx, setup, &cli->config); + + if (err < 0) + goto error; + + cli->config.buffer_count = MAX_BUFFER_COUNT; + cli->dump_interval_ns = cli->config.period_us * NSEC_PER_USEC; + cli->next_dump_time_ns = 0; + err = kbase_hwcnt_enable_map_alloc(kinstr_ctx->metadata, + &cli->enable_map); + + if (err < 0) + goto error; + + phys_em.fe_bm = 0; + phys_em.shader_bm = 0; + phys_em.tiler_bm = 0; + phys_em.mmu_l2_bm = 0; + + kbase_hwcnt_gpu_enable_map_from_physical(&cli->enable_map, &phys_em); + + cli->sample_count = cli->config.buffer_count; + cli->sample_size = kbasep_kinstr_prfcnt_get_sample_size( + kinstr_ctx->metadata, &cli->tmp_buf); + + /* Use virtualizer's metadata to alloc tmp buffer which interacts with + * the HWC virtualizer. + */ + err = kbase_hwcnt_dump_buffer_alloc(kinstr_ctx->metadata, + &cli->tmp_buf); + + if (err < 0) + goto error; + + /* Enable all the available clk_enable_map. */ + cli->enable_map.clk_enable_map = + (1ull << kinstr_ctx->metadata->clk_cnt) - 1; + + /* Use metadata from virtualizer to allocate dump buffers if + * kinstr_prfcnt doesn't have the truncated metadata. + */ + err = kbasep_kinstr_prfcnt_sample_array_alloc(kinstr_ctx->metadata, + cli->config.buffer_count, + &cli->sample_arr); + + if (err < 0) + goto error; + + err = -ENOMEM; + + cli->dump_bufs_meta = + kmalloc_array(cli->config.buffer_count, + sizeof(*cli->dump_bufs_meta), GFP_KERNEL); + + if (!cli->dump_bufs_meta) + goto error; + + err = kbase_hwcnt_virtualizer_client_create( + kinstr_ctx->hvirt, &cli->enable_map, &cli->hvcli); + + if (err < 0) + goto error; + + init_waitqueue_head(&cli->waitq); + *out_vcli = cli; + + return 0; + +error: + kbasep_kinstr_prfcnt_client_destroy(cli); + return err; +} + +static size_t kbasep_kinstr_prfcnt_get_block_info_count( + const struct kbase_hwcnt_metadata *metadata) +{ + size_t grp; + size_t block_info_count = 0; + + if (!metadata) + return 0; + + for (grp = 0; grp < kbase_hwcnt_metadata_group_count(metadata); grp++) { + block_info_count += + kbase_hwcnt_metadata_block_count(metadata, grp); + } + + return block_info_count; +} + +static void kbasep_kinstr_prfcnt_get_request_info_list( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + struct prfcnt_enum_item *item_arr, size_t *arr_idx) +{ + memcpy(&item_arr[*arr_idx], kinstr_prfcnt_supported_requests, + sizeof(kinstr_prfcnt_supported_requests)); + *arr_idx += ARRAY_SIZE(kinstr_prfcnt_supported_requests); +} + +static enum prfcnt_block_type +kbase_hwcnt_metadata_block_type_to_prfcnt_block_type(u64 type) +{ + enum prfcnt_block_type block_type; + + switch (type) { + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE2: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_FE3: + block_type = PRFCNT_BLOCK_TYPE_FE; + break; + + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER: + block_type = PRFCNT_BLOCK_TYPE_TILER; + break; + + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC3: + block_type = 
PRFCNT_BLOCK_TYPE_SHADER_CORE; + break; + + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2: + block_type = PRFCNT_BLOCK_TYPE_MEMORY; + break; + + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_UNDEFINED: + default: + block_type = PRFCNT_BLOCK_TYPE_RESERVED; + break; + } + + return block_type; +} + +static int kbasep_kinstr_prfcnt_get_block_info_list( + const struct kbase_hwcnt_metadata *metadata, size_t block_set, + struct prfcnt_enum_item *item_arr, size_t *arr_idx) +{ + size_t grp; + size_t blk; + + if (!metadata || !item_arr || !arr_idx) + return -EINVAL; + + for (grp = 0; grp < kbase_hwcnt_metadata_group_count(metadata); grp++) { + for (blk = 0; + blk < kbase_hwcnt_metadata_block_count(metadata, grp); + blk++, (*arr_idx)++) { + item_arr[*arr_idx].hdr.item_type = + PRFCNT_ENUM_TYPE_BLOCK; + item_arr[*arr_idx].hdr.item_version = + PRFCNT_READER_API_VERSION; + item_arr[*arr_idx].u.block_counter.set = block_set; + + item_arr[*arr_idx].u.block_counter.block_type = + kbase_hwcnt_metadata_block_type_to_prfcnt_block_type( + kbase_hwcnt_metadata_block_type( + metadata, grp, blk)); + item_arr[*arr_idx].u.block_counter.num_instances = + kbase_hwcnt_metadata_block_instance_count( + metadata, grp, blk); + item_arr[*arr_idx].u.block_counter.num_values = + kbase_hwcnt_metadata_block_values_count( + metadata, grp, blk); + + /* The bitmask of available counters should be dynamic. + * Temporarily, it is set to U64_MAX, waiting for the + * required functionality to be available in the future. + */ + item_arr[*arr_idx].u.block_counter.counter_mask[0] = + U64_MAX; + item_arr[*arr_idx].u.block_counter.counter_mask[1] = + U64_MAX; + } + } + + return 0; +} + +static int kbasep_kinstr_prfcnt_enum_info_count( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + struct kbase_ioctl_kinstr_prfcnt_enum_info *enum_info) +{ + int err = 0; + uint32_t count = 0; + size_t block_info_count = 0; + const struct kbase_hwcnt_metadata *metadata; + + count = ARRAY_SIZE(kinstr_prfcnt_supported_requests); + metadata = kbase_hwcnt_virtualizer_metadata(kinstr_ctx->hvirt); + block_info_count = kbasep_kinstr_prfcnt_get_block_info_count(metadata); + count += block_info_count; + + /* Reserve one for the last sentinel item. 
*/ + count++; + enum_info->info_item_count = count; + enum_info->info_item_size = sizeof(struct prfcnt_enum_item); + kinstr_ctx->info_item_count = count; + + return err; +} + +static int kbasep_kinstr_prfcnt_enum_info_list( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + struct kbase_ioctl_kinstr_prfcnt_enum_info *enum_info) +{ + struct prfcnt_enum_item *prfcnt_item_arr; + size_t arr_idx = 0; + int err = 0; + size_t block_info_count = 0; + const struct kbase_hwcnt_metadata *metadata; + + if ((enum_info->info_item_size == 0) || + (enum_info->info_item_count == 0) || !enum_info->info_list_ptr) + return -EINVAL; + + if (enum_info->info_item_count != kinstr_ctx->info_item_count) + return -EINVAL; + + prfcnt_item_arr = + (struct prfcnt_enum_item *)(uintptr_t)enum_info->info_list_ptr; + kbasep_kinstr_prfcnt_get_request_info_list(kinstr_ctx, prfcnt_item_arr, + &arr_idx); + metadata = kbase_hwcnt_virtualizer_metadata(kinstr_ctx->hvirt); + block_info_count = kbasep_kinstr_prfcnt_get_block_info_count(metadata); + + if (arr_idx + block_info_count >= enum_info->info_item_count) + err = -EINVAL; + + if (!err) { + size_t counter_set; + +#if defined(CONFIG_MALI_PRFCNT_SET_SECONDARY) + counter_set = KBASE_HWCNT_SET_SECONDARY; +#elif defined(CONFIG_MALI_PRFCNT_SET_TERTIARY) + counter_set = KBASE_HWCNT_SET_TERTIARY; +#else + /* Default to primary */ + counter_set = KBASE_HWCNT_SET_PRIMARY; +#endif + kbasep_kinstr_prfcnt_get_block_info_list( + metadata, counter_set, prfcnt_item_arr, &arr_idx); + if (arr_idx != enum_info->info_item_count - 1) + err = -EINVAL; + } + + /* The last sentinel item. */ + prfcnt_item_arr[enum_info->info_item_count - 1].hdr.item_type = + FLEX_LIST_TYPE_NONE; + prfcnt_item_arr[enum_info->info_item_count - 1].hdr.item_version = 0; + + return err; +} + +int kbase_kinstr_prfcnt_enum_info( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + struct kbase_ioctl_kinstr_prfcnt_enum_info *enum_info) +{ + int err; + + if (!kinstr_ctx || !enum_info) + return -EINVAL; + + if (!enum_info->info_list_ptr) + err = kbasep_kinstr_prfcnt_enum_info_count(kinstr_ctx, + enum_info); + else + err = kbasep_kinstr_prfcnt_enum_info_list(kinstr_ctx, + enum_info); + + return err; +} + +int kbase_kinstr_prfcnt_setup(struct kbase_kinstr_prfcnt_context *kinstr_ctx, + union kbase_ioctl_kinstr_prfcnt_setup *setup) +{ + int err; + struct kbase_kinstr_prfcnt_client *cli = NULL; + + if (!kinstr_ctx || !setup) + return -EINVAL; + + err = kbasep_kinstr_prfcnt_client_create(kinstr_ctx, setup, &cli); + + if (err < 0) + goto error; + + mutex_lock(&kinstr_ctx->lock); + kinstr_ctx->client_count++; + list_add(&cli->node, &kinstr_ctx->clients); + mutex_unlock(&kinstr_ctx->lock); + + setup->out.prfcnt_metadata_item_size = sizeof(struct prfcnt_metadata); + setup->out.prfcnt_mmap_size_bytes = + cli->sample_size * cli->sample_count; + + /* Expose to user-space only once the client is fully initialized */ + err = anon_inode_getfd("[mali_kinstr_prfcnt_desc]", + &kinstr_prfcnt_client_fops, cli, + O_RDONLY | O_CLOEXEC); + + if (err < 0) + goto client_installed_error; + + return err; + +client_installed_error: + mutex_lock(&kinstr_ctx->lock); + kinstr_ctx->client_count--; + list_del(&cli->node); + mutex_unlock(&kinstr_ctx->lock); +error: + kbasep_kinstr_prfcnt_client_destroy(cli); + return err; +} diff --git a/mali_kbase/mali_kbase_kinstr_prfcnt.h b/mali_kbase/mali_kbase_kinstr_prfcnt.h new file mode 100644 index 0000000..83d76be --- /dev/null +++ b/mali_kbase/mali_kbase_kinstr_prfcnt.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: 
GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/* + * Kinstr_prfcnt, used to provide an ioctl for userspace access to + * performance counters. + */ +#ifndef _KBASE_KINSTR_PRFCNT_H_ +#define _KBASE_KINSTR_PRFCNT_H_ + +struct kbase_kinstr_prfcnt_context; +struct kbase_hwcnt_virtualizer; +struct kbase_ioctl_hwcnt_reader_setup; +struct kbase_ioctl_kinstr_prfcnt_enum_info; +union kbase_ioctl_kinstr_prfcnt_setup; + +/** + * kbase_kinstr_prfcnt_init() - Initialize a kinstr_prfcnt context. + * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * @out_kinstr_ctx: Non-NULL pointer to where the pointer to the created + * kinstr_prfcnt context will be stored on success. + * + * On creation, the suspend count of the context will be 0. + * + * Return: 0 on success, else error code. + */ +int kbase_kinstr_prfcnt_init( + struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_kinstr_prfcnt_context **out_kinstr_ctx); + +/** + * kbase_kinstr_prfcnt_term() - Terminate a kinstr_prfcnt context. + * @kinstr_ctx: Pointer to the kinstr_prfcnt context to be terminated. + */ +void kbase_kinstr_prfcnt_term(struct kbase_kinstr_prfcnt_context *kinstr_ctx); + +/** + * kbase_kinstr_prfcnt_suspend() - Increment the suspend count of the context. + * @kinstr_ctx: Non-NULL pointer to the kinstr_prfcnt context to be suspended. + * + * After this function call returns, it is guaranteed that all timers and + * workers in kinstr_prfcnt will be canceled, and will not be re-triggered until + * after the context has been resumed. In effect, this means no new counter + * dumps will occur for any existing or subsequently added periodic clients. + */ +void kbase_kinstr_prfcnt_suspend(struct kbase_kinstr_prfcnt_context *kinstr_ctx); + +/** + * kbase_kinstr_prfcnt_resume() - Decrement the suspend count of the context. + * @kinstr_ctx: Non-NULL pointer to the kinstr_prfcnt context to be resumed. + * + * If a call to this function decrements the suspend count from 1 to 0, then + * normal operation of kinstr_prfcnt will be resumed (i.e. counter dumps will once + * again be automatically triggered for all periodic clients). + * + * It is only valid to call this function one time for each prior returned call + * to kbase_kinstr_prfcnt_suspend. + */ +void kbase_kinstr_prfcnt_resume(struct kbase_kinstr_prfcnt_context *kinstr_ctx); + +/** + * kbase_kinstr_prfcnt_enum_info - Enumerate performance counter information. + * @kinstr_ctx: Non-NULL pointer to the kinstr_prfcnt context. + * @enum_info: Non-NULL pointer to the enumeration information. + * + * Enumerate which counter blocks and banks exist, and what counters are + * available within them. + * + * Return: 0 on success, else error code. 
+ */ +int kbase_kinstr_prfcnt_enum_info( + struct kbase_kinstr_prfcnt_context *kinstr_ctx, + struct kbase_ioctl_kinstr_prfcnt_enum_info *enum_info); + +/** + * kbase_kinstr_prfcnt_setup() - Set up a new hardware counter reader client. + * @kinstr_ctx: Non-NULL pointer to the kinstr_prfcnt context. + * @setup: Non-NULL pointer to the hwcnt reader configuration. + * + * Start a session between a user client and the kinstr_prfcnt component. + * A file descriptor shall be provided to the client as a handle to the + * hardware counter reader client that represents the session. + * + * Return: file descriptor on success, else error code. + */ +int kbase_kinstr_prfcnt_setup(struct kbase_kinstr_prfcnt_context *kinstr_ctx, + union kbase_ioctl_kinstr_prfcnt_setup *setup); + +#endif /* _KBASE_KINSTR_PRFCNT_H_ */ diff --git a/mali_kbase/mali_kbase_mem.c b/mali_kbase/mali_kbase_mem.c index a68e4ea..320ffef 100644 --- a/mali_kbase/mali_kbase_mem.c +++ b/mali_kbase/mali_kbase_mem.c @@ -351,6 +351,7 @@ static struct kbase_va_region *kbase_region_tracker_find_region_meeting_reqs( /** * Remove a region object from the global list. + * @kbdev: The kbase device * @reg: Region object to remove * * The region reg is removed, possibly by merging with other free and @@ -358,7 +359,8 @@ static struct kbase_va_region *kbase_region_tracker_find_region_meeting_reqs( * region lock held. The associated memory is not released (see * kbase_free_alloced_region). Internal use only. */ -int kbase_remove_va_region(struct kbase_va_region *reg) +void kbase_remove_va_region(struct kbase_device *kbdev, + struct kbase_va_region *reg) { struct rb_node *rbprev; struct kbase_va_region *prev = NULL; @@ -368,20 +370,26 @@ int kbase_remove_va_region(struct kbase_va_region *reg) int merged_front = 0; int merged_back = 0; - int err = 0; reg_rbtree = reg->rbtree; + if (WARN_ON(RB_EMPTY_ROOT(reg_rbtree))) + return; + /* Try to merge with the previous block first */ rbprev = rb_prev(&(reg->rblink)); if (rbprev) { prev = rb_entry(rbprev, struct kbase_va_region, rblink); if (prev->flags & KBASE_REG_FREE) { /* We're compatible with the previous VMA, merge with - * it + * it, handling any gaps for robustness. */ + u64 prev_end_pfn = prev->start_pfn + prev->nr_pages; + WARN_ON((prev->flags & KBASE_REG_ZONE_MASK) != (reg->flags & KBASE_REG_ZONE_MASK)); + if (!WARN_ON(reg->start_pfn < prev_end_pfn)) + prev->nr_pages += reg->start_pfn - prev_end_pfn; prev->nr_pages += reg->nr_pages; rb_erase(&(reg->rblink), reg_rbtree); reg = prev; @@ -393,11 +401,17 @@ int kbase_remove_va_region(struct kbase_va_region *reg) /* Note we do the lookup here as the tree may have been rebalanced. */ rbnext = rb_next(&(reg->rblink)); if (rbnext) { - /* We're compatible with the next VMA, merge with it */ next = rb_entry(rbnext, struct kbase_va_region, rblink); if (next->flags & KBASE_REG_FREE) { + /* We're compatible with the next VMA, merge with it, + * handling any gaps for robustness. + */ + u64 reg_end_pfn = reg->start_pfn + reg->nr_pages; + WARN_ON((next->flags & KBASE_REG_ZONE_MASK) != (reg->flags & KBASE_REG_ZONE_MASK)); + if (!WARN_ON(next->start_pfn < reg_end_pfn)) + next->nr_pages += next->start_pfn - reg_end_pfn; next->start_pfn = reg->start_pfn; next->nr_pages += reg->nr_pages; rb_erase(&(reg->rblink), reg_rbtree); @@ -412,8 +426,8 @@ int kbase_remove_va_region(struct kbase_va_region *reg) /* If we failed to merge then we need to add a new block */ if (!(merged_front || merged_back)) { /* - * We didn't merge anything. 
Add a new free - * placeholder and remove the original one. + * We didn't merge anything. Try to add a new free + * placeholder, and in any case, remove the original one. */ struct kbase_va_region *free_reg; @@ -421,14 +435,37 @@ int kbase_remove_va_region(struct kbase_va_region *reg) reg->start_pfn, reg->nr_pages, reg->flags & KBASE_REG_ZONE_MASK); if (!free_reg) { - err = -ENOMEM; + /* In case of failure, we cannot allocate a replacement + * free region, so we will be left with a 'gap' in the + * region tracker's address range (though, the rbtree + * will itself still be correct after erasing + * 'reg'). + * + * The gap will be rectified when an adjacent region is + * removed by one of the above merging paths. Other + * paths will gracefully fail to allocate if they try + * to allocate in the gap. + * + * There is nothing that the caller can do, since free + * paths must not fail. The existing 'reg' cannot be + * repurposed as the free region as callers must have + * freedom of use with it by virtue of it being owned + * by them, not the region tracker insert/remove code. + */ + dev_warn( + kbdev->dev, + "Could not alloc a replacement free region for 0x%.16llx..0x%.16llx", + (unsigned long long)reg->start_pfn << PAGE_SHIFT, + (unsigned long long)(reg->start_pfn + reg->nr_pages) << PAGE_SHIFT); + rb_erase(&(reg->rblink), reg_rbtree); + goto out; } rb_replace_node(&(reg->rblink), &(free_reg->rblink), reg_rbtree); } - out: - return err; +out: + return; } KBASE_EXPORT_TEST_API(kbase_remove_va_region); @@ -456,6 +493,9 @@ static int kbase_insert_va_region_nolock(struct kbase_va_region *new_reg, KBASE_DEBUG_ASSERT((start_pfn >= at_reg->start_pfn) && (start_pfn < at_reg->start_pfn + at_reg->nr_pages)); /* at least nr_pages from start_pfn should be contained within at_reg */ KBASE_DEBUG_ASSERT(start_pfn + nr_pages <= at_reg->start_pfn + at_reg->nr_pages); + /* having at_reg means the rb_tree should not be empty */ + if (WARN_ON(RB_EMPTY_ROOT(reg_rbtree))) + return -ENOMEM; new_reg->start_pfn = start_pfn; new_reg->nr_pages = nr_pages; @@ -862,6 +902,8 @@ static bool kbase_region_tracker_has_allocs(struct kbase_context *kctx) unsigned long zone_bits = KBASE_REG_ZONE(zone_idx); unsigned long reg_zone; + if (!kbase_is_ctx_reg_zone(zone_bits)) + continue; zone = kbase_ctx_reg_zone_get(kctx, zone_bits); zone_base_addr = zone->base_pfn << PAGE_SHIFT; @@ -1457,7 +1499,9 @@ void kbase_free_alloced_region(struct kbase_va_region *reg) KBASE_EXPORT_TEST_API(kbase_free_alloced_region); -int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 addr, size_t nr_pages, size_t align) +int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, + u64 addr, size_t nr_pages, size_t align, + enum kbase_caller_mmu_sync_info mmu_sync_info) { int err; size_t i = 0; @@ -1494,14 +1538,16 @@ int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 KBASE_DEBUG_ASSERT(alloc->imported.alias.aliased); for (i = 0; i < alloc->imported.alias.nents; i++) { if (alloc->imported.alias.aliased[i].alloc) { - err = kbase_mmu_insert_pages(kctx->kbdev, - &kctx->mmu, - reg->start_pfn + (i * stride), - alloc->imported.alias.aliased[i].alloc->pages + alloc->imported.alias.aliased[i].offset, - alloc->imported.alias.aliased[i].length, - reg->flags & gwt_mask, - kctx->as_nr, - group_id); + err = kbase_mmu_insert_pages( + kctx->kbdev, &kctx->mmu, + reg->start_pfn + (i * stride), + alloc->imported.alias.aliased[i] + .alloc->pages + + alloc->imported.alias.aliased[i] + .offset, + 
alloc->imported.alias.aliased[i].length, + reg->flags & gwt_mask, kctx->as_nr, + group_id, mmu_sync_info); if (err) goto bad_insert; @@ -1509,26 +1555,24 @@ int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 * creation time */ } else { - err = kbase_mmu_insert_single_page(kctx, - reg->start_pfn + i * stride, + err = kbase_mmu_insert_single_page( + kctx, reg->start_pfn + i * stride, kctx->aliasing_sink_page, alloc->imported.alias.aliased[i].length, (reg->flags & mask & gwt_mask) | attr, - group_id); + group_id, mmu_sync_info); if (err) goto bad_insert; } } } else { - err = kbase_mmu_insert_pages(kctx->kbdev, - &kctx->mmu, - reg->start_pfn, - kbase_get_gpu_phy_pages(reg), - kbase_reg_current_backed_size(reg), - reg->flags & gwt_mask, - kctx->as_nr, - group_id); + err = kbase_mmu_insert_pages(kctx->kbdev, &kctx->mmu, + reg->start_pfn, + kbase_get_gpu_phy_pages(reg), + kbase_reg_current_backed_size(reg), + reg->flags & gwt_mask, kctx->as_nr, + group_id, mmu_sync_info); if (err) goto bad_insert; kbase_mem_phy_alloc_gpu_mapped(alloc); @@ -1548,13 +1592,12 @@ int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 * Assume reg->gpu_alloc->nents is the number of actual pages * in the dma-buf memory. */ - err = kbase_mmu_insert_single_page(kctx, - reg->start_pfn + reg->gpu_alloc->nents, - kctx->aliasing_sink_page, - reg->nr_pages - reg->gpu_alloc->nents, - (reg->flags | KBASE_REG_GPU_RD) & - ~KBASE_REG_GPU_WR, - KBASE_MEM_GROUP_SINK); + err = kbase_mmu_insert_single_page( + kctx, reg->start_pfn + reg->gpu_alloc->nents, + kctx->aliasing_sink_page, + reg->nr_pages - reg->gpu_alloc->nents, + (reg->flags | KBASE_REG_GPU_RD) & ~KBASE_REG_GPU_WR, + KBASE_MEM_GROUP_SINK, mmu_sync_info); if (err) goto bad_insert; } @@ -1566,7 +1609,7 @@ bad_insert: reg->start_pfn, reg->nr_pages, kctx->as_nr); - kbase_remove_va_region(reg); + kbase_remove_va_region(kctx->kbdev, reg); return err; } @@ -1588,7 +1631,28 @@ int kbase_gpu_munmap(struct kbase_context *kctx, struct kbase_va_region *reg) /* Tear down GPU page tables, depending on memory type. */ switch (reg->gpu_alloc->type) { - case KBASE_MEM_TYPE_ALIAS: /* Fall-through */ + case KBASE_MEM_TYPE_ALIAS: { + size_t i = 0; + struct kbase_mem_phy_alloc *alloc = reg->gpu_alloc; + + /* Due to the way the number of valid PTEs and ATEs are tracked + * currently, only the GPU virtual range that is backed & mapped + * should be passed to the kbase_mmu_teardown_pages() function, + * hence individual aliased regions need to be unmapped + * separately. 
+ */ + for (i = 0; i < alloc->imported.alias.nents; i++) { + if (alloc->imported.alias.aliased[i].alloc) { + err = kbase_mmu_teardown_pages( + kctx->kbdev, &kctx->mmu, + reg->start_pfn + + (i * + alloc->imported.alias.stride), + alloc->imported.alias.aliased[i].length, + kctx->as_nr); + } + } + } break; case KBASE_MEM_TYPE_IMPORTED_UMM: err = kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn, reg->nr_pages, kctx->as_nr); @@ -1622,7 +1686,7 @@ int kbase_gpu_munmap(struct kbase_context *kctx, struct kbase_va_region *reg) } } } - /* Fall-through */ + fallthrough; default: kbase_mem_phy_alloc_gpu_unmapped(reg->gpu_alloc); break; @@ -3698,7 +3762,8 @@ static size_t kbase_mem_jit_trim_pages(struct kbase_context *kctx, static int kbase_jit_grow(struct kbase_context *kctx, const struct base_jit_alloc_info *info, struct kbase_va_region *reg, - struct kbase_sub_alloc **prealloc_sas) + struct kbase_sub_alloc **prealloc_sas, + enum kbase_caller_mmu_sync_info mmu_sync_info) { size_t delta; size_t pages_required; @@ -3795,7 +3860,7 @@ static int kbase_jit_grow(struct kbase_context *kctx, spin_unlock(&kctx->mem_partials_lock); ret = kbase_mem_grow_gpu_mapping(kctx, reg, info->commit_pages, - old_size); + old_size, mmu_sync_info); /* * The grow failed so put the allocation back in the * pool and return failure. @@ -4010,6 +4075,11 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, struct kbase_sub_alloc *prealloc_sas[2] = { NULL, NULL }; int i; + /* Calls to this function are inherently synchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; + #if MALI_USE_CSF lockdep_assert_held(&kctx->csf.kcpu_queues.lock); #else @@ -4102,7 +4172,8 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, * so any state protected by that lock might need to be * re-evaluated if more code is added here in future. */ - ret = kbase_jit_grow(kctx, info, reg, prealloc_sas); + ret = kbase_jit_grow(kctx, info, reg, prealloc_sas, + mmu_sync_info); #if MALI_JIT_PRESSURE_LIMIT_BASE if (!ignore_pressure_limit) @@ -4150,7 +4221,7 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, flags |= BASE_MEM_TILER_ALIGN_TOP; #endif /* !MALI_USE_CSF */ - flags |= base_mem_group_id_set(kctx->jit_group_id); + flags |= kbase_mem_group_id_set(kctx->jit_group_id); #if MALI_JIT_PRESSURE_LIMIT_BASE if (!ignore_pressure_limit) { flags |= BASEP_MEM_PERFORM_JIT_TRIM; @@ -4166,7 +4237,8 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, kbase_gpu_vm_unlock(kctx); reg = kbase_mem_alloc(kctx, info->va_pages, info->commit_pages, - info->extension, &flags, &gpu_addr); + info->extension, &flags, &gpu_addr, + mmu_sync_info); if (!reg) { /* Most likely not enough GPU virtual space left for * the new JIT allocation. 
@@ -4455,6 +4527,15 @@ void kbase_jit_report_update_pressure(struct kbase_context *kctx, } #endif /* MALI_JIT_PRESSURE_LIMIT_BASE */ +void kbase_unpin_user_buf_page(struct page *page) +{ +#if KERNEL_VERSION(5, 9, 0) > LINUX_VERSION_CODE + put_page(page); +#else + unpin_user_page(page); +#endif +} + #if MALI_USE_CSF static void kbase_jd_user_buf_unpin_pages(struct kbase_mem_phy_alloc *alloc) { @@ -4465,7 +4546,7 @@ static void kbase_jd_user_buf_unpin_pages(struct kbase_mem_phy_alloc *alloc) WARN_ON(alloc->nents != alloc->imported.user_buf.nr_pages); for (i = 0; i < alloc->nents; i++) - put_page(pages[i]); + kbase_unpin_user_buf_page(pages[i]); } } #endif @@ -4524,11 +4605,10 @@ KERNEL_VERSION(4, 5, 0) > LINUX_VERSION_CODE reg->flags & KBASE_REG_GPU_WR ? FOLL_WRITE : 0, pages, NULL, NULL); #else - pinned_pages = get_user_pages_remote(mm, - address, - alloc->imported.user_buf.nr_pages, - reg->flags & KBASE_REG_GPU_WR ? FOLL_WRITE : 0, - pages, NULL, NULL); + pinned_pages = pin_user_pages_remote( + mm, address, alloc->imported.user_buf.nr_pages, + reg->flags & KBASE_REG_GPU_WR ? FOLL_WRITE : 0, pages, NULL, + NULL); #endif if (pinned_pages <= 0) @@ -4536,7 +4616,7 @@ KERNEL_VERSION(4, 5, 0) > LINUX_VERSION_CODE if (pinned_pages != alloc->imported.user_buf.nr_pages) { for (i = 0; i < pinned_pages; i++) - put_page(pages[i]); + kbase_unpin_user_buf_page(pages[i]); return -ENOMEM; } @@ -4560,6 +4640,11 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx, unsigned long gwt_mask = ~0; int err = kbase_jd_user_buf_pin_pages(kctx, reg); + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + if (err) return err; @@ -4596,9 +4681,9 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx, #endif err = kbase_mmu_insert_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn, - pa, kbase_reg_current_backed_size(reg), - reg->flags & gwt_mask, kctx->as_nr, - alloc->group_id); + pa, kbase_reg_current_backed_size(reg), + reg->flags & gwt_mask, kctx->as_nr, + alloc->group_id, mmu_sync_info); if (err == 0) return 0; @@ -4612,7 +4697,7 @@ unwind: } while (++i < pinned_pages) { - put_page(pages[i]); + kbase_unpin_user_buf_page(pages[i]); pages[i] = NULL; } @@ -4642,7 +4727,7 @@ static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, if (writeable) set_page_dirty_lock(pages[i]); #if !MALI_USE_CSF - put_page(pages[i]); + kbase_unpin_user_buf_page(pages[i]); pages[i] = NULL; #endif diff --git a/mali_kbase/mali_kbase_mem.h b/mali_kbase/mali_kbase_mem.h index e9ac809..95533f5 100644 --- a/mali_kbase/mali_kbase_mem.h +++ b/mali_kbase/mali_kbase_mem.h @@ -506,6 +506,21 @@ struct kbase_va_region { int va_refcnt; }; +/** + * kbase_is_ctx_reg_zone - determine whether a KBASE_REG_ZONE_<...> is for a + * context or for a device + * @zone_bits: A KBASE_REG_ZONE_<...> to query + * + * Return: True if the zone for @zone_bits is a context zone, False otherwise + */ +static inline bool kbase_is_ctx_reg_zone(unsigned long zone_bits) +{ + WARN_ON((zone_bits & KBASE_REG_ZONE_MASK) != zone_bits); + return (zone_bits == KBASE_REG_ZONE_SAME_VA || + zone_bits == KBASE_REG_ZONE_CUSTOM_VA || + zone_bits == KBASE_REG_ZONE_EXEC_VA); +} + /* Special marker for failed JIT allocations that still must be marked as * in-use */ @@ -529,12 +544,14 @@ static inline bool kbase_is_region_invalid_or_free(struct kbase_va_region *reg) return (kbase_is_region_invalid(reg) || kbase_is_region_free(reg)); } -int 
kbase_remove_va_region(struct kbase_va_region *reg); -static inline void kbase_region_refcnt_free(struct kbase_va_region *reg) +void kbase_remove_va_region(struct kbase_device *kbdev, + struct kbase_va_region *reg); +static inline void kbase_region_refcnt_free(struct kbase_device *kbdev, + struct kbase_va_region *reg) { /* If region was mapped then remove va region*/ if (reg->start_pfn) - kbase_remove_va_region(reg); + kbase_remove_va_region(kbdev, reg); /* To detect use-after-free in debug builds */ KBASE_DEBUG_CODE(reg->flags |= KBASE_REG_FREE); @@ -569,7 +586,7 @@ static inline struct kbase_va_region *kbase_va_region_alloc_put( dev_dbg(kctx->kbdev->dev, "va_refcnt %d after put %pK\n", region->va_refcnt, (void *)region); if (!region->va_refcnt) - kbase_region_refcnt_free(region); + kbase_region_refcnt_free(kctx->kbdev, region); return NULL; } @@ -1167,10 +1184,13 @@ int kbase_alloc_phy_pages(struct kbase_va_region *reg, size_t vsize, size_t size * @addr: the address to insert the region at * @nr_pages: the number of pages in the region * @align: the minimum alignment in pages + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. * * Call kbase_add_va_region() and map the region on the GPU. */ -int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 addr, size_t nr_pages, size_t align); +int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, + u64 addr, size_t nr_pages, size_t align, + enum kbase_caller_mmu_sync_info mmu_sync_info); /** * Remove the region from the GPU and unregister it. @@ -1798,6 +1818,11 @@ struct kbase_mem_phy_alloc *kbase_map_external_resource( void kbase_unmap_external_resource(struct kbase_context *kctx, struct kbase_va_region *reg, struct kbase_mem_phy_alloc *alloc); +/** + * kbase_unpin_user_buf_page - Unpin a page of a user buffer. + * @page: page to unpin + */ +void kbase_unpin_user_buf_page(struct page *page); /** * kbase_jd_user_buf_pin_pages - Pin the pages of a user buffer. 
@@ -2025,7 +2050,7 @@ int kbase_mem_copy_to_pinned_user_pages(struct page **dest_pages, unsigned int *target_page_nr, size_t offset); /** - * kbase_ctx_reg_zone_end_pfn - return the end Page Frame Number of @zone + * kbase_reg_zone_end_pfn - return the end Page Frame Number of @zone * @zone: zone to query * * Return: The end of the zone corresponding to @zone @@ -2050,7 +2075,7 @@ static inline void kbase_ctx_reg_zone_init(struct kbase_context *kctx, struct kbase_reg_zone *zone; lockdep_assert_held(&kctx->reg_lock); - WARN_ON((zone_bits & KBASE_REG_ZONE_MASK) != zone_bits); + WARN_ON(!kbase_is_ctx_reg_zone(zone_bits)); zone = &kctx->reg_zone[KBASE_REG_ZONE_IDX(zone_bits)]; *zone = (struct kbase_reg_zone){ @@ -2073,7 +2098,7 @@ static inline struct kbase_reg_zone * kbase_ctx_reg_zone_get_nolock(struct kbase_context *kctx, unsigned long zone_bits) { - WARN_ON((zone_bits & KBASE_REG_ZONE_MASK) != zone_bits); + WARN_ON(!kbase_is_ctx_reg_zone(zone_bits)); return &kctx->reg_zone[KBASE_REG_ZONE_IDX(zone_bits)]; } @@ -2091,9 +2116,60 @@ static inline struct kbase_reg_zone * kbase_ctx_reg_zone_get(struct kbase_context *kctx, unsigned long zone_bits) { lockdep_assert_held(&kctx->reg_lock); - WARN_ON((zone_bits & KBASE_REG_ZONE_MASK) != zone_bits); + WARN_ON(!kbase_is_ctx_reg_zone(zone_bits)); return &kctx->reg_zone[KBASE_REG_ZONE_IDX(zone_bits)]; } +/** + * kbase_mem_allow_alloc - Check if allocation of GPU memory is allowed + * @kctx: Pointer to kbase context + * + * Don't allow the allocation of GPU memory if user space has not set up the + * tracking page (which sets kctx->process_mm), or if the ioctl has been issued + * from a forked child process using the mali device file fd inherited from + * the parent process. + */ +static inline bool kbase_mem_allow_alloc(struct kbase_context *kctx) +{ + bool allow_alloc = true; + + rcu_read_lock(); + allow_alloc = (rcu_dereference(kctx->process_mm) == current->mm); + rcu_read_unlock(); + + return allow_alloc; +} + +/** + * kbase_mem_group_id_get - Get group ID from flags + * @flags: Flags to pass to base_mem_alloc + * + * This inline function extracts the encoded group ID from flags + * and converts it into a numeric value (0~15). + * + * Return: group ID (0~15) extracted from the parameter + */ +static inline int kbase_mem_group_id_get(base_mem_alloc_flags flags) +{ + KBASE_DEBUG_ASSERT((flags & ~BASE_MEM_FLAGS_INPUT_MASK) == 0); + return (int)BASE_MEM_GROUP_ID_GET(flags); +} + +/** + * kbase_mem_group_id_set - Set group ID into base_mem_alloc_flags + * @id: group ID (0~15) you want to encode + * + * This inline function encodes a specific group ID into base_mem_alloc_flags. + * Parameter 'id' should lie between 0 and 15. + * + * Return: base_mem_alloc_flags with the group ID (id) encoded + * + * The return value can be combined with other flags against base_mem_alloc + * to identify a specific memory group. 
+ */ +static inline base_mem_alloc_flags kbase_mem_group_id_set(int id) +{ + return BASE_MEM_GROUP_ID_SET(id); +} #endif /* _KBASE_MEM_H_ */ diff --git a/mali_kbase/mali_kbase_mem_linux.c b/mali_kbase/mali_kbase_mem_linux.c index 21302c1..527bec4 100644 --- a/mali_kbase/mali_kbase_mem_linux.c +++ b/mali_kbase/mali_kbase_mem_linux.c @@ -291,9 +291,10 @@ void kbase_phy_alloc_mapping_put(struct kbase_context *kctx, */ } -struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, - u64 va_pages, u64 commit_pages, - u64 extension, u64 *flags, u64 *gpu_va) +struct kbase_va_region * +kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages, u64 commit_pages, + u64 extension, u64 *flags, u64 *gpu_va, + enum kbase_caller_mmu_sync_info mmu_sync_info) { int zone; struct kbase_va_region *reg; @@ -387,7 +388,7 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, goto invalid_flags; if (kbase_reg_prepare_native(reg, kctx, - base_mem_group_id_get(*flags)) != 0) { + kbase_mem_group_id_get(*flags)) != 0) { dev_err(dev, "Failed to prepare region"); goto prepare_failed; } @@ -469,7 +470,8 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, *gpu_va = (u64) cookie; } else /* we control the VA */ { - if (kbase_gpu_mmap(kctx, reg, *gpu_va, va_pages, 1) != 0) { + if (kbase_gpu_mmap(kctx, reg, *gpu_va, va_pages, 1, + mmu_sync_info) != 0) { dev_warn(dev, "Failed to map memory on GPU"); kbase_gpu_vm_unlock(kctx); goto no_mmap; @@ -604,7 +606,7 @@ int kbase_mem_query(struct kbase_context *kctx, if (KBASE_REG_GPU_VA_SAME_4GB_PAGE & reg->flags) *out |= BASE_MEM_GPU_VA_SAME_4GB_PAGE; - *out |= base_mem_group_id_set(reg->cpu_alloc->group_id); + *out |= kbase_mem_group_id_set(reg->cpu_alloc->group_id); WARN(*out & ~BASE_MEM_FLAGS_QUERYABLE, "BASE_MEM_FLAGS_QUERYABLE needs updating\n"); @@ -827,6 +829,11 @@ bool kbase_mem_evictable_unmake(struct kbase_mem_phy_alloc *gpu_alloc) struct kbase_context *kctx = gpu_alloc->imported.native.kctx; int err = 0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + lockdep_assert_held(&kctx->reg_lock); mutex_lock(&kctx->jit_evict_lock); @@ -856,9 +863,9 @@ bool kbase_mem_evictable_unmake(struct kbase_mem_phy_alloc *gpu_alloc) * pre-eviction size. */ if (!err) - err = kbase_mem_grow_gpu_mapping(kctx, - gpu_alloc->reg, - gpu_alloc->evicted, 0); + err = kbase_mem_grow_gpu_mapping( + kctx, gpu_alloc->reg, + gpu_alloc->evicted, 0, mmu_sync_info); gpu_alloc->evicted = 0; } @@ -1215,6 +1222,11 @@ int kbase_mem_umm_map(struct kbase_context *kctx, struct kbase_mem_phy_alloc *alloc; unsigned long gwt_mask = ~0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + lockdep_assert_held(&kctx->reg_lock); alloc = reg->gpu_alloc; @@ -1241,14 +1253,11 @@ int kbase_mem_umm_map(struct kbase_context *kctx, gwt_mask = ~KBASE_REG_GPU_WR; #endif - err = kbase_mmu_insert_pages(kctx->kbdev, - &kctx->mmu, - reg->start_pfn, + err = kbase_mmu_insert_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn, kbase_get_gpu_phy_pages(reg), kbase_reg_current_backed_size(reg), - reg->flags & gwt_mask, - kctx->as_nr, - alloc->group_id); + reg->flags & gwt_mask, kctx->as_nr, + alloc->group_id, mmu_sync_info); if (err) goto bad_insert; @@ -1261,13 +1270,11 @@ int kbase_mem_umm_map(struct kbase_context *kctx, * Assume alloc->nents is the number of actual pages in the * dma-buf memory. */ - err = kbase_mmu_insert_single_page(kctx, - reg->start_pfn + alloc->nents, - kctx->aliasing_sink_page, - reg->nr_pages - alloc->nents, - (reg->flags | KBASE_REG_GPU_RD) & - ~KBASE_REG_GPU_WR, - KBASE_MEM_GROUP_SINK); + err = kbase_mmu_insert_single_page( + kctx, reg->start_pfn + alloc->nents, + kctx->aliasing_sink_page, reg->nr_pages - alloc->nents, + (reg->flags | KBASE_REG_GPU_RD) & ~KBASE_REG_GPU_WR, + KBASE_MEM_GROUP_SINK, mmu_sync_info); if (err) goto bad_pad_insert; } @@ -1640,9 +1647,12 @@ KERNEL_VERSION(4, 5, 0) > LINUX_VERSION_CODE #elif KERNEL_VERSION(4, 9, 0) > LINUX_VERSION_CODE faulted_pages = get_user_pages(address, *va_pages, write, 0, pages, NULL); -#else +#elif KERNEL_VERSION(5, 9, 0) > LINUX_VERSION_CODE faulted_pages = get_user_pages(address, *va_pages, write ? FOLL_WRITE : 0, pages, NULL); +#else + faulted_pages = pin_user_pages(address, *va_pages, + write ? FOLL_WRITE : 0, pages, NULL); #endif up_read(kbase_mem_get_process_mmap_lock()); @@ -1694,7 +1704,7 @@ unwind_dma_map: fault_mismatch: if (pages) { for (i = 0; i < faulted_pages; i++) - put_page(pages[i]); + kbase_unpin_user_buf_page(pages[i]); } no_page_array: invalid_flags: @@ -1718,6 +1728,11 @@ u64 kbase_mem_alias(struct kbase_context *kctx, u64 *flags, u64 stride, size_t i; bool coherent; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + KBASE_DEBUG_ASSERT(kctx); KBASE_DEBUG_ASSERT(flags); KBASE_DEBUG_ASSERT(ai); @@ -1891,7 +1906,8 @@ u64 kbase_mem_alias(struct kbase_context *kctx, u64 *flags, u64 stride, #else if (1) { #endif - if (kbase_gpu_mmap(kctx, reg, 0, *num_pages, 1) != 0) { + if (kbase_gpu_mmap(kctx, reg, 0, *num_pages, 1, + mmu_sync_info) != 0) { dev_warn(kctx->kbdev->dev, "Failed to map memory on GPU"); goto no_mmap; } @@ -1936,6 +1952,11 @@ int kbase_mem_import(struct kbase_context *kctx, enum base_mem_import_type type, { struct kbase_va_region *reg; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + KBASE_DEBUG_ASSERT(kctx); KBASE_DEBUG_ASSERT(gpu_va); KBASE_DEBUG_ASSERT(va_pages); @@ -2035,7 +2056,8 @@ int kbase_mem_import(struct kbase_context *kctx, enum base_mem_import_type type, } else if (*flags & KBASE_MEM_IMPORT_HAVE_PAGES) { /* we control the VA, mmap now to the GPU */ - if (kbase_gpu_mmap(kctx, reg, 0, *va_pages, 1) != 0) + if (kbase_gpu_mmap(kctx, reg, 0, *va_pages, 1, mmu_sync_info) != + 0) goto no_gpu_va; /* return real GPU VA */ *gpu_va = reg->start_pfn << PAGE_SHIFT; @@ -2069,8 +2091,9 @@ bad_flags: } int kbase_mem_grow_gpu_mapping(struct kbase_context *kctx, - struct kbase_va_region *reg, - u64 new_pages, u64 old_pages) + struct kbase_va_region *reg, u64 new_pages, + u64 old_pages, + enum kbase_caller_mmu_sync_info mmu_sync_info) { struct tagged_addr *phy_pages; u64 delta = new_pages - old_pages; @@ -2081,8 +2104,10 @@ int kbase_mem_grow_gpu_mapping(struct kbase_context *kctx, /* Map the new pages into the GPU */ phy_pages = kbase_get_gpu_phy_pages(reg); ret = kbase_mmu_insert_pages(kctx->kbdev, &kctx->mmu, - reg->start_pfn + old_pages, phy_pages + old_pages, delta, - reg->flags, kctx->as_nr, reg->gpu_alloc->group_id); + reg->start_pfn + old_pages, + phy_pages + old_pages, delta, reg->flags, + kctx->as_nr, reg->gpu_alloc->group_id, + mmu_sync_info); return ret; } @@ -2136,6 +2161,11 @@ int kbase_mem_commit(struct kbase_context *kctx, u64 gpu_addr, u64 new_pages) struct kbase_va_region *reg; bool read_locked = false; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + KBASE_DEBUG_ASSERT(kctx); KBASE_DEBUG_ASSERT(gpu_addr != 0); @@ -2227,8 +2257,8 @@ int kbase_mem_commit(struct kbase_context *kctx, u64 gpu_addr, u64 new_pages) /* No update required for CPU mappings, that's done on fault. */ /* Update GPU mapping. */ - res = kbase_mem_grow_gpu_mapping(kctx, reg, - new_pages, old_pages); + res = kbase_mem_grow_gpu_mapping(kctx, reg, new_pages, + old_pages, mmu_sync_info); /* On error free the new pages */ if (res) { @@ -2647,6 +2677,11 @@ static int kbasep_reg_mmap(struct kbase_context *kctx, struct kbase_va_region *reg; int err = 0; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + *aligned_offset = 0; dev_dbg(kctx->kbdev->dev, "in kbasep_reg_mmap\n"); @@ -2681,7 +2716,7 @@ static int kbasep_reg_mmap(struct kbase_context *kctx, *nr_pages = kbase_reg_current_backed_size(reg); if (kbase_gpu_mmap(kctx, reg, vma->vm_start + *aligned_offset, - reg->nr_pages, 1) != 0) { + reg->nr_pages, 1, mmu_sync_info) != 0) { dev_err(kctx->kbdev->dev, "%s:%d\n", __FILE__, __LINE__); /* Unable to map in GPU space. 
*/ WARN_ON(1); @@ -2747,17 +2782,10 @@ int kbase_context_mmap(struct kbase_context *const kctx, goto out_unlock; } - /* if not the MTP, verify that the MTP has been mapped */ - rcu_read_lock(); - /* catches both when the special page isn't present or - * when we've forked - */ - if (rcu_dereference(kctx->process_mm) != current->mm) { + if (!kbase_mem_allow_alloc(kctx)) { err = -EINVAL; - rcu_read_unlock(); goto out_unlock; } - rcu_read_unlock(); switch (vma->vm_pgoff) { case PFN_DOWN(BASEP_MEM_INVALID_HANDLE): diff --git a/mali_kbase/mali_kbase_mem_linux.h b/mali_kbase/mali_kbase_mem_linux.h index 36159c1..f123d17 100644 --- a/mali_kbase/mali_kbase_mem_linux.h +++ b/mali_kbase/mali_kbase_mem_linux.h @@ -45,12 +45,14 @@ struct kbase_hwc_dma_mapping { * properties for the new allocation. * @gpu_va: Start address of the memory region which was allocated from GPU * virtual address space. + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. * * Return: 0 on success or error code */ -struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, - u64 va_pages, u64 commit_pages, - u64 extension, u64 *flags, u64 *gpu_va); +struct kbase_va_region * +kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages, u64 commit_pages, + u64 extension, u64 *flags, u64 *gpu_va, + enum kbase_caller_mmu_sync_info mmu_sync_info); /** * kbase_mem_query - Query properties of a GPU memory region @@ -169,6 +171,7 @@ void kbase_mem_evictable_deinit(struct kbase_context *kctx); * @reg: The GPU region * @new_pages: The number of pages after the grow * @old_pages: The number of pages before the grow + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. * * Return: 0 on success, -errno on error. * @@ -178,8 +181,9 @@ void kbase_mem_evictable_deinit(struct kbase_context *kctx); * Note: Caller must be holding the region lock. 
*/ int kbase_mem_grow_gpu_mapping(struct kbase_context *kctx, - struct kbase_va_region *reg, - u64 new_pages, u64 old_pages); + struct kbase_va_region *reg, u64 new_pages, + u64 old_pages, + enum kbase_caller_mmu_sync_info mmu_sync_info); /** * kbase_mem_evictable_make - Make a physical allocation eligible for eviction diff --git a/mali_kbase/mali_kbase_mem_profile_debugfs.c b/mali_kbase/mali_kbase_mem_profile_debugfs.c index 201ff51..7e77963 100644 --- a/mali_kbase/mali_kbase_mem_profile_debugfs.c +++ b/mali_kbase/mali_kbase_mem_profile_debugfs.c @@ -84,9 +84,9 @@ int kbasep_mem_profile_debugfs_insert(struct kbase_context *kctx, char *data, if (!kbase_ctx_flag(kctx, KCTX_MEM_PROFILE_INITIALIZED)) { if (IS_ERR_OR_NULL(kctx->kctx_dentry)) { err = -ENOMEM; - } else if (!debugfs_create_file("mem_profile", mode, - kctx->kctx_dentry, kctx, - &kbasep_mem_profile_debugfs_fops)) { + } else if (IS_ERR_OR_NULL(debugfs_create_file("mem_profile", + mode, kctx->kctx_dentry, kctx, + &kbasep_mem_profile_debugfs_fops))) { err = -EAGAIN; } else { kbase_ctx_flag_set(kctx, diff --git a/mali_kbase/mali_kbase_mem_profile_debugfs_buf_size.h b/mali_kbase/mali_kbase_mem_profile_debugfs_buf_size.h index 3184a98..1210ed5 100644 --- a/mali_kbase/mali_kbase_mem_profile_debugfs_buf_size.h +++ b/mali_kbase/mali_kbase_mem_profile_debugfs_buf_size.h @@ -30,8 +30,7 @@ * The size of the buffer to accumulate the histogram report text in * @see @ref CCTXP_HIST_BUF_SIZE_MAX_LENGTH_REPORT */ -#define KBASE_MEM_PROFILE_MAX_BUF_SIZE \ - ((size_t) (64 + ((80 + (56 * 64)) * 53) + 56)) +#define KBASE_MEM_PROFILE_MAX_BUF_SIZE ((size_t)(64 + ((80 + (56 * 64)) * 54) + 56)) #endif /*_KBASE_MEM_PROFILE_DEBUGFS_BUF_SIZE_H_*/ diff --git a/mali_kbase/mali_kbase_pbha.c b/mali_kbase/mali_kbase_pbha.c new file mode 100644 index 0000000..3e58a7b --- /dev/null +++ b/mali_kbase/mali_kbase_pbha.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + */ + +#include "mali_kbase_pbha.h" + +#include <device/mali_kbase_device.h> +#include <mali_kbase.h> +#define DTB_SET_SIZE 2 + +static bool read_setting_valid(unsigned int id, unsigned int read_setting) +{ + switch (id) { + /* Valid ID - fall through all */ + case SYSC_ALLOC_ID_R_OTHER: + case SYSC_ALLOC_ID_R_CSF: + case SYSC_ALLOC_ID_R_MMU: + case SYSC_ALLOC_ID_R_TILER_VERT: + case SYSC_ALLOC_ID_R_TILER_PTR: + case SYSC_ALLOC_ID_R_TILER_INDEX: + case SYSC_ALLOC_ID_R_TILER_OTHER: + case SYSC_ALLOC_ID_R_IC: + case SYSC_ALLOC_ID_R_ATTR: + case SYSC_ALLOC_ID_R_SCM: + case SYSC_ALLOC_ID_R_FSDC: + case SYSC_ALLOC_ID_R_VL: + case SYSC_ALLOC_ID_R_PLR: + case SYSC_ALLOC_ID_R_TEX: + case SYSC_ALLOC_ID_R_LSC: + switch (read_setting) { + /* Valid setting value - fall through all */ + case SYSC_ALLOC_L2_ALLOC: + case SYSC_ALLOC_NEVER_ALLOC: + case SYSC_ALLOC_ALWAYS_ALLOC: + case SYSC_ALLOC_PTL_ALLOC: + case SYSC_ALLOC_L2_PTL_ALLOC: + return true; + default: + return false; + } + default: + return false; + } + + /* Unreachable */ + return false; +} + +static bool write_setting_valid(unsigned int id, unsigned int write_setting) +{ + switch (id) { + /* Valid ID - fall through all */ + case SYSC_ALLOC_ID_W_OTHER: + case SYSC_ALLOC_ID_W_CSF: + case SYSC_ALLOC_ID_W_PCB: + case SYSC_ALLOC_ID_W_TILER_PTR: + case SYSC_ALLOC_ID_W_TILER_VERT_PLIST: + case SYSC_ALLOC_ID_W_TILER_OTHER: + case SYSC_ALLOC_ID_W_L2_EVICT: + case SYSC_ALLOC_ID_W_L2_FLUSH: + case SYSC_ALLOC_ID_W_TIB_COLOR: + case SYSC_ALLOC_ID_W_TIB_COLOR_AFBCH: + case SYSC_ALLOC_ID_W_TIB_COLOR_AFBCB: + case SYSC_ALLOC_ID_W_TIB_CRC: + case SYSC_ALLOC_ID_W_TIB_DS: + case SYSC_ALLOC_ID_W_TIB_DS_AFBCH: + case SYSC_ALLOC_ID_W_TIB_DS_AFBCB: + case SYSC_ALLOC_ID_W_LSC: + switch (write_setting) { + /* Valid setting value - fall through all */ + case SYSC_ALLOC_L2_ALLOC: + case SYSC_ALLOC_NEVER_ALLOC: + case SYSC_ALLOC_ALWAYS_ALLOC: + case SYSC_ALLOC_PTL_ALLOC: + case SYSC_ALLOC_L2_PTL_ALLOC: + return true; + default: + return false; + } + default: + return false; + } + + /* Unreachable */ + return false; +} + +static bool settings_valid(unsigned int id, unsigned int read_setting, + unsigned int write_setting) +{ + bool settings_valid = false; + + if (id < SYSC_ALLOC_COUNT * sizeof(u32)) { + settings_valid = read_setting_valid(id, read_setting) && + write_setting_valid(id, write_setting); + } + + return settings_valid; +} + +bool kbasep_pbha_supported(struct kbase_device *kbdev) +{ + const u32 arch_maj_rev = + ARCH_MAJOR_REV_REG(kbdev->gpu_props.props.raw_props.gpu_id); + + return (arch_maj_rev >= GPU_ID2_ARCH_MAJOR_REV_MAKE(11, 3)); +} + +int kbase_pbha_record_settings(struct kbase_device *kbdev, bool runtime, + unsigned int id, unsigned int read_setting, + unsigned int write_setting) +{ + bool const valid = settings_valid(id, read_setting, write_setting); + + if (valid) { + unsigned int const sysc_alloc_num = id / sizeof(u32); + u32 modified_reg; + if (runtime) { + int i; + + kbase_pm_context_active(kbdev); + /* Ensure host copy of SYSC_ALLOC is up to date */ + for (i = 0; i < SYSC_ALLOC_COUNT; i++) + kbdev->sysc_alloc[i] = kbase_reg_read( + kbdev, GPU_CONTROL_REG(SYSC_ALLOC(i))); + kbase_pm_context_idle(kbdev); + } + + modified_reg = kbdev->sysc_alloc[sysc_alloc_num]; + + switch (id % sizeof(u32)) { + case 0: + modified_reg = SYSC_ALLOC_R_SYSC_ALLOC0_SET( + modified_reg, read_setting); + modified_reg = SYSC_ALLOC_W_SYSC_ALLOC0_SET( + modified_reg, write_setting); + break; + case 1: + modified_reg = SYSC_ALLOC_R_SYSC_ALLOC1_SET( + modified_reg, 
read_setting); + modified_reg = SYSC_ALLOC_W_SYSC_ALLOC1_SET( + modified_reg, write_setting); + break; + case 2: + modified_reg = SYSC_ALLOC_R_SYSC_ALLOC2_SET( + modified_reg, read_setting); + modified_reg = SYSC_ALLOC_W_SYSC_ALLOC2_SET( + modified_reg, write_setting); + break; + case 3: + modified_reg = SYSC_ALLOC_R_SYSC_ALLOC3_SET( + modified_reg, read_setting); + modified_reg = SYSC_ALLOC_W_SYSC_ALLOC3_SET( + modified_reg, write_setting); + break; + } + + kbdev->sysc_alloc[sysc_alloc_num] = modified_reg; + } + + return valid ? 0 : -EINVAL; +} + +void kbase_pbha_write_settings(struct kbase_device *kbdev) +{ + if (kbasep_pbha_supported(kbdev)) { + int i; + for (i = 0; i < SYSC_ALLOC_COUNT; ++i) + kbase_reg_write(kbdev, GPU_CONTROL_REG(SYSC_ALLOC(i)), + kbdev->sysc_alloc[i]); + } +} + +int kbase_pbha_read_dtb(struct kbase_device *kbdev) +{ + u32 dtb_data[SYSC_ALLOC_COUNT * sizeof(u32) * DTB_SET_SIZE]; + const struct device_node *pbha_node; + int sz, i; + bool valid = true; + + if (!kbasep_pbha_supported(kbdev)) + return 0; + + pbha_node = of_get_child_by_name(kbdev->dev->of_node, "pbha"); + if (!pbha_node) + return 0; + + sz = of_property_count_elems_of_size(pbha_node, "int_id_override", + sizeof(u32)); + if (sz <= 0 || (sz % DTB_SET_SIZE != 0)) { + dev_err(kbdev->dev, "Bad DTB format: pbha.int_id_override\n"); + return -EINVAL; + } + if (of_property_read_u32_array(pbha_node, "int_id_override", dtb_data, + sz) != 0) { + dev_err(kbdev->dev, + "Failed to read DTB pbha.int_id_override\n"); + return -EINVAL; + } + + for (i = 0; valid && i < sz; i = i + DTB_SET_SIZE) { + unsigned int rdset = + SYSC_ALLOC_R_SYSC_ALLOC0_GET(dtb_data[i + 1]); + unsigned int wrset = + SYSC_ALLOC_W_SYSC_ALLOC0_GET(dtb_data[i + 1]); + valid = valid && + (kbase_pbha_record_settings(kbdev, false, dtb_data[i], + rdset, wrset) == 0); + if (valid) + dev_info(kbdev->dev, + "pbha.int_id_override 0x%x r0x%x w0x%x\n", + dtb_data[i], rdset, wrset); + } + if (i != sz || (!valid)) { + dev_err(kbdev->dev, + "Failed recording DTB data (pbha.int_id_override)\n"); + return -EINVAL; + } + return 0; +} diff --git a/mali_kbase/mali_kbase_pbha.h b/mali_kbase/mali_kbase_pbha.h new file mode 100644 index 0000000..6861773 --- /dev/null +++ b/mali_kbase/mali_kbase_pbha.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _KBASE_PBHA_H +#define _KBASE_PBHA_H + +#include <mali_kbase.h> + +/** + * kbasep_pbha_supported - check whether PBHA registers are + * available + * + * Should only be used in mali_kbase_pbha* files - thus the + * kbase[p] prefix. 
+ * + * @kbdev: Device pointer + * + * Return: True if pbha is supported, false otherwise + */ +bool kbasep_pbha_supported(struct kbase_device *kbdev); + +/** + * kbase_pbha_record_settings - record PBHA settings to be applied when + * L2 is powered down + * + * @kbdev: Device pointer + * @runtime: true if it's called at runtime and false if it's called on init. + * @id: memory access source ID + * @read_setting: Read setting + * @write_setting: Write setting + * + * Return: 0 on success, otherwise error code. + */ +int kbase_pbha_record_settings(struct kbase_device *kbdev, bool runtime, + unsigned int id, unsigned int read_setting, + unsigned int write_setting); + +/** + * kbase_pbha_write_settings - write recorded PBHA settings to GPU + * registers + * + * Only valid to call this function when L2 is powered down, otherwise + * this will not affect PBHA settings. + * + * @kbdev: Device pointer + */ +void kbase_pbha_write_settings(struct kbase_device *kbdev); + +/** + * kbase_pbha_read_dtb - read PBHA settings from DTB and record it to be + * applied when L2 is powered down + * + * @kbdev: Device pointer + * + * Return: 0 on success, otherwise error code. + */ +int kbase_pbha_read_dtb(struct kbase_device *kbdev); + +#endif /* _KBASE_PBHA_H */ diff --git a/mali_kbase/mali_kbase_pbha_debugfs.c b/mali_kbase/mali_kbase_pbha_debugfs.c new file mode 100644 index 0000000..47eab63 --- /dev/null +++ b/mali_kbase/mali_kbase_pbha_debugfs.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + */ + +#include "mali_kbase_pbha_debugfs.h" + +#include "mali_kbase_pbha.h" + +#include <device/mali_kbase_device.h> +#include <mali_kbase_reset_gpu.h> +#include <mali_kbase.h> + +static int int_id_overrides_show(struct seq_file *sfile, void *data) +{ + struct kbase_device *kbdev = sfile->private; + int i; + + kbase_pm_context_active(kbdev); + + /* Minimal header for readability */ + seq_puts(sfile, "// R W\n"); + for (i = 0; i < SYSC_ALLOC_COUNT; ++i) { + int j; + u32 reg = kbase_reg_read(kbdev, GPU_CONTROL_REG(SYSC_ALLOC(i))); + + for (j = 0; j < sizeof(u32); ++j) { + u8 r_val; + u8 w_val; + + switch (j) { + case 0: + r_val = SYSC_ALLOC_R_SYSC_ALLOC0_GET(reg); + w_val = SYSC_ALLOC_W_SYSC_ALLOC0_GET(reg); + break; + case 1: + r_val = SYSC_ALLOC_R_SYSC_ALLOC1_GET(reg); + w_val = SYSC_ALLOC_W_SYSC_ALLOC1_GET(reg); + break; + case 2: + r_val = SYSC_ALLOC_R_SYSC_ALLOC2_GET(reg); + w_val = SYSC_ALLOC_W_SYSC_ALLOC2_GET(reg); + break; + case 3: + r_val = SYSC_ALLOC_R_SYSC_ALLOC3_GET(reg); + w_val = SYSC_ALLOC_W_SYSC_ALLOC3_GET(reg); + break; + } + seq_printf(sfile, "%2zu 0x%x 0x%x\n", + (i * sizeof(u32)) + j, r_val, w_val); + } + } + kbase_pm_context_idle(kbdev); + + return 0; +} + +static ssize_t int_id_overrides_write(struct file *file, + const char __user *ubuf, size_t count, + loff_t *ppos) +{ + struct seq_file *sfile = file->private_data; + struct kbase_device *kbdev = sfile->private; + char raw_str[128]; + unsigned int id; + unsigned int r_val; + unsigned int w_val; + + if (count >= sizeof(raw_str)) + return -E2BIG; + if (copy_from_user(raw_str, ubuf, count)) + return -EINVAL; + raw_str[count] = '\0'; + + if (sscanf(raw_str, "%u %x %x", &id, &r_val, &w_val) != 3) + return -EINVAL; + + if (kbase_pbha_record_settings(kbdev, true, id, r_val, w_val)) + return -EINVAL; + + /* This is a debugfs config write, so reset GPU such that changes take effect ASAP */ + kbase_pm_context_active(kbdev); + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) + kbase_reset_gpu(kbdev); + kbase_pm_context_idle(kbdev); + + return count; +} + +static int int_id_overrides_open(struct inode *in, struct file *file) +{ + return single_open(file, int_id_overrides_show, in->i_private); +} + +static const struct file_operations pbha_int_id_overrides_fops = { + .owner = THIS_MODULE, + .open = int_id_overrides_open, + .read = seq_read, + .write = int_id_overrides_write, + .llseek = seq_lseek, + .release = single_release, +}; + +void kbase_pbha_debugfs_init(struct kbase_device *kbdev) +{ + if (kbasep_pbha_supported(kbdev)) { +#if (KERNEL_VERSION(4, 7, 0) <= LINUX_VERSION_CODE) + /* only for newer kernel version debug file system is safe */ + const mode_t mode = 0644; +#else + const mode_t mode = 0600; +#endif + struct dentry *debugfs_pbha_dir = debugfs_create_dir( + "pbha", kbdev->mali_debugfs_directory); + if (IS_ERR_OR_NULL(debugfs_pbha_dir)) { + dev_err(kbdev->dev, + "Couldn't create mali debugfs page-based hardware attributes directory\n"); + return; + } + + debugfs_create_file("int_id_overrides", mode, debugfs_pbha_dir, + kbdev, &pbha_int_id_overrides_fops); + } +} diff --git a/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_control_registers.h b/mali_kbase/mali_kbase_pbha_debugfs.h index b62a8b0..3f477b4 100644 --- a/common/include/uapi/gpu/arm/midgard/csf/mali_gpu_csf_control_registers.h +++ b/mali_kbase/mali_kbase_pbha_debugfs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. 
+ * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -19,14 +19,16 @@ * */ -/* - * This header was autogenerated, it should not be edited. - */ +#ifndef _KBASE_PBHA_DEBUGFS_H +#define _KBASE_PBHA_DEBUGFS_H -#ifndef _UAPI_GPU_CSF_CONTROL_REGISTERS_H_ -#define _UAPI_GPU_CSF_CONTROL_REGISTERS_H_ +#include <mali_kbase.h> -/* GPU_REGISTERS register offsets */ -#define GPU_CONTROL_MCU 0x3000 /* () MCU control registers */ +/** + * kbase_pbha_debugfs_init - Initialize pbha debugfs directory + * + * @kbdev: Device pointer + */ +void kbase_pbha_debugfs_init(struct kbase_device *kbdev); -#endif /* _UAPI_GPU_CSF_CONTROL_REGISTERS_H_ */ +#endif /* _KBASE_PBHA_DEBUGFS_H */ diff --git a/mali_kbase/mali_kbase_pm.c b/mali_kbase/mali_kbase_pm.c index de100dd..4078da1 100644 --- a/mali_kbase/mali_kbase_pm.c +++ b/mali_kbase/mali_kbase_pm.c @@ -26,6 +26,7 @@ #include <mali_kbase.h> #include <gpu/mali_kbase_gpu_regmap.h> #include <mali_kbase_vinstr.h> +#include <mali_kbase_kinstr_prfcnt.h> #include <mali_kbase_hwcnt_context.h> #include <mali_kbase_pm.h> @@ -76,13 +77,13 @@ int kbase_pm_context_active_handle_suspend(struct kbase_device *kbdev, case KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE: if (kbdev->pm.active_count != 0) break; - /* FALLTHROUGH */ + fallthrough; case KBASE_PM_SUSPEND_HANDLER_DONT_INCREASE: kbase_pm_unlock(kbdev); return 1; case KBASE_PM_SUSPEND_HANDLER_NOT_POSSIBLE: - /* FALLTHROUGH */ + fallthrough; default: KBASE_DEBUG_ASSERT_MSG(false, "unreachable"); break; @@ -147,10 +148,11 @@ void kbase_pm_driver_suspend(struct kbase_device *kbdev) { KBASE_DEBUG_ASSERT(kbdev); - /* Suspend vinstr. This blocks until the vinstr worker and timer are - * no longer running. + /* Suspend HW counter intermediaries. This blocks until workers and timers + * are no longer running. */ kbase_vinstr_suspend(kbdev->vinstr_ctx); + kbase_kinstr_prfcnt_suspend(kbdev->kinstr_prfcnt_ctx); /* Disable GPU hardware counters. * This call will block until counters are disabled. @@ -266,8 +268,9 @@ void kbase_pm_driver_resume(struct kbase_device *kbdev, bool arb_gpu_start) spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); #endif - /* Resume vinstr */ + /* Resume HW counter intermediaries. */ kbase_vinstr_resume(kbdev->vinstr_ctx); + kbase_kinstr_prfcnt_resume(kbdev->kinstr_prfcnt_ctx); } void kbase_pm_suspend(struct kbase_device *kbdev) diff --git a/mali_kbase/mali_kbase_regs_history_debugfs.h b/mali_kbase/mali_kbase_regs_history_debugfs.h index 3b181d3..26decb4 100644 --- a/mali_kbase/mali_kbase_regs_history_debugfs.h +++ b/mali_kbase/mali_kbase_regs_history_debugfs.h @@ -70,6 +70,15 @@ void kbase_io_history_dump(struct kbase_device *kbdev); void kbasep_regs_history_debugfs_init(struct kbase_device *kbdev); #else /* defined(CONFIG_DEBUG_FS) && !IS_ENABLED(CONFIG_MALI_NO_MALI) */ + +#define kbase_io_history_init(...) 
((int)0) + +#define kbase_io_history_term CSTD_NOP + +#define kbase_io_history_dump CSTD_NOP + +#define kbasep_regs_history_debugfs_init CSTD_NOP + #endif /* defined(CONFIG_DEBUG_FS) && !IS_ENABLED(CONFIG_MALI_NO_MALI) */ #endif /*_KBASE_REGS_HISTORY_DEBUGFS_H*/ diff --git a/mali_kbase/mali_kbase_reset_gpu.h b/mali_kbase/mali_kbase_reset_gpu.h index 897b732..7502fe8 100644 --- a/mali_kbase/mali_kbase_reset_gpu.h +++ b/mali_kbase/mali_kbase_reset_gpu.h @@ -91,7 +91,8 @@ int kbase_reset_gpu_prevent_and_wait(struct kbase_device *kbdev); * Refer to kbase_reset_gpu_prevent_and_wait() for more information. * * Return: 0 on success. -EAGAIN if a reset is currently happening. Other - * negative error codes on failure. + * negative error codes on failure, where -ENOMEM indicates that GPU reset + * had failed. */ int kbase_reset_gpu_try_prevent(struct kbase_device *kbdev); diff --git a/mali_kbase/mali_kbase_vinstr.c b/mali_kbase/mali_kbase_vinstr.c index d00bc00..6a1e782 100644 --- a/mali_kbase/mali_kbase_vinstr.c +++ b/mali_kbase/mali_kbase_vinstr.c @@ -24,6 +24,7 @@ #include "mali_kbase_hwcnt_types.h" #include <uapi/gpu/arm/midgard/mali_kbase_hwcnt_reader.h> #include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_hwcnt_gpu_narrow.h" #include <uapi/gpu/arm/midgard/mali_kbase_ioctl.h> #include "mali_malisw.h" #include "mali_kbase_debug.h" @@ -55,8 +56,8 @@ * @metadata: Hardware counter metadata provided by virtualizer. * @metadata_user: API compatible hardware counter metadata provided by vinstr. * For compatibility with the user driver interface, this - * contains a "truncated" version of the HWCNT metadata limited - * to 64 entries per block. NULL when not required. + * contains a narrowed version of the HWCNT metadata limited + * to 64 entries per block of 32 bits each. * @lock: Lock protecting all vinstr state. * @suspend_count: Suspend reference count. If non-zero, timer and worker are * prevented from being re-scheduled. @@ -68,7 +69,7 @@ struct kbase_vinstr_context { struct kbase_hwcnt_virtualizer *hvirt; const struct kbase_hwcnt_metadata *metadata; - const struct kbase_hwcnt_metadata *metadata_user; + const struct kbase_hwcnt_metadata_narrow *metadata_user; struct mutex lock; size_t suspend_count; size_t client_count; @@ -89,8 +90,8 @@ struct kbase_vinstr_context { * occur. If 0, not a periodic client. * @enable_map: Counters enable map. * @tmp_buf: Temporary buffer to use before handing dump to client. - * @dump_bufs: Array of dump buffers allocated by this client. - * @dump_bufs_meta: Metadata of dump buffers. + * @dump_bufs: Array of narrow dump buffers allocated by this client. + * @dump_bufs_meta: Metadata of hwcnt reader client buffers. * @meta_idx: Index of metadata being accessed by userspace. * @read_idx: Index of buffer read by userspace. * @write_idx: Index of buffer being written by dump worker. 
@@ -104,7 +105,7 @@ struct kbase_vinstr_client { u32 dump_interval_ns; struct kbase_hwcnt_enable_map enable_map; struct kbase_hwcnt_dump_buffer tmp_buf; - struct kbase_hwcnt_dump_buffer_array dump_bufs; + struct kbase_hwcnt_dump_buffer_narrow_array dump_bufs; struct kbase_hwcnt_reader_metadata *dump_bufs_meta; atomic_t meta_idx; atomic_t read_idx; @@ -190,7 +191,7 @@ static int kbasep_vinstr_client_dump( unsigned int write_idx; unsigned int read_idx; struct kbase_hwcnt_dump_buffer *tmp_buf; - struct kbase_hwcnt_dump_buffer *dump_buf; + struct kbase_hwcnt_dump_buffer_narrow *dump_buf; struct kbase_hwcnt_reader_metadata *meta; u8 clk_cnt; @@ -223,17 +224,11 @@ static int kbasep_vinstr_client_dump( * variant will explicitly zero any non-enabled counters to ensure * nothing except exactly what the user asked for is made visible. * - * If the metadata in vinstr (vctx->metadata_user) is not NULL, it means - * vinstr has the truncated metadata, so do a narrow copy since - * virtualizer has a bigger buffer but user only needs part of it. - * otherwise we do a full copy. + * A narrow copy is required since virtualizer has a bigger buffer + * but user only needs part of it. */ - if (vcli->vctx->metadata_user) - kbase_hwcnt_dump_buffer_copy_strict_narrow(dump_buf, tmp_buf, - &vcli->enable_map); - else - kbase_hwcnt_dump_buffer_copy_strict(dump_buf, tmp_buf, - &vcli->enable_map); + kbase_hwcnt_dump_buffer_copy_strict_narrow(dump_buf, tmp_buf, + &vcli->enable_map); clk_cnt = vcli->vctx->metadata->clk_cnt; @@ -388,7 +383,7 @@ static void kbasep_vinstr_client_destroy(struct kbase_vinstr_client *vcli) kbase_hwcnt_virtualizer_client_destroy(vcli->hvcli); kfree(vcli->dump_bufs_meta); - kbase_hwcnt_dump_buffer_array_free(&vcli->dump_bufs); + kbase_hwcnt_dump_buffer_narrow_array_free(&vcli->dump_bufs); kbase_hwcnt_dump_buffer_free(&vcli->tmp_buf); kbase_hwcnt_enable_map_free(&vcli->enable_map); kfree(vcli); @@ -446,20 +441,11 @@ static int kbasep_vinstr_client_create( /* Enable all the available clk_enable_map. */ vcli->enable_map.clk_enable_map = (1ull << vctx->metadata->clk_cnt) - 1; - if (vctx->metadata_user) - /* Use vinstr's truncated metadata to alloc dump buffers which - * interact with clients. - */ - errcode = - kbase_hwcnt_dump_buffer_array_alloc(vctx->metadata_user, - setup->buffer_count, - &vcli->dump_bufs); - else - /* Use metadata from virtualizer to allocate dump buffers if - * vinstr doesn't have the truncated metadata. - */ - errcode = kbase_hwcnt_dump_buffer_array_alloc( - vctx->metadata, setup->buffer_count, &vcli->dump_bufs); + /* Use vinstr's narrowed metadata to alloc narrow dump buffers which + * interact with clients. 
+ */ + errcode = kbase_hwcnt_dump_buffer_narrow_array_alloc( + vctx->metadata_user, setup->buffer_count, &vcli->dump_bufs); if (errcode) goto error; @@ -504,9 +490,8 @@ int kbase_vinstr_init( vctx->hvirt = hvirt; vctx->metadata = metadata; - vctx->metadata_user = NULL; - errcode = kbase_hwcnt_gpu_metadata_create_truncate_64( - &vctx->metadata_user, metadata); + errcode = kbase_hwcnt_gpu_metadata_narrow_create(&vctx->metadata_user, + metadata); if (errcode) goto err_metadata_create; @@ -543,8 +528,7 @@ void kbase_vinstr_term(struct kbase_vinstr_context *vctx) } } - if (vctx->metadata_user) - kbase_hwcnt_metadata_destroy(vctx->metadata_user); + kbase_hwcnt_gpu_metadata_narrow_destroy(vctx->metadata_user); WARN_ON(vctx->client_count != 0); kfree(vctx); @@ -1007,14 +991,8 @@ static long kbasep_vinstr_hwcnt_reader_ioctl( cli, (u32 __user *)arg); break; case _IOC_NR(KBASE_HWCNT_READER_GET_BUFFER_SIZE): - if (cli->vctx->metadata_user) - rcode = put_user( - (u32)cli->vctx->metadata_user->dump_buf_bytes, - (u32 __user *)arg); - else - rcode = put_user( - (u32)cli->vctx->metadata->dump_buf_bytes, - (u32 __user *)arg); + rcode = put_user((u32)cli->vctx->metadata_user->dump_buf_bytes, + (u32 __user *)arg); break; case _IOC_NR(KBASE_HWCNT_READER_DUMP): rcode = kbasep_vinstr_hwcnt_reader_ioctl_dump(cli); diff --git a/mali_kbase/mali_malisw.h b/mali_kbase/mali_malisw.h index c0649f2..3ddfcd9 100644 --- a/mali_kbase/mali_malisw.h +++ b/mali_kbase/mali_malisw.h @@ -96,4 +96,9 @@ */ #define CSTD_STR2(x) CSTD_STR1(x) +/* LINUX_VERSION_CODE < 5.4 */ +#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) +#define fallthrough CSTD_NOP(...) /* fallthrough */ +#endif + #endif /* _MALISW_H_ */ diff --git a/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c b/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c index 05253ae..c9ba3fc 100644 --- a/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c +++ b/mali_kbase/mmu/backend/mali_kbase_mmu_csf.c @@ -130,6 +130,7 @@ void kbase_mmu_report_mcu_as_fault_and_reset(struct kbase_device *kbdev, if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); + } KBASE_EXPORT_TEST_API(kbase_mmu_report_mcu_as_fault_and_reset); @@ -482,8 +483,6 @@ static void kbase_mmu_gpu_fault_worker(struct work_struct *data) kbase_csf_ctx_handle_fault(kctx, fault); kbase_ctx_sched_release_ctx_lock(kctx); - atomic_dec(&kbdev->faults_pending); - /* A work for GPU fault is complete. * Till reaching here, no further GPU fault will be reported. * Now clear the GPU fault to allow next GPU fault interrupt report. 
@@ -492,6 +491,8 @@ static void kbase_mmu_gpu_fault_worker(struct work_struct *data) kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), GPU_COMMAND_CLEAR_FAULT); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + atomic_dec(&kbdev->faults_pending); } /** diff --git a/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c b/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c index 01ca419..b050be8 100644 --- a/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c +++ b/mali_kbase/mmu/backend/mali_kbase_mmu_jm.c @@ -185,6 +185,7 @@ void kbase_mmu_report_fault_and_kill(struct kbase_context *kctx, KBASE_MMU_FAULT_TYPE_PAGE_UNEXPECTED); kbase_mmu_hw_enable_fault(kbdev, as, KBASE_MMU_FAULT_TYPE_PAGE_UNEXPECTED); + } /** diff --git a/mali_kbase/mmu/mali_kbase_mmu.c b/mali_kbase/mmu/mali_kbase_mmu.c index e3c5b15..5f6cc7a 100644 --- a/mali_kbase/mmu/mali_kbase_mmu.c +++ b/mali_kbase/mmu/mali_kbase_mmu.c @@ -43,7 +43,6 @@ #include <device/mali_kbase_device.h> #include <mali_kbase_trace_gpu_mem.h> -#define KBASE_MMU_PAGE_ENTRIES 512 /** * kbase_mmu_flush_invalidate() - Flush and invalidate the GPU caches. @@ -62,9 +61,12 @@ * If sync is set then accesses in the flushed region will be drained * before data is flush and invalidated through L1, L2 and into memory, * after which point this function will return. + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. */ -static void kbase_mmu_flush_invalidate(struct kbase_context *kctx, - u64 vpfn, size_t nr, bool sync); +static void +kbase_mmu_flush_invalidate(struct kbase_context *kctx, u64 vpfn, size_t nr, + bool sync, + enum kbase_caller_mmu_sync_info mmu_sync_info); /** * kbase_mmu_flush_invalidate_no_ctx() - Flush and invalidate the GPU caches. @@ -73,11 +75,13 @@ static void kbase_mmu_flush_invalidate(struct kbase_context *kctx, * @nr: The number of pages to flush. * @sync: Set if the operation should be synchronous or not. * @as_nr: GPU address space number for which flush + invalidate is required. + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. * * This is used for MMU tables which do not belong to a user space context. */ -static void kbase_mmu_flush_invalidate_no_ctx(struct kbase_device *kbdev, - u64 vpfn, size_t nr, bool sync, int as_nr); +static void kbase_mmu_flush_invalidate_no_ctx( + struct kbase_device *kbdev, u64 vpfn, size_t nr, bool sync, int as_nr, + enum kbase_caller_mmu_sync_info mmu_sync_info); /** * kbase_mmu_sync_pgd() - sync page directory to memory when needed. @@ -112,6 +116,31 @@ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, unsigned long flags, int group_id); /** + * kbase_mmu_update_and_free_parent_pgds() - Update number of valid entries and + * free memory of the page directories + * + * @kbdev: Device pointer. + * @mmut: GPU MMU page table. + * @pgds: Physical addresses of page directories to be freed. + * @vpfn: The virtual page frame number. + * @level: The level of MMU page table. + */ +static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, + struct kbase_mmu_table *mmut, + phys_addr_t *pgds, u64 vpfn, + int level); +/** + * kbase_mmu_free_pgd() - Free memory of the page directory + * + * @kbdev: Device pointer. + * @mmut: GPU MMU page table. + * @pgd: Physical address of page directory to be freed. + * @dirty: Flag to indicate whether the page may be dirty in the cache. 
+ */ +static void kbase_mmu_free_pgd(struct kbase_device *kbdev, + struct kbase_mmu_table *mmut, phys_addr_t pgd, + bool dirty); +/** * reg_grow_calc_extra_pages() - Calculate the number of backed pages to add to * a region on a GPU page fault * @kbdev: KBase device @@ -191,17 +220,31 @@ static size_t reg_grow_calc_extra_pages(struct kbase_device *kbdev, } #ifdef CONFIG_MALI_CINSTR_GWT -static void kbase_gpu_mmu_handle_write_faulting_as( - struct kbase_device *kbdev, - struct kbase_as *faulting_as, - u64 start_pfn, size_t nr, u32 op) +static void kbase_gpu_mmu_handle_write_faulting_as(struct kbase_device *kbdev, + struct kbase_as *faulting_as, + u64 start_pfn, size_t nr, + u32 kctx_id) { + /* Calls to this function are inherently synchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; + struct kbase_mmu_hw_op_param op_param; + mutex_lock(&kbdev->mmu_hw_mutex); kbase_mmu_hw_clear_fault(kbdev, faulting_as, KBASE_MMU_FAULT_TYPE_PAGE); - kbase_mmu_hw_do_operation(kbdev, faulting_as, start_pfn, - nr, op, 1); + + /* flush L2 and unlock the VA (resumes the MMU) */ + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = start_pfn, + .nr = nr, + .op = KBASE_MMU_OP_FLUSH_PT, + .kctx_id = kctx_id, + .mmu_sync_info = mmu_sync_info, + }; + kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); mutex_unlock(&kbdev->mmu_hw_mutex); @@ -217,7 +260,6 @@ static void kbase_gpu_mmu_handle_write_fault(struct kbase_context *kctx, struct kbase_device *kbdev; struct kbase_fault *fault; u64 fault_pfn, pfn_offset; - u32 op; int ret; int as_no; @@ -280,11 +322,8 @@ static void kbase_gpu_mmu_handle_write_fault(struct kbase_context *kctx, &kbase_get_gpu_phy_pages(region)[pfn_offset], 1, region->flags, region->gpu_alloc->group_id); - /* flush L2 and unlock the VA (resumes the MMU) */ - op = AS_COMMAND_FLUSH_PT; - - kbase_gpu_mmu_handle_write_faulting_as(kbdev, faulting_as, - fault_pfn, 1, op); + kbase_gpu_mmu_handle_write_faulting_as(kbdev, faulting_as, fault_pfn, 1, + kctx->id); kbase_gpu_vm_unlock(kctx); } @@ -554,6 +593,11 @@ void kbase_mmu_page_fault_worker(struct work_struct *data) size_t pages_trimmed = 0; #endif + /* Calls to this function are inherently synchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; + faulting_as = container_of(data, struct kbase_as, work_pagefault); fault = &faulting_as->pf_data; fault_pfn = fault->addr >> PAGE_SHIFT; @@ -720,6 +764,8 @@ page_fault_retry: current_backed_size = kbase_reg_current_backed_size(region); if (fault_rel_pfn < current_backed_size) { + struct kbase_mmu_hw_op_param op_param; + dev_dbg(kbdev->dev, "Page fault @ 0x%llx in allocated region 0x%llx-0x%llx of growable TMEM: Ignoring", fault->addr, region->start_pfn, @@ -738,8 +784,14 @@ page_fault_retry: * transaction (which should cause the other page fault to be * raised again). 
*/ - kbase_mmu_hw_do_operation(kbdev, faulting_as, 0, 0, - AS_COMMAND_UNLOCK, 1); + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = 0, + .nr = 0, + .op = KBASE_MMU_OP_UNLOCK, + .kctx_id = kctx->id, + .mmu_sync_info = mmu_sync_info, + }; + kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); mutex_unlock(&kbdev->mmu_hw_mutex); @@ -758,14 +810,23 @@ page_fault_retry: new_pages); if (new_pages == 0) { + struct kbase_mmu_hw_op_param op_param; + mutex_lock(&kbdev->mmu_hw_mutex); /* Duplicate of a fault we've already handled, nothing to do */ kbase_mmu_hw_clear_fault(kbdev, faulting_as, KBASE_MMU_FAULT_TYPE_PAGE); + /* See comment [1] about UNLOCK usage */ - kbase_mmu_hw_do_operation(kbdev, faulting_as, 0, 0, - AS_COMMAND_UNLOCK, 1); + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = 0, + .nr = 0, + .op = KBASE_MMU_OP_UNLOCK, + .kctx_id = kctx->id, + .mmu_sync_info = mmu_sync_info, + }; + kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); mutex_unlock(&kbdev->mmu_hw_mutex); @@ -791,7 +852,7 @@ page_fault_retry: if (grown) { u64 pfn_offset; - u32 op; + struct kbase_mmu_hw_op_param op_param; /* alloc success */ WARN_ON(kbase_reg_current_backed_size(region) > @@ -854,9 +915,6 @@ page_fault_retry: /* AS transaction begin */ mutex_lock(&kbdev->mmu_hw_mutex); - /* flush L2 and unlock the VA (resumes the MMU) */ - op = AS_COMMAND_FLUSH_PT; - /* clear MMU interrupt - this needs to be done after updating * the page tables but before issuing a FLUSH command. The * FLUSH cmd has a side effect that it restarts stalled memory @@ -868,9 +926,15 @@ page_fault_retry: kbase_mmu_hw_clear_fault(kbdev, faulting_as, KBASE_MMU_FAULT_TYPE_PAGE); - kbase_mmu_hw_do_operation(kbdev, faulting_as, - fault->addr >> PAGE_SHIFT, - new_pages, op, 1); + /* flush L2 and unlock the VA (resumes the MMU) */ + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = fault->addr >> PAGE_SHIFT, + .nr = new_pages, + .op = KBASE_MMU_OP_FLUSH_PT, + .kctx_id = kctx->id, + .mmu_sync_info = mmu_sync_info, + }; + kbase_mmu_hw_do_operation(kbdev, faulting_as, &op_param); mutex_unlock(&kbdev->mmu_hw_mutex); /* AS transaction end */ @@ -1073,7 +1137,7 @@ static int mmu_get_next_pgd(struct kbase_device *kbdev, return -ENOMEM; } - kbdev->mmu_mode->entry_set_pte(&page[vpfn], target_pgd); + kbdev->mmu_mode->entry_set_pte(page, vpfn, target_pgd); kbase_mmu_sync_pgd(kbdev, kbase_dma_addr(p), PAGE_SIZE); /* Rely on the caller to update the address space flags. 
*/ @@ -1149,6 +1213,8 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, unsigned int left = to_vpfn - vpfn; int level; u64 *page; + register unsigned int num_of_valid_entries; + phys_addr_t pgds[MIDGARD_MMU_BOTTOMLEVEL + 1]; if (count > left) count = left; @@ -1159,6 +1225,7 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, for (level = MIDGARD_MMU_TOPLEVEL; level <= MIDGARD_MMU_BOTTOMLEVEL; level++) { idx = (vpfn >> ((3 - level) * 9)) & 0x1FF; + pgds[level] = pgd; page = kmap(phys_to_page(pgd)); if (mmu_mode->ate_is_valid(page[idx], level)) break; /* keep the mapping */ @@ -1181,15 +1248,33 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, goto next; } + num_of_valid_entries = mmu_mode->get_num_valid_entries(page); + if (WARN_ON_ONCE(num_of_valid_entries < pcount)) + num_of_valid_entries = 0; + else + num_of_valid_entries -= pcount; + + if (!num_of_valid_entries) { + kunmap(phys_to_page(pgd)); + + kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + + kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, + vpfn, level); + vpfn += count; + continue; + } + /* Invalidate the entries we added */ for (i = 0; i < pcount; i++) mmu_mode->entry_invalidate(&page[idx + i]); + mmu_mode->set_num_valid_entries(page, num_of_valid_entries); + kbase_mmu_sync_pgd(kbdev, kbase_dma_addr(phys_to_page(pgd)) + 8 * idx, 8 * pcount); kunmap(phys_to_page(pgd)); - next: vpfn += count; } @@ -1199,8 +1284,9 @@ next: * Map the single page 'phys' 'nr' of times, starting at GPU PFN 'vpfn' */ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, - struct tagged_addr phys, size_t nr, - unsigned long flags, int const group_id) + struct tagged_addr phys, size_t nr, + unsigned long flags, int const group_id, + enum kbase_caller_mmu_sync_info mmu_sync_info) { phys_addr_t pgd; u64 *pgd_page; @@ -1233,12 +1319,13 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, unsigned int index = vpfn & 0x1FF; unsigned int count = KBASE_MMU_PAGE_ENTRIES - index; struct page *p; + register unsigned int num_of_valid_entries; if (count > remain) count = remain; /* - * Repeatedly calling mmu_get_bottom_pte() is clearly + * Repeatedly calling mmu_get_bottom_pgd() is clearly * suboptimal. We don't have to re-parse the whole tree * each time (just cache the l0-l2 sequence). 
* On the other hand, it's only a gain when we map more than @@ -1264,7 +1351,8 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, mutex_lock(&kctx->mmu.mmu_lock); } while (!err); if (err) { - dev_warn(kbdev->dev, "kbase_mmu_insert_pages: mmu_get_bottom_pgd failure\n"); + dev_warn(kbdev->dev, "%s: mmu_get_bottom_pgd failure\n", + __func__); if (recover_required) { /* Invalidate the pages we have partially * completed @@ -1280,7 +1368,7 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, p = pfn_to_page(PFN_DOWN(pgd)); pgd_page = kmap(p); if (!pgd_page) { - dev_warn(kbdev->dev, "kbase_mmu_insert_pages: kmap failure\n"); + dev_warn(kbdev->dev, "%s: kmap failure\n", __func__); if (recover_required) { /* Invalidate the pages we have partially * completed @@ -1294,6 +1382,9 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, goto fail_unlock; } + num_of_valid_entries = + kbdev->mmu_mode->get_num_valid_entries(pgd_page); + for (i = 0; i < count; i++) { unsigned int ofs = index + i; @@ -1304,6 +1395,9 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, phys, flags, MIDGARD_MMU_BOTTOMLEVEL, group_id); } + kbdev->mmu_mode->set_num_valid_entries( + pgd_page, num_of_valid_entries + count); + vpfn += count; remain -= count; @@ -1320,38 +1414,41 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, recover_count += count; } mutex_unlock(&kctx->mmu.mmu_lock); - kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false); + kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false, mmu_sync_info); return 0; fail_unlock: mutex_unlock(&kctx->mmu.mmu_lock); - kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false); + kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false, mmu_sync_info); return err; } -static inline void cleanup_empty_pte(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, u64 *pte) +static void kbase_mmu_free_pgd(struct kbase_device *kbdev, + struct kbase_mmu_table *mmut, phys_addr_t pgd, + bool dirty) { - phys_addr_t tmp_pgd; - struct page *tmp_p; + struct page *p; + + lockdep_assert_held(&mmut->mmu_lock); + + p = pfn_to_page(PFN_DOWN(pgd)); - tmp_pgd = kbdev->mmu_mode->pte_to_phy_addr(*pte); - tmp_p = phys_to_page(tmp_pgd); #ifdef CONFIG_MALI_2MB_ALLOC kbase_mem_pool_free(&kbdev->mem_pools.large[mmut->group_id], #else kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], #endif - tmp_p, false); + p, dirty); + + atomic_sub(1, &kbdev->memdev.used_pages); - /* If the MMU tables belong to a context then we accounted the memory - * usage to that context, so decrement here. + /* If MMU tables belong to a context then pages will have been accounted + * against it, so we must decrement the usage counts here. 
*/ if (mmut->kctx) { kbase_process_page_usage_dec(mmut->kctx, 1); atomic_sub(1, &mmut->kctx->used_pages); } - atomic_sub(1, &kbdev->memdev.used_pages); kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1); } @@ -1399,6 +1496,7 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, unsigned int count = KBASE_MMU_PAGE_ENTRIES - vindex; struct page *p; int cur_level; + register unsigned int num_of_valid_entries; if (count > remain) count = remain; @@ -1463,14 +1561,25 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, goto fail_unlock; } + num_of_valid_entries = + mmu_mode->get_num_valid_entries(pgd_page); + if (cur_level == MIDGARD_MMU_LEVEL(2)) { int level_index = (insert_vpfn >> 9) & 0x1FF; u64 *target = &pgd_page[level_index]; - if (mmu_mode->pte_is_valid(*target, cur_level)) - cleanup_empty_pte(kbdev, mmut, target); + if (mmu_mode->pte_is_valid(*target, cur_level)) { + kbase_mmu_free_pgd( + kbdev, mmut, + kbdev->mmu_mode->pte_to_phy_addr( + *target), + false); + num_of_valid_entries--; + } *target = kbase_mmu_create_ate(kbdev, *phys, flags, cur_level, group_id); + + num_of_valid_entries++; } else { for (i = 0; i < count; i++) { unsigned int ofs = vindex + i; @@ -1488,8 +1597,11 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, *target = kbase_mmu_create_ate(kbdev, phys[i], flags, cur_level, group_id); } + num_of_valid_entries += count; } + mmu_mode->set_num_valid_entries(pgd_page, num_of_valid_entries); + phys += count; insert_vpfn += count; remain -= count; @@ -1513,9 +1625,10 @@ fail_unlock: * number 'as_nr'. */ int kbase_mmu_insert_pages(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, u64 vpfn, - struct tagged_addr *phys, size_t nr, - unsigned long flags, int as_nr, int const group_id) + struct kbase_mmu_table *mmut, u64 vpfn, + struct tagged_addr *phys, size_t nr, + unsigned long flags, int as_nr, int const group_id, + enum kbase_caller_mmu_sync_info mmu_sync_info) { int err; @@ -1523,10 +1636,11 @@ int kbase_mmu_insert_pages(struct kbase_device *kbdev, phys, nr, flags, group_id); if (mmut->kctx) - kbase_mmu_flush_invalidate(mmut->kctx, vpfn, nr, false); + kbase_mmu_flush_invalidate(mmut->kctx, vpfn, nr, false, + mmu_sync_info); else - kbase_mmu_flush_invalidate_no_ctx(kbdev, vpfn, nr, false, - as_nr); + kbase_mmu_flush_invalidate_no_ctx(kbdev, vpfn, nr, false, as_nr, + mmu_sync_info); return err; } @@ -1539,30 +1653,36 @@ KBASE_EXPORT_TEST_API(kbase_mmu_insert_pages); * @kctx: The KBase context. * @vpfn: The virtual page frame number to start the flush on. * @nr: The number of pages to flush. - * @sync: Set if the operation should be synchronous or not. * * As per kbase_mmu_flush_invalidate but doesn't retain the kctx or do any * other locking. */ static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx, - u64 vpfn, size_t nr, bool sync) + u64 vpfn, size_t nr) { struct kbase_device *kbdev = kctx->kbdev; + struct kbase_mmu_hw_op_param op_param; int err; - u32 op; + + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; /* Early out if there is nothing to do */ if (nr == 0) return; - if (sync) - op = AS_COMMAND_FLUSH_MEM; - else - op = AS_COMMAND_FLUSH_PT; - - err = kbase_mmu_hw_do_operation(kbdev, - &kbdev->as[kctx->as_nr], - vpfn, nr, op, 0); + /* flush L2 and unlock the VA (resumes the MMU) */ + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = vpfn, + .nr = nr, + .op = KBASE_MMU_OP_FLUSH_MEM, + .kctx_id = kctx->id, + .mmu_sync_info = mmu_sync_info, + }; + err = kbase_mmu_hw_do_operation(kbdev, &kbdev->as[kctx->as_nr], + &op_param); if (err) { /* Flush failed to complete, assume the * GPU has hung and perform a reset to recover @@ -1576,14 +1696,15 @@ static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx, /* Perform a flush/invalidate on a particular address space */ -static void kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, - struct kbase_as *as, - u64 vpfn, size_t nr, bool sync) +static void +kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, struct kbase_as *as, + u64 vpfn, size_t nr, bool sync, u32 kctx_id, + enum kbase_caller_mmu_sync_info mmu_sync_info) { int err; - u32 op; bool gpu_powered; unsigned long flags; + struct kbase_mmu_hw_op_param op_param; spin_lock_irqsave(&kbdev->hwaccess_lock, flags); gpu_powered = kbdev->pm.backend.gpu_powered; @@ -1611,13 +1732,19 @@ static void kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, /* AS transaction begin */ mutex_lock(&kbdev->mmu_hw_mutex); + op_param = (struct kbase_mmu_hw_op_param){ + .vpfn = vpfn, + .nr = nr, + .kctx_id = kctx_id, + .mmu_sync_info = mmu_sync_info, + }; + if (sync) - op = AS_COMMAND_FLUSH_MEM; + op_param.op = KBASE_MMU_OP_FLUSH_MEM; else - op = AS_COMMAND_FLUSH_PT; + op_param.op = KBASE_MMU_OP_FLUSH_PT; - err = kbase_mmu_hw_do_operation(kbdev, - as, vpfn, nr, op, 0); + err = kbase_mmu_hw_do_operation(kbdev, as, &op_param); if (err) { /* Flush failed to complete, assume the GPU has hung and @@ -1636,18 +1763,23 @@ static void kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, kbase_pm_context_idle(kbdev); } -static void kbase_mmu_flush_invalidate_no_ctx(struct kbase_device *kbdev, - u64 vpfn, size_t nr, bool sync, int as_nr) +static void +kbase_mmu_flush_invalidate_no_ctx(struct kbase_device *kbdev, u64 vpfn, + size_t nr, bool sync, int as_nr, + enum kbase_caller_mmu_sync_info mmu_sync_info) { /* Skip if there is nothing to do */ if (nr) { kbase_mmu_flush_invalidate_as(kbdev, &kbdev->as[as_nr], vpfn, - nr, sync); + nr, sync, 0xFFFFFFFF, + mmu_sync_info); } } -static void kbase_mmu_flush_invalidate(struct kbase_context *kctx, - u64 vpfn, size_t nr, bool sync) +static void +kbase_mmu_flush_invalidate(struct kbase_context *kctx, u64 vpfn, size_t nr, + bool sync, + enum kbase_caller_mmu_sync_info mmu_sync_info) { struct kbase_device *kbdev; bool ctx_is_in_runpool; @@ -1669,7 +1801,8 @@ static void kbase_mmu_flush_invalidate(struct kbase_context *kctx, KBASE_DEBUG_ASSERT(kctx->as_nr != KBASEP_AS_NR_INVALID); kbase_mmu_flush_invalidate_as(kbdev, &kbdev->as[kctx->as_nr], - vpfn, nr, sync); + vpfn, nr, sync, kctx->id, + mmu_sync_info); release_ctx(kbdev, kctx); } @@ -1714,17 +1847,58 @@ void kbase_mmu_disable(struct kbase_context *kctx) * The job scheduler code will already be holding the locks and context * so just do the flush. 
*/ - kbase_mmu_flush_invalidate_noretain(kctx, 0, ~0, true); + kbase_mmu_flush_invalidate_noretain(kctx, 0, ~0); kctx->kbdev->mmu_mode->disable_as(kctx->kbdev, kctx->as_nr); } KBASE_EXPORT_TEST_API(kbase_mmu_disable); +static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, + struct kbase_mmu_table *mmut, + phys_addr_t *pgds, u64 vpfn, + int level) +{ + int current_level; + + lockdep_assert_held(&mmut->mmu_lock); + + for (current_level = level - 1; current_level >= MIDGARD_MMU_LEVEL(0); + current_level--) { + u64 *current_page = kmap(phys_to_page(pgds[current_level])); + unsigned int current_valid_entries = + kbdev->mmu_mode->get_num_valid_entries(current_page); + + if (current_valid_entries == 1 && + current_level != MIDGARD_MMU_LEVEL(0)) { + kunmap(phys_to_page(pgds[current_level])); + + kbase_mmu_free_pgd(kbdev, mmut, pgds[current_level], + true); + } else { + int index = (vpfn >> ((3 - current_level) * 9)) & 0x1FF; + + kbdev->mmu_mode->entry_invalidate(¤t_page[index]); + + current_valid_entries--; + + kbdev->mmu_mode->set_num_valid_entries( + current_page, current_valid_entries); + + kbase_mmu_sync_pgd(kbdev, + kbase_dma_addr(phys_to_page( + pgds[current_level])) + + 8 * index, + 8 * 1); + + kunmap(phys_to_page(pgds[current_level])); + break; + } + } +} + /* - * We actually only discard the ATE, and not the page table - * pages. There is a potential DoS here, as we'll leak memory by - * having PTEs that are potentially unused. Will require physical - * page accounting, so MMU pages are part of the process allocation. + * We actually discard the ATE and free the page table pages if no valid entries + * exist in PGD. * * IMPORTANT: This uses kbasep_js_runpool_release_ctx() when the context is * currently scheduled into the runpool, and so potentially uses a lot of locks. @@ -1741,6 +1915,11 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_mode const *mmu_mode; int err = -EFAULT; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. 
+ */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + if (nr == 0) { /* early out if nothing to do */ return 0; @@ -1757,6 +1936,8 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, unsigned int pcount; int level; u64 *page; + phys_addr_t pgds[MIDGARD_MMU_BOTTOMLEVEL + 1]; + register unsigned int num_of_valid_entries; if (count > nr) count = nr; @@ -1793,6 +1974,7 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, goto next; } next_pgd = mmu_mode->pte_to_phy_addr(page[index]); + pgds[level] = pgd; kunmap(phys_to_page(pgd)); pgd = next_pgd; } @@ -1829,14 +2011,34 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, continue; } + num_of_valid_entries = mmu_mode->get_num_valid_entries(page); + if (WARN_ON_ONCE(num_of_valid_entries < pcount)) + num_of_valid_entries = 0; + else + num_of_valid_entries -= pcount; + + if (!num_of_valid_entries) { + kunmap(phys_to_page(pgd)); + + kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + + kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, + vpfn, level); + + vpfn += count; + nr -= count; + continue; + } + /* Invalidate the entries we added */ for (i = 0; i < pcount; i++) mmu_mode->entry_invalidate(&page[index + i]); - kbase_mmu_sync_pgd(kbdev, - kbase_dma_addr(phys_to_page(pgd)) + - 8 * index, 8*pcount); + mmu_mode->set_num_valid_entries(page, num_of_valid_entries); + kbase_mmu_sync_pgd( + kbdev, kbase_dma_addr(phys_to_page(pgd)) + 8 * index, + 8 * pcount); next: kunmap(phys_to_page(pgd)); vpfn += count; @@ -1848,10 +2050,11 @@ out: if (mmut->kctx) kbase_mmu_flush_invalidate(mmut->kctx, start_vpfn, requested_nr, - true); + true, mmu_sync_info); else - kbase_mmu_flush_invalidate_no_ctx(kbdev, start_vpfn, requested_nr, - true, as_nr); + kbase_mmu_flush_invalidate_no_ctx(kbdev, start_vpfn, + requested_nr, true, as_nr, + mmu_sync_info); return err; } @@ -1903,6 +2106,7 @@ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, unsigned int index = vpfn & 0x1FF; size_t count = KBASE_MMU_PAGE_ENTRIES - index; struct page *p; + register unsigned int num_of_valid_entries; if (count > nr) count = nr; @@ -1940,10 +2144,22 @@ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, goto fail_unlock; } - for (i = 0; i < count; i++) + num_of_valid_entries = + kbdev->mmu_mode->get_num_valid_entries(pgd_page); + + for (i = 0; i < count; i++) { +#ifdef CONFIG_MALI_DEBUG + WARN_ON_ONCE(!kbdev->mmu_mode->ate_is_valid( + pgd_page[index + i], + MIDGARD_MMU_BOTTOMLEVEL)); +#endif pgd_page[index + i] = kbase_mmu_create_ate(kbdev, phys[i], flags, MIDGARD_MMU_BOTTOMLEVEL, group_id); + } + + kbdev->mmu_mode->set_num_valid_entries(pgd_page, + num_of_valid_entries); phys += count; vpfn += count; @@ -1970,9 +2186,14 @@ int kbase_mmu_update_pages(struct kbase_context *kctx, u64 vpfn, { int err; + /* Calls to this function are inherently asynchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + err = kbase_mmu_update_pages_no_flush(kctx, vpfn, phys, nr, flags, group_id); - kbase_mmu_flush_invalidate(kctx, vpfn, nr, true); + kbase_mmu_flush_invalidate(kctx, vpfn, nr, true, mmu_sync_info); return err; } @@ -1981,13 +2202,18 @@ static void mmu_teardown_level(struct kbase_device *kbdev, int level, u64 *pgd_page_buffer) { phys_addr_t target_pgd; - struct page *p; u64 *pgd_page; int i; struct kbase_mmu_mode const *mmu_mode; lockdep_assert_held(&mmut->mmu_lock); + /* Early-out. 
No need to kmap to check entries for L3 PGD. */ + if (level == MIDGARD_MMU_BOTTOMLEVEL) { + kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + return; + } + pgd_page = kmap_atomic(pfn_to_page(PFN_DOWN(pgd))); /* kmap_atomic should NEVER fail. */ if (WARN_ON(pgd_page == NULL)) @@ -2015,25 +2241,7 @@ static void mmu_teardown_level(struct kbase_device *kbdev, } } - p = pfn_to_page(PFN_DOWN(pgd)); -#ifdef CONFIG_MALI_2MB_ALLOC - kbase_mem_pool_free(&kbdev->mem_pools.large[mmut->group_id], -#else - kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], -#endif - p, true); - - atomic_sub(1, &kbdev->memdev.used_pages); - - /* If MMU tables belong to a context then pages will have been accounted - * against it, so we must decrement the usage counts here. - */ - if (mmut->kctx) { - kbase_process_page_usage_dec(mmut->kctx, 1); - atomic_sub(1, &mmut->kctx->used_pages); - } - - kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1); + kbase_mmu_free_pgd(kbdev, mmut, pgd, true); } int kbase_mmu_init(struct kbase_device *const kbdev, @@ -2293,6 +2501,13 @@ void kbase_mmu_bus_fault_worker(struct work_struct *data) } +#if MALI_USE_CSF + /* Before the GPU power off, wait is done for the completion of + * in-flight MMU fault work items. So GPU is expected to remain + * powered up whilst the bus fault handling is being done. + */ + kbase_gpu_report_bus_fault_and_kill(kctx, faulting_as, fault); +#else /* NOTE: If GPU already powered off for suspend, * we don't need to switch to unmapped */ @@ -2301,6 +2516,7 @@ void kbase_mmu_bus_fault_worker(struct work_struct *data) kbase_gpu_report_bus_fault_and_kill(kctx, faulting_as, fault); kbase_pm_context_idle(kbdev); } +#endif release_ctx(kbdev, kctx); diff --git a/mali_kbase/mmu/mali_kbase_mmu.h b/mali_kbase/mmu/mali_kbase_mmu.h index a2d1a8e..45a628c 100644 --- a/mali_kbase/mmu/mali_kbase_mmu.h +++ b/mali_kbase/mmu/mali_kbase_mmu.h @@ -22,6 +22,29 @@ #ifndef _KBASE_MMU_H_ #define _KBASE_MMU_H_ +#include <uapi/gpu/arm/midgard/mali_base_kernel.h> + +#define KBASE_MMU_PAGE_ENTRIES 512 + +struct kbase_context; +struct kbase_mmu_table; + +/** + * MMU-synchronous caller info. A pointer to this type is passed down from the outer-most callers + * in the kbase module - where the information resides as to the synchronous / asynchronous + * nature of the call flow, with respect to MMU operations. i.e. does the call flow relate to + * existing GPU work, or does it come from requests (like ioctl) from user-space, power management, + * etc. + */ +enum kbase_caller_mmu_sync_info { + /* default value must be invalid to avoid accidental choice of a 'valid' value. */ + CALLER_MMU_UNSET_SYNCHRONICITY, + /* Arbitrary value for 'synchronous' that isn't easy to choose by accident. */ + CALLER_MMU_SYNC = 0x02, + /* Also hard to choose by accident */ + CALLER_MMU_ASYNC +}; + /**
 * kbase_mmu_as_init() - Initialising GPU address space object. 
* @@ -111,10 +134,12 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, int kbase_mmu_insert_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, struct tagged_addr *phys, size_t nr, - unsigned long flags, int as_nr, int group_id); + unsigned long flags, int as_nr, int group_id, + enum kbase_caller_mmu_sync_info mmu_sync_info); int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, - struct tagged_addr phys, size_t nr, - unsigned long flags, int group_id); + struct tagged_addr phys, size_t nr, + unsigned long flags, int group_id, + enum kbase_caller_mmu_sync_info mmu_sync_info); int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, @@ -152,4 +177,22 @@ int kbase_mmu_bus_fault_interrupt(struct kbase_device *kbdev, u32 status, void kbase_mmu_gpu_fault_interrupt(struct kbase_device *kbdev, u32 status, u32 as_nr, u64 address, bool as_valid); +/** + * kbase_context_mmu_group_id_get - Decode a memory group ID from + * base_context_create_flags + * + * Memory allocated for GPU page tables will come from the returned group. + * + * @flags: Bitmask of flags to pass to base_context_init. + * + * Return: Physical memory group ID. Valid range is 0..(BASE_MEM_GROUP_COUNT-1). + */ +static inline int +kbase_context_mmu_group_id_get(base_context_create_flags const flags) +{ + KBASE_DEBUG_ASSERT(flags == + (flags & BASEP_CONTEXT_CREATE_ALLOWED_FLAGS)); + return (int)BASE_CONTEXT_MMU_GROUP_ID_GET(flags); +} + #endif /* _KBASE_MMU_H_ */ diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw.h b/mali_kbase/mmu/mali_kbase_mmu_hw.h index d1f1ff2..7c0e95e 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_hw.h +++ b/mali_kbase/mmu/mali_kbase_mmu_hw.h @@ -31,6 +31,8 @@ #ifndef _KBASE_MMU_HW_H_ #define _KBASE_MMU_HW_H_ +#include "mali_kbase_mmu.h" + /* Forward declarations */ struct kbase_device; struct kbase_as; @@ -53,6 +55,42 @@ enum kbase_mmu_fault_type { }; /** + * enum kbase_mmu_cache_flush_type - enum for MMU operations + * @KBASE_MMU_OP_NONE: To help catch uninitialized struct + * @KBASE_MMU_OP_FIRST: The lower boundary of enum + * @KBASE_MMU_OP_LOCK: Lock memory region + * @KBASE_MMU_OP_UNLOCK: Unlock memory region + * @KBASE_MMU_OP_FLUSH_PT: Flush page table (CLN+INV L2 only) + * @KBASE_MMU_OP_FLUSH_MEM: Flush memory (CLN+INV L2+LSC) + * @KBASE_MMU_OP_COUNT: The upper boundary of enum + */ +enum kbase_mmu_op_type { + KBASE_MMU_OP_NONE = 0, /* Must be zero */ + KBASE_MMU_OP_FIRST, /* Must be the first non-zero op */ + KBASE_MMU_OP_LOCK = KBASE_MMU_OP_FIRST, + KBASE_MMU_OP_UNLOCK, + KBASE_MMU_OP_FLUSH_PT, + KBASE_MMU_OP_FLUSH_MEM, + KBASE_MMU_OP_COUNT /* Must be the last in enum */ +}; + +/** + * struct kbase_mmu_hw_op_param - parameters for kbase_mmu_hw_do_operation() + * @vpfn: MMU Virtual Page Frame Number to start the operation on. + * @nr: Number of pages to work on. + * @type: Operation type (written to ASn_COMMAND). + * @kctx_id: Kernel context ID for MMU command tracepoint + * @mmu_sync_info: Indicates whether this call is synchronous wrt MMU ops. + */ +struct kbase_mmu_hw_op_param { + u64 vpfn; + u32 nr; + enum kbase_mmu_op_type op; + u32 kctx_id; + enum kbase_caller_mmu_sync_info mmu_sync_info; +}; + +/** * kbase_mmu_hw_configure - Configure an address space for use. * @kbdev: kbase device to configure. * @as: address space to configure. @@ -67,11 +105,7 @@ void kbase_mmu_hw_configure(struct kbase_device *kbdev, * kbase_mmu_hw_do_operation - Issue an operation to the MMU. 
* @kbdev: kbase device to issue the MMU operation on. * @as: address space to issue the MMU operation on. - * @vpfn: MMU Virtual Page Frame Number to start the operation on. - * @nr: Number of pages to work on. - * @type: Operation type (written to ASn_COMMAND). - * @handling_irq: Is this operation being called during the handling - * of an interrupt? + * @op_param: parameters for the operation. * * Issue an operation (MMU invalidate, MMU flush, etc) on the address space that * is associated with the provided kbase_context over the specified range @@ -79,8 +113,7 @@ void kbase_mmu_hw_configure(struct kbase_device *kbdev, * Return: Zero if the operation was successful, non-zero otherwise. */ int kbase_mmu_hw_do_operation(struct kbase_device *kbdev, struct kbase_as *as, - u64 vpfn, u32 nr, u32 type, - unsigned int handling_irq); + struct kbase_mmu_hw_op_param *op_param); /** * kbase_mmu_hw_clear_fault - Clear a fault that has been previously reported by diff --git a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c index a99b988..6306946 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c +++ b/mali_kbase/mmu/mali_kbase_mmu_hw_direct.c @@ -19,12 +19,13 @@ * */ +#include <device/mali_kbase_device.h> #include <linux/bitops.h> #include <mali_kbase.h> +#include <mali_kbase_ctx_sched.h> #include <mali_kbase_mem.h> #include <mmu/mali_kbase_mmu_hw.h> #include <tl/mali_kbase_tracepoints.h> -#include <device/mali_kbase_device.h> /** * lock_region() - Generate lockaddr to lock memory region in MMU @@ -35,47 +36,87 @@ * The lockaddr value is a combination of the starting address and * the size of the region that encompasses all the memory pages to lock. * - * The size is expressed as a logarithm: it is represented in a way - * that is compatible with the HW specification and it also determines - * how many of the lowest bits of the address are cleared. + * Bits 5:0 are used to represent the size, which must be a power of 2. + * The smallest amount of memory to be locked corresponds to 32 kB, + * i.e. 8 memory pages, because a MMU cache line is made of 64 bytes + * and every page table entry is 8 bytes. Therefore it is not possible + * to lock less than 8 memory pages at a time. + * + * The size is expressed as a logarithm minus one: + * - A value of 14 is thus interpreted as log(32 kB) = 15, where 32 kB + * is the smallest possible size. + * - Likewise, a value of 47 is interpreted as log(256 TB) = 48, where 256 TB + * is the largest possible size (implementation defined value according + * to the HW spec). + * + * Bits 11:6 are reserved. + * + * Bits 63:12 are used to represent the base address of the region to lock. + * Only the upper bits of the address are used; lowest bits are cleared + * to avoid confusion. + * + * The address is aligned to a multiple of the region size. This has profound + * implications on the region size itself: often the MMU will lock a region + * larger than the given number of pages, because the lock region cannot start + * from any arbitrary address. * * Return: 0 if success, or an error code on failure. 
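 *
 * For illustration only (example values chosen here, assuming 4 kB pages,
 * not taken from the HW spec): locking 8 pages starting at PFN 0x1000 gives
 * a base address of 0x1000000 and a region size of 32 kB (log2 = 15), so the
 * encoded size field is 14 and the resulting value is 0x1000000 | 14 = 0x100000E.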
*/ static int lock_region(u64 pfn, u32 num_pages, u64 *lockaddr) { const u64 lockaddr_base = pfn << PAGE_SHIFT; - u64 lockaddr_size_log2, region_frame_number_start, - region_frame_number_end; + const u64 lockaddr_end = ((pfn + num_pages) << PAGE_SHIFT) - 1; + u64 lockaddr_size_log2; if (num_pages == 0) return -EINVAL; - /* The size is expressed as a logarithm and should take into account - * the possibility that some pages might spill into the next region. + /* The MMU lock region is a self-aligned region whose size + * is a power of 2 and that contains both start and end + * of the address range determined by pfn and num_pages. + * The size of the MMU lock region can be defined as the + * largest divisor that yields the same result when both + * start and end addresses are divided by it. + * + * For instance: pfn=0x4F000 num_pages=2 describe the + * address range between 0x4F000 and 0x50FFF. It is only + * 2 memory pages. However there isn't a single lock region + * of 8 kB that encompasses both addresses because 0x4F000 + * would fall into the [0x4E000, 0x4FFFF] region while + * 0x50000 would fall into the [0x50000, 0x51FFF] region. + * The minimum lock region size that includes the entire + * address range is 128 kB, and the region would be + * [0x40000, 0x5FFFF]. + * + * The region size can be found by comparing the desired + * start and end addresses and finding the highest bit + * that differs. The smallest naturally aligned region + * must include this bit change, hence the desired region + * starts with this bit (and subsequent bits) set to 0 + * and ends with the bit (and subsequent bits) set to 1. + * + * In the example above: 0x4F000 ^ 0x50FFF = 0x1FFFF + * therefore the highest bit that differs is bit #16 + * and the region size (as a logarithm) is 16 + 1 = 17, i.e. 128 kB. */ - lockaddr_size_log2 = fls(num_pages) + PAGE_SHIFT - 1; - - /* Round up if the number of pages is not a power of 2. */ - if (num_pages != ((u32)1 << (lockaddr_size_log2 - PAGE_SHIFT))) - lockaddr_size_log2 += 1; - - /* Round up if some memory pages spill into the next region. */ - region_frame_number_start = pfn >> (lockaddr_size_log2 - PAGE_SHIFT); - region_frame_number_end = - (pfn + num_pages - 1) >> (lockaddr_size_log2 - PAGE_SHIFT); - - if (region_frame_number_start < region_frame_number_end) - lockaddr_size_log2 += 1; - - /* Represent the size according to the HW specification. */ - lockaddr_size_log2 = MAX(lockaddr_size_log2, - KBASE_LOCK_REGION_MIN_SIZE_LOG2); + lockaddr_size_log2 = fls(lockaddr_base ^ lockaddr_end); + /* Cap the size against minimum and maximum values allowed. */ if (lockaddr_size_log2 > KBASE_LOCK_REGION_MAX_SIZE_LOG2) return -EINVAL; - /* The lowest bits are cleared and then set to size - 1 to represent - * the size in a way that is compatible with the HW specification. + lockaddr_size_log2 = + MAX(lockaddr_size_log2, KBASE_LOCK_REGION_MIN_SIZE_LOG2); + + /* Represent the result in a way that is compatible with HW spec. + * + * Upper bits are used for the base address, whose lower bits + * are cleared to avoid confusion because they are going to be ignored + * by the MMU anyway, since lock regions shall be aligned with + * a multiple of their size and cannot start from any address. + * + * Lower bits are used for the size, which is represented as + * logarithm minus one of the actual size. 
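 *
 * To complete the illustration from the example above: clearing the lowest
 * 17 bits of 0x4F000 gives a base of 0x40000, and the size is encoded as
 * 17 - 1 = 16, so *lockaddr ends up as 0x40000 | 16 = 0x40010.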
*/ *lockaddr = lockaddr_base & ~((1ull << lockaddr_size_log2) - 1); *lockaddr |= lockaddr_size_log2 - 1; @@ -170,20 +211,30 @@ void kbase_mmu_hw_configure(struct kbase_device *kbdev, struct kbase_as *as) } int kbase_mmu_hw_do_operation(struct kbase_device *kbdev, struct kbase_as *as, - u64 vpfn, u32 nr, u32 op, - unsigned int handling_irq) + struct kbase_mmu_hw_op_param *op_param) { int ret; + u64 lock_addr = 0x0; lockdep_assert_held(&kbdev->mmu_hw_mutex); - if (op == AS_COMMAND_UNLOCK) { + if (op_param->op == KBASE_MMU_OP_UNLOCK) { /* Unlock doesn't require a lock first */ ret = write_cmd(kbdev, as->number, AS_COMMAND_UNLOCK); - } else { - u64 lock_addr; - ret = lock_region(vpfn, nr, &lock_addr); + /* Wait for UNLOCK command to complete */ + ret = wait_ready(kbdev, as->number); + + if (!ret) { + /* read MMU_AS_CONTROL.LOCKADDR register */ + lock_addr |= (u64)kbase_reg_read(kbdev, + MMU_AS_REG(as->number, AS_LOCKADDR_HI)) << 32; + lock_addr |= (u64)kbase_reg_read(kbdev, + MMU_AS_REG(as->number, AS_LOCKADDR_LO)); + } + } else if (op_param->op >= KBASE_MMU_OP_FIRST && + op_param->op < KBASE_MMU_OP_COUNT) { + ret = lock_region(op_param->vpfn, op_param->nr, &lock_addr); if (!ret) { /* Lock the region that needs to be updated */ @@ -195,12 +246,49 @@ int kbase_mmu_hw_do_operation(struct kbase_device *kbdev, struct kbase_as *as, (lock_addr >> 32) & 0xFFFFFFFFUL); write_cmd(kbdev, as->number, AS_COMMAND_LOCK); - /* Run the MMU operation */ - write_cmd(kbdev, as->number, op); - - /* Wait for the flush to complete */ + /* Translate and send operation to HW */ + switch (op_param->op) { + case KBASE_MMU_OP_FLUSH_PT: + write_cmd(kbdev, as->number, + AS_COMMAND_FLUSH_PT); + break; + case KBASE_MMU_OP_FLUSH_MEM: + write_cmd(kbdev, as->number, + AS_COMMAND_FLUSH_MEM); + break; + case KBASE_MMU_OP_LOCK: + /* No further operation. */ + break; + default: + dev_warn(kbdev->dev, + "Unsupported MMU operation (op=%d).\n", + op_param->op); + return -EINVAL; + }; + + /* Wait for the command to complete */ ret = wait_ready(kbdev, as->number); } + } else { + /* Code should not reach here. */ + dev_warn(kbdev->dev, "Invalid mmu operation (op=%d).\n", + op_param->op); + return -EINVAL; + } + + /* MMU command instrumentation */ + if (!ret) { + u64 lock_addr_base = AS_LOCKADDR_LOCKADDR_BASE_GET(lock_addr); + u32 lock_addr_size = AS_LOCKADDR_LOCKADDR_SIZE_GET(lock_addr); + + bool is_mmu_synchronous = false; + + if (op_param->mmu_sync_info == CALLER_MMU_SYNC) + is_mmu_synchronous = true; + + KBASE_TLSTREAM_AUX_MMU_COMMAND(kbdev, op_param->kctx_id, + op_param->op, is_mmu_synchronous, + lock_addr_base, lock_addr_size); } return ret; diff --git a/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c b/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c index 16b928d..6ef4c9d 100644 --- a/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c +++ b/mali_kbase/mmu/mali_kbase_mmu_mode_aarch64.c @@ -42,6 +42,9 @@ #define ENTRY_ACCESS_BIT (1ULL << 10) #define ENTRY_NX_BIT (1ULL << 54) +#define UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR (55) +#define VALID_ENTRY_MASK ((u64)0xF << UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR) + /* Helper Function to perform assignment of page table entries, to * ensure the use of strd, which is required on LPAE systems. 
*/ @@ -85,6 +88,7 @@ static phys_addr_t pte_to_phy_addr(u64 entry) if (!(entry & 1)) return 0; + entry &= ~VALID_ENTRY_MASK; return entry & ~0xFFF; } @@ -151,10 +155,48 @@ static void entry_set_ate(u64 *entry, ENTRY_ACCESS_BIT | ENTRY_IS_ATE_L02); } -static void entry_set_pte(u64 *entry, phys_addr_t phy) +static unsigned int get_num_valid_entries(u64 *pgd) +{ + register unsigned int num_of_valid_entries; + + num_of_valid_entries = + (unsigned int)((pgd[2] & VALID_ENTRY_MASK) >> + (UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR - 8)); + num_of_valid_entries |= + (unsigned int)((pgd[1] & VALID_ENTRY_MASK) >> + (UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR - 4)); + num_of_valid_entries |= + (unsigned int)((pgd[0] & VALID_ENTRY_MASK) >> + (UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR)); + + return num_of_valid_entries; +} + +static void set_num_valid_entries(u64 *pgd, unsigned int num_of_valid_entries) +{ + WARN_ON_ONCE(num_of_valid_entries > KBASE_MMU_PAGE_ENTRIES); + + pgd[0] &= ~VALID_ENTRY_MASK; + pgd[0] |= ((u64)(num_of_valid_entries & 0xF) + << UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR); + + pgd[1] &= ~VALID_ENTRY_MASK; + pgd[1] |= ((u64)((num_of_valid_entries >> 4) & 0xF) + << UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR); + + pgd[2] &= ~VALID_ENTRY_MASK; + pgd[2] |= ((u64)((num_of_valid_entries >> 8) & 0xF) + << UNUSED_BIT_POSITION_IN_PAGE_DESCRIPTOR); +} + +static void entry_set_pte(u64 *pgd, u64 vpfn, phys_addr_t phy) { - page_table_entry_set(entry, (phy & PAGE_MASK) | - ENTRY_ACCESS_BIT | ENTRY_IS_PTE); + unsigned int nr_entries = get_num_valid_entries(pgd); + + page_table_entry_set(&pgd[vpfn], (phy & PAGE_MASK) | ENTRY_ACCESS_BIT | + ENTRY_IS_PTE); + + set_num_valid_entries(pgd, nr_entries + 1); } static void entry_invalidate(u64 *entry) @@ -172,6 +214,8 @@ static struct kbase_mmu_mode const aarch64_mode = { .entry_set_ate = entry_set_ate, .entry_set_pte = entry_set_pte, .entry_invalidate = entry_invalidate, + .get_num_valid_entries = get_num_valid_entries, + .set_num_valid_entries = set_num_valid_entries, .flags = KBASE_MMU_MODE_HAS_NON_CACHEABLE }; diff --git a/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c b/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c index 3b84d74..9ae2c02 100644 --- a/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c +++ b/mali_kbase/platform/devicetree/mali_kbase_runtime_pm.c @@ -77,13 +77,28 @@ static int pm_callback_power_on(struct kbase_device *kbdev) { int ret = 1; /* Assume GPU has been powered off */ int error; + unsigned long flags; - dev_dbg(kbdev->dev, "pm_callback_power_on %p\n", + dev_dbg(kbdev->dev, "%s %p\n", __func__, (void *)kbdev->dev->pm_domain); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(kbdev->pm.backend.gpu_powered); +#if MALI_USE_CSF + if (likely(kbdev->csf.firmware_inited)) { + WARN_ON(!kbdev->pm.active_count); + WARN_ON(kbdev->pm.runtime_active); + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + enable_gpu_power_control(kbdev); + CSTD_UNUSED(error); +#else + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + enable_gpu_power_control(kbdev); error = pm_runtime_get_sync(kbdev->dev); + if (error == 1) { /* * Let core know that the chip has not been @@ -93,22 +108,93 @@ static int pm_callback_power_on(struct kbase_device *kbdev) } dev_dbg(kbdev->dev, "pm_runtime_get_sync returned %d\n", error); +#endif /* MALI_USE_CSF */ return ret; } static void pm_callback_power_off(struct kbase_device *kbdev) { - dev_dbg(kbdev->dev, "pm_callback_power_off\n"); + unsigned long flags; + + dev_dbg(kbdev->dev, "%s\n", __func__); 
+ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(kbdev->pm.backend.gpu_powered); +#if MALI_USE_CSF + if (likely(kbdev->csf.firmware_inited)) { + WARN_ON(kbase_csf_scheduler_get_nr_active_csgs(kbdev)); + WARN_ON(kbdev->pm.backend.mcu_state != KBASE_MCU_OFF); + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + /* Power down the GPU immediately */ + disable_gpu_power_control(kbdev); +#else /* MALI_USE_CSF */ + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +#ifdef KBASE_PM_RUNTIME pm_runtime_mark_last_busy(kbdev->dev); pm_runtime_put_autosuspend(kbdev->dev); - -#ifndef KBASE_PM_RUNTIME +#else + /* Power down the GPU immediately as runtime PM is disabled */ disable_gpu_power_control(kbdev); #endif +#endif /* MALI_USE_CSF */ +} + +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) +static void pm_callback_runtime_gpu_active(struct kbase_device *kbdev) +{ + unsigned long flags; + int error; + + lockdep_assert_held(&kbdev->pm.lock); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(!kbdev->pm.backend.gpu_powered); + WARN_ON(!kbdev->pm.active_count); + WARN_ON(kbdev->pm.runtime_active); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (pm_runtime_status_suspended(kbdev->dev)) { + error = pm_runtime_get_sync(kbdev->dev); + dev_dbg(kbdev->dev, "pm_runtime_get_sync returned %d", error); + } else { + /* Call the async version here, otherwise there could be + * a deadlock if the runtime suspend operation is ongoing. + * Caller would have taken the kbdev->pm.lock and/or the + * scheduler lock, and the runtime suspend callback function + * will also try to acquire the same lock(s). + */ + error = pm_runtime_get(kbdev->dev); + dev_dbg(kbdev->dev, "pm_runtime_get returned %d", error); + } + + kbdev->pm.runtime_active = true; } +static void pm_callback_runtime_gpu_idle(struct kbase_device *kbdev) +{ + unsigned long flags; + + lockdep_assert_held(&kbdev->pm.lock); + + dev_dbg(kbdev->dev, "%s", __func__); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(!kbdev->pm.backend.gpu_powered); + WARN_ON(kbdev->pm.backend.l2_state != KBASE_L2_OFF); + WARN_ON(kbdev->pm.active_count); + WARN_ON(!kbdev->pm.runtime_active); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + pm_runtime_mark_last_busy(kbdev->dev); + pm_runtime_put_autosuspend(kbdev->dev); + kbdev->pm.runtime_active = false; +} +#endif + #ifdef KBASE_PM_RUNTIME static int kbase_device_runtime_init(struct kbase_device *kbdev) { @@ -124,7 +210,12 @@ static int kbase_device_runtime_init(struct kbase_device *kbdev) if (!pm_runtime_enabled(kbdev->dev)) { dev_warn(kbdev->dev, "pm_runtime not enabled"); - ret = -ENOSYS; + ret = -EINVAL; + } else if (atomic_read(&kbdev->dev->power.usage_count)) { + dev_warn(kbdev->dev, + "%s: Device runtime usage count unexpectedly non zero %d", + __func__, atomic_read(&kbdev->dev->power.usage_count)); + ret = -EINVAL; } return ret; @@ -133,9 +224,15 @@ static int kbase_device_runtime_init(struct kbase_device *kbdev) static void kbase_device_runtime_disable(struct kbase_device *kbdev) { dev_dbg(kbdev->dev, "kbase_device_runtime_disable\n"); + + if (atomic_read(&kbdev->dev->power.usage_count)) + dev_warn(kbdev->dev, + "%s: Device runtime usage count unexpectedly non zero %d", + __func__, atomic_read(&kbdev->dev->power.usage_count)); + pm_runtime_disable(kbdev->dev); } -#endif +#endif /* KBASE_PM_RUNTIME */ static int pm_callback_runtime_on(struct kbase_device *kbdev) { @@ -180,6 +277,14 @@ struct kbase_pm_callback_conf pm_callbacks = { 
.power_runtime_on_callback = NULL, .power_runtime_off_callback = NULL, #endif /* KBASE_PM_RUNTIME */ + +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + .power_runtime_gpu_idle_callback = pm_callback_runtime_gpu_idle, + .power_runtime_gpu_active_callback = pm_callback_runtime_gpu_active, +#else + .power_runtime_gpu_idle_callback = NULL, + .power_runtime_gpu_active_callback = NULL, +#endif }; diff --git a/mali_kbase/tests/include/kutf/kutf_helpers.h b/mali_kbase/tests/include/kutf/kutf_helpers.h index c4c713c..79b1eac 100644 --- a/mali_kbase/tests/include/kutf/kutf_helpers.h +++ b/mali_kbase/tests/include/kutf/kutf_helpers.h @@ -81,4 +81,17 @@ int kutf_helper_input_enqueue(struct kutf_context *context, */ void kutf_helper_input_enqueue_end_of_data(struct kutf_context *context); +/* kutf_helper_external_reset_gpu() - Mimic power-on-reset using external reset + * + * Reset GPU using FPGA SYSCTL register. + * + * Note that + * - It must be called on the platform that has FPGA SYSCTL + * register available such as Juno board. + * - It won't reinitialize GPU related settings such as interrupt for kbase. + * + * Return: 0 on success, negative value otherwise. + */ +int kutf_helper_external_reset_gpu(void); + #endif /* _KERNEL_UTF_HELPERS_H_ */ diff --git a/mali_kbase/tests/kutf/kutf_helpers.c b/mali_kbase/tests/kutf/kutf_helpers.c index c075428..d76cebe 100644 --- a/mali_kbase/tests/kutf/kutf_helpers.c +++ b/mali_kbase/tests/kutf/kutf_helpers.c @@ -21,7 +21,6 @@ /* Kernel UTF test helpers */ #include <kutf/kutf_helpers.h> - #include <linux/err.h> #include <linux/jiffies.h> #include <linux/sched.h> @@ -29,6 +28,10 @@ #include <linux/wait.h> #include <linux/uaccess.h> #include <linux/export.h> +#include <linux/io.h> +#include <linux/delay.h> +#include "gpu/mali_kbase_gpu_regmap.h" +#include <device/mali_kbase_device.h> static DEFINE_SPINLOCK(kutf_input_lock); @@ -128,3 +131,44 @@ void kutf_helper_input_enqueue_end_of_data(struct kutf_context *context) { kutf_helper_input_enqueue(context, NULL, 0); } + +/* Values are taken from juno-fpga.dtsi */ +#define FPGA_SYSCTL_START_ADDR ((resource_size_t)0x6f020000) +#define FPGA_SYSCTL_SIZE ((size_t)0xCC) + +/* Offset of FPGA_SYSCTL_GPU_RESET_REG register */ +#define FPGA_SYSCTL_GPU_RESET_REG 0x64 +#define GPU_RESET_HIGH 0x1 +#define GPU_RESET_LOW 0x0 + +int kutf_helper_external_reset_gpu(void) +{ + void __iomem *regs = NULL; + void __iomem *gpu_reset_reg = NULL; + int error = -ENXIO; + int repeat = 100; + + regs = ioremap(FPGA_SYSCTL_START_ADDR, FPGA_SYSCTL_SIZE); + if (!regs) + return -ENOMEM; + + /* Reset GPU via SYSCTL_GPU_RESET by rising & falling the reset signal */ + gpu_reset_reg = regs + FPGA_SYSCTL_GPU_RESET_REG; + while (error && repeat--) { + writel(GPU_RESET_HIGH, gpu_reset_reg); + if (readl(gpu_reset_reg) == GPU_RESET_HIGH) { + mdelay(100); + writel(GPU_RESET_LOW, gpu_reset_reg); + mdelay(100); + + /* Succeed in resetting GPU */ + if (readl(gpu_reset_reg) == GPU_RESET_LOW) + error = 0; + } + } + + iounmap(regs); + + return error; +} +EXPORT_SYMBOL(kutf_helper_external_reset_gpu); diff --git a/mali_kbase/tests/kutf/kutf_suite.c b/mali_kbase/tests/kutf/kutf_suite.c index 6745299..d45d9df 100644 --- a/mali_kbase/tests/kutf/kutf_suite.c +++ b/mali_kbase/tests/kutf/kutf_suite.c @@ -582,7 +582,7 @@ static int create_fixture_variant(struct kutf_test_function *test_func, snprintf(name, sizeof(name), "%d", fixture_index); test_fix->dir = debugfs_create_dir(name, test_func->dir); - if (!test_func->dir) { + if (IS_ERR_OR_NULL(test_func->dir)) { 
pr_err("Failed to create debugfs directory when adding fixture\n"); /* Might not be the right error, we don't get it passed back to us */ err = -EEXIST; @@ -591,7 +591,7 @@ static int create_fixture_variant(struct kutf_test_function *test_func, tmp = debugfs_create_file("type", S_IROTH, test_fix->dir, "fixture\n", &kutf_debugfs_const_string_ops); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"type\" when adding fixture\n"); /* Might not be the right error, we don't get it passed back to us */ err = -EEXIST; @@ -606,7 +606,7 @@ static int create_fixture_variant(struct kutf_test_function *test_func, "run", 0600, test_fix->dir, test_fix, &kutf_debugfs_run_ops); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"run\" when adding fixture\n"); /* Might not be the right error, we don't get it passed back to us */ err = -EEXIST; @@ -666,14 +666,14 @@ void kutf_add_test_with_filters_and_data( INIT_LIST_HEAD(&test_func->variant_list); test_func->dir = debugfs_create_dir(name, suite->dir); - if (!test_func->dir) { + if (IS_ERR_OR_NULL(test_func->dir)) { pr_err("Failed to create debugfs directory when adding test %s\n", name); goto fail_dir; } tmp = debugfs_create_file("type", S_IROTH, test_func->dir, "test\n", &kutf_debugfs_const_string_ops); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"type\" when adding test %s\n", name); goto fail_file; } @@ -686,7 +686,7 @@ void kutf_add_test_with_filters_and_data( tmp = debugfs_create_x32("filters", S_IROTH, test_func->dir, &test_func->filters); #endif - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"filters\" when adding test %s\n", name); goto fail_file; } @@ -698,7 +698,7 @@ void kutf_add_test_with_filters_and_data( #else tmp = debugfs_create_u32("test_id", S_IROTH, test_func->dir, &test_func->test_id); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"test_id\" when adding test %s\n", name); goto fail_file; } @@ -805,14 +805,14 @@ struct kutf_suite *kutf_create_suite_with_filters_and_data( } suite->dir = debugfs_create_dir(name, app->dir); - if (!suite->dir) { + if (IS_ERR_OR_NULL(suite->dir)) { pr_err("Failed to create debugfs directory when adding test %s\n", name); goto fail_debugfs; } tmp = debugfs_create_file("type", S_IROTH, suite->dir, "suite\n", &kutf_debugfs_const_string_ops); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"type\" when adding test %s\n", name); goto fail_file; } @@ -913,14 +913,14 @@ struct kutf_application *kutf_create_application(const char *name) } app->dir = debugfs_create_dir(name, base_dir); - if (!app->dir) { + if (IS_ERR_OR_NULL(app->dir)) { pr_err("Failed to create debugfs direcotry when creating application %s\n", name); goto fail_debugfs; } tmp = debugfs_create_file("type", S_IROTH, app->dir, "application\n", &kutf_debugfs_const_string_ops); - if (!tmp) { + if (IS_ERR_OR_NULL(tmp)) { pr_err("Failed to create debugfs file \"type\" when creating application %s\n", name); goto fail_file; } @@ -1172,7 +1172,7 @@ static int __init init_kutf_core(void) return -ENOMEM; base_dir = debugfs_create_dir("kutf_tests", NULL); - if (!base_dir) { + if (IS_ERR_OR_NULL(base_dir)) { destroy_workqueue(kutf_workq); kutf_workq = NULL; return -ENOMEM; diff --git a/mali_kbase/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c b/mali_kbase/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c index 
5e9a2e7..87bcb31 100644 --- a/mali_kbase/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c +++ b/mali_kbase/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c @@ -530,7 +530,7 @@ static bool kutf_clk_trace_process_portal_cmd(struct kutf_context *context, errmsg = kutf_clk_trace_do_get_platform(context, cmd); break; case PORTAL_CMD_GET_CLK_RATE_MGR: - /* Fall through */ + fallthrough; case PORTAL_CMD_GET_CLK_RATE_TRACE: errmsg = kutf_clk_trace_do_get_rate(context, cmd); break; @@ -538,7 +538,7 @@ static bool kutf_clk_trace_process_portal_cmd(struct kutf_context *context, errmsg = kutf_clk_trace_do_get_snapshot(context, cmd); break; case PORTAL_CMD_INC_PM_CTX_CNT: - /* Fall through */ + fallthrough; case PORTAL_CMD_DEC_PM_CTX_CNT: errmsg = kutf_clk_trace_do_change_pm_ctx(context, cmd); break; diff --git a/mali_kbase/tl/backend/mali_kbase_timeline_csf.c b/mali_kbase/tl/backend/mali_kbase_timeline_csf.c index a2868da..c101563 100644 --- a/mali_kbase/tl/backend/mali_kbase_timeline_csf.c +++ b/mali_kbase/tl/backend/mali_kbase_timeline_csf.c @@ -25,6 +25,8 @@ #include <mali_kbase.h> +#define GPU_FEATURES_CROSS_STREAM_SYNC_MASK (1ull << 3ull) + void kbase_create_timeline_objects(struct kbase_device *kbdev) { unsigned int as_nr; @@ -33,6 +35,15 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev) struct kbase_timeline *timeline = kbdev->timeline; struct kbase_tlstream *summary = &kbdev->timeline->streams[TL_STREAM_TYPE_OBJ_SUMMARY]; + u32 const kbdev_has_cross_stream_sync = + (kbdev->gpu_props.props.raw_props.gpu_features & + GPU_FEATURES_CROSS_STREAM_SYNC_MASK) ? + 1 : + 0; + u32 const arch_maj = (kbdev->gpu_props.props.raw_props.gpu_id & + GPU_ID2_ARCH_MAJOR) >> + GPU_ID2_ARCH_MAJOR_SHIFT; + u32 const num_sb_entries = arch_maj >= 11 ? 16 : 8; /* Summarize the Address Space objects. */ for (as_nr = 0; as_nr < kbdev->nr_hw_address_spaces; as_nr++) @@ -51,10 +62,11 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev) kbdev); /* Trace the creation of a new kbase device and set its properties. */ - __kbase_tlstream_tl_kbase_new_device(summary, - kbdev->gpu_props.props.raw_props.gpu_id, + __kbase_tlstream_tl_kbase_new_device( + summary, kbdev->gpu_props.props.raw_props.gpu_id, kbdev->gpu_props.num_cores, kbdev->csf.global_iface.group_num, - kbdev->nr_hw_address_spaces); + kbdev->nr_hw_address_spaces, num_sb_entries, + kbdev_has_cross_stream_sync); /* Lock the context list, to ensure no changes to the list are made * while we're summarizing the contexts and their contents. @@ -74,9 +86,10 @@ void kbase_create_timeline_objects(struct kbase_device *kbdev) kbdev->csf.scheduler.csg_slots[slot_i].resident_group; if (group) - __kbase_tlstream_tl_kbase_device_program_csg(summary, + __kbase_tlstream_tl_kbase_device_program_csg( + summary, kbdev->gpu_props.props.raw_props.gpu_id, - group->handle, slot_i); + group->kctx->id, group->handle, slot_i); } /* Reset body stream buffers while holding the kctx lock. diff --git a/mali_kbase/tl/mali_kbase_timeline.c b/mali_kbase/tl/mali_kbase_timeline.c index 09818a5..af10cf5 100644 --- a/mali_kbase/tl/mali_kbase_timeline.c +++ b/mali_kbase/tl/mali_kbase_timeline.c @@ -224,13 +224,6 @@ int kbase_timeline_io_acquire(struct kbase_device *kbdev, u32 flags) timeline->obj_header_btc = obj_desc_header_size; timeline->aux_header_btc = aux_desc_header_size; - /* Start autoflush timer. 
*/ - atomic_set(&timeline->autoflush_timer_active, 1); - rcode = mod_timer( - &timeline->autoflush_timer, - jiffies + msecs_to_jiffies(AUTOFLUSH_INTERVAL)); - CSTD_UNUSED(rcode); - #if !MALI_USE_CSF /* If job dumping is enabled, readjust the software event's * timeout as the default value of 3 seconds is often @@ -258,6 +251,16 @@ int kbase_timeline_io_acquire(struct kbase_device *kbdev, u32 flags) kbase_tlstream_current_devfreq_target(kbdev); #endif /* CONFIG_MALI_DEVFREQ */ + /* Start the autoflush timer. + * We must do this after creating timeline objects to ensure we + * don't auto-flush the streams which will be reset during the + * summarization process. + */ + atomic_set(&timeline->autoflush_timer_active, 1); + rcode = mod_timer(&timeline->autoflush_timer, + jiffies + + msecs_to_jiffies(AUTOFLUSH_INTERVAL)); + CSTD_UNUSED(rcode); } else { ret = -EBUSY; } diff --git a/mali_kbase/tl/mali_kbase_tracepoints.c b/mali_kbase/tl/mali_kbase_tracepoints.c index 2c0de01..54e51f8 100644 --- a/mali_kbase/tl/mali_kbase_tracepoints.c +++ b/mali_kbase/tl/mali_kbase_tracepoints.c @@ -74,6 +74,7 @@ enum tl_msg_id_obj { KBASE_TL_KBASE_NEW_DEVICE, KBASE_TL_KBASE_DEVICE_PROGRAM_CSG, KBASE_TL_KBASE_DEVICE_DEPROGRAM_CSG, + KBASE_TL_KBASE_DEVICE_HALT_CSG, KBASE_TL_KBASE_NEW_CTX, KBASE_TL_KBASE_DEL_CTX, KBASE_TL_KBASE_CTX_ASSIGN_AS, @@ -121,6 +122,17 @@ enum tl_msg_id_obj { KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_GROUP_SUSPEND_END, KBASE_TL_KBASE_CSFFW_TLSTREAM_OVERFLOW, KBASE_TL_KBASE_CSFFW_RESET, + KBASE_TL_JS_SCHED_START, + KBASE_TL_JS_SCHED_END, + KBASE_TL_JD_SUBMIT_ATOM_START, + KBASE_TL_JD_SUBMIT_ATOM_END, + KBASE_TL_JD_DONE_NO_LOCK_START, + KBASE_TL_JD_DONE_NO_LOCK_END, + KBASE_TL_JD_DONE_START, + KBASE_TL_JD_DONE_END, + KBASE_TL_JD_ATOM_COMPLETE, + KBASE_TL_RUN_ATOM_START, + KBASE_TL_RUN_ATOM_END, KBASE_OBJ_MSG_COUNT, }; @@ -137,6 +149,7 @@ enum tl_msg_id_aux { KBASE_AUX_JIT_STATS, KBASE_AUX_TILER_HEAP_STATS, KBASE_AUX_EVENT_JOB_SLOT, + KBASE_AUX_MMU_COMMAND, KBASE_AUX_MSG_COUNT, }; @@ -299,16 +312,20 @@ enum tl_msg_id_aux { "gpu") \ TRACEPOINT_DESC(KBASE_TL_KBASE_NEW_DEVICE, \ "New KBase Device", \ - "@IIII", \ - "kbase_device_id,kbase_device_gpu_core_count,kbase_device_max_num_csgs,kbase_device_as_count") \ + "@IIIIII", \ + "kbase_device_id,kbase_device_gpu_core_count,kbase_device_max_num_csgs,kbase_device_as_count,kbase_device_sb_entry_count,kbase_device_has_cross_stream_sync") \ TRACEPOINT_DESC(KBASE_TL_KBASE_DEVICE_PROGRAM_CSG, \ "CSG is programmed to a slot", \ - "@III", \ - "kbase_device_id,gpu_cmdq_grp_handle,kbase_device_csg_slot_index") \ + "@IIII", \ + "kbase_device_id,kernel_ctx_id,gpu_cmdq_grp_handle,kbase_device_csg_slot_index") \ TRACEPOINT_DESC(KBASE_TL_KBASE_DEVICE_DEPROGRAM_CSG, \ "CSG is deprogrammed from a slot", \ "@II", \ "kbase_device_id,kbase_device_csg_slot_index") \ + TRACEPOINT_DESC(KBASE_TL_KBASE_DEVICE_HALT_CSG, \ + "CSG is halted", \ + "@II", \ + "kbase_device_id,kbase_device_csg_slot_index") \ TRACEPOINT_DESC(KBASE_TL_KBASE_NEW_CTX, \ "New KBase Context", \ "@II", \ @@ -497,6 +514,50 @@ enum tl_msg_id_aux { "A reset has happened with the CSFFW", \ "@L", \ "csffw_cycle") \ + TRACEPOINT_DESC(KBASE_TL_JS_SCHED_START, \ + "Scheduling starts", \ + "@I", \ + "dummy") \ + TRACEPOINT_DESC(KBASE_TL_JS_SCHED_END, \ + "Scheduling ends", \ + "@I", \ + "dummy") \ + TRACEPOINT_DESC(KBASE_TL_JD_SUBMIT_ATOM_START, \ + "Submitting an atom starts", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_JD_SUBMIT_ATOM_END, \ + "Submitting an atom ends", \ + "@p", \ + "atom") \ + 
TRACEPOINT_DESC(KBASE_TL_JD_DONE_NO_LOCK_START, \ + "Within function jd_done_nolock", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_JD_DONE_NO_LOCK_END, \ + "Within function jd_done_nolock - end", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_JD_DONE_START, \ + "Start of kbase_jd_done", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_JD_DONE_END, \ + "End of kbase_jd_done", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_JD_ATOM_COMPLETE, \ + "Atom marked complete", \ + "@p", \ + "atom") \ + TRACEPOINT_DESC(KBASE_TL_RUN_ATOM_START, \ + "Running of atom starts", \ + "@pI", \ + "atom,atom_nr") \ + TRACEPOINT_DESC(KBASE_TL_RUN_ATOM_END, \ + "Running of atom ends", \ + "@pI", \ + "atom,atom_nr") \ #define MIPE_HEADER_BLOB_VAR_NAME __obj_desc_header #define MIPE_HEADER_STREAM_ID TL_STREAM_ID_KERNEL @@ -554,6 +615,10 @@ const size_t obj_desc_header_size = sizeof(__obj_desc_header); "event on a given job slot", \ "@pIII", \ "ctx,slot_nr,atom_nr,event") \ + TRACEPOINT_DESC(KBASE_AUX_MMU_COMMAND, \ + "mmu commands with synchronicity info", \ + "@IIILI", \ + "kernel_ctx_id,mmu_cmd_id,mmu_synchronicity,mmu_lock_addr,mmu_lock_page_num") \ #define MIPE_HEADER_BLOB_VAR_NAME __aux_desc_header #define MIPE_HEADER_STREAM_ID TL_STREAM_ID_KERNEL @@ -1936,12 +2001,52 @@ void __kbase_tlstream_aux_event_job_slot( kbase_tlstream_msgbuf_release(stream, acq_flags); } +void __kbase_tlstream_aux_mmu_command( + struct kbase_tlstream *stream, + u32 kernel_ctx_id, + u32 mmu_cmd_id, + u32 mmu_synchronicity, + u64 mmu_lock_addr, + u32 mmu_lock_page_num) +{ + const u32 msg_id = KBASE_AUX_MMU_COMMAND; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(kernel_ctx_id) + + sizeof(mmu_cmd_id) + + sizeof(mmu_synchronicity) + + sizeof(mmu_lock_addr) + + sizeof(mmu_lock_page_num) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &kernel_ctx_id, sizeof(kernel_ctx_id)); + pos = kbasep_serialize_bytes(buffer, + pos, &mmu_cmd_id, sizeof(mmu_cmd_id)); + pos = kbasep_serialize_bytes(buffer, + pos, &mmu_synchronicity, sizeof(mmu_synchronicity)); + pos = kbasep_serialize_bytes(buffer, + pos, &mmu_lock_addr, sizeof(mmu_lock_addr)); + pos = kbasep_serialize_bytes(buffer, + pos, &mmu_lock_page_num, sizeof(mmu_lock_page_num)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + void __kbase_tlstream_tl_kbase_new_device( struct kbase_tlstream *stream, u32 kbase_device_id, u32 kbase_device_gpu_core_count, u32 kbase_device_max_num_csgs, - u32 kbase_device_as_count) + u32 kbase_device_as_count, + u32 kbase_device_sb_entry_count, + u32 kbase_device_has_cross_stream_sync) { const u32 msg_id = KBASE_TL_KBASE_NEW_DEVICE; const size_t msg_size = sizeof(msg_id) + sizeof(u64) @@ -1949,6 +2054,8 @@ void __kbase_tlstream_tl_kbase_new_device( + sizeof(kbase_device_gpu_core_count) + sizeof(kbase_device_max_num_csgs) + sizeof(kbase_device_as_count) + + sizeof(kbase_device_sb_entry_count) + + sizeof(kbase_device_has_cross_stream_sync) ; char *buffer; unsigned long acq_flags; @@ -1966,6 +2073,10 @@ void __kbase_tlstream_tl_kbase_new_device( pos, &kbase_device_max_num_csgs, sizeof(kbase_device_max_num_csgs)); pos = kbasep_serialize_bytes(buffer, pos, &kbase_device_as_count, sizeof(kbase_device_as_count)); + pos = kbasep_serialize_bytes(buffer, + pos, 
&kbase_device_sb_entry_count, sizeof(kbase_device_sb_entry_count)); + pos = kbasep_serialize_bytes(buffer, + pos, &kbase_device_has_cross_stream_sync, sizeof(kbase_device_has_cross_stream_sync)); kbase_tlstream_msgbuf_release(stream, acq_flags); } @@ -1973,12 +2084,14 @@ void __kbase_tlstream_tl_kbase_new_device( void __kbase_tlstream_tl_kbase_device_program_csg( struct kbase_tlstream *stream, u32 kbase_device_id, + u32 kernel_ctx_id, u32 gpu_cmdq_grp_handle, u32 kbase_device_csg_slot_index) { const u32 msg_id = KBASE_TL_KBASE_DEVICE_PROGRAM_CSG; const size_t msg_size = sizeof(msg_id) + sizeof(u64) + sizeof(kbase_device_id) + + sizeof(kernel_ctx_id) + sizeof(gpu_cmdq_grp_handle) + sizeof(kbase_device_csg_slot_index) ; @@ -1993,6 +2106,8 @@ void __kbase_tlstream_tl_kbase_device_program_csg( pos = kbasep_serialize_bytes(buffer, pos, &kbase_device_id, sizeof(kbase_device_id)); pos = kbasep_serialize_bytes(buffer, + pos, &kernel_ctx_id, sizeof(kernel_ctx_id)); + pos = kbasep_serialize_bytes(buffer, pos, &gpu_cmdq_grp_handle, sizeof(gpu_cmdq_grp_handle)); pos = kbasep_serialize_bytes(buffer, pos, &kbase_device_csg_slot_index, sizeof(kbase_device_csg_slot_index)); @@ -2026,6 +2141,32 @@ void __kbase_tlstream_tl_kbase_device_deprogram_csg( kbase_tlstream_msgbuf_release(stream, acq_flags); } +void __kbase_tlstream_tl_kbase_device_halt_csg( + struct kbase_tlstream *stream, + u32 kbase_device_id, + u32 kbase_device_csg_slot_index) +{ + const u32 msg_id = KBASE_TL_KBASE_DEVICE_HALT_CSG; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(kbase_device_id) + + sizeof(kbase_device_csg_slot_index) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &kbase_device_id, sizeof(kbase_device_id)); + pos = kbasep_serialize_bytes(buffer, + pos, &kbase_device_csg_slot_index, sizeof(kbase_device_csg_slot_index)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + void __kbase_tlstream_tl_kbase_new_ctx( struct kbase_tlstream *stream, u32 kernel_ctx_id, @@ -3216,4 +3357,254 @@ void __kbase_tlstream_tl_kbase_csffw_reset( kbase_tlstream_msgbuf_release(stream, acq_flags); } +void __kbase_tlstream_tl_js_sched_start( + struct kbase_tlstream *stream, + u32 dummy) +{ + const u32 msg_id = KBASE_TL_JS_SCHED_START; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(dummy) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &dummy, sizeof(dummy)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_js_sched_end( + struct kbase_tlstream *stream, + u32 dummy) +{ + const u32 msg_id = KBASE_TL_JS_SCHED_END; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(dummy) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &dummy, sizeof(dummy)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + 
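+/* Each writer below follows the same pattern: acquire a message buffer
+ * from the stream, serialize the 32-bit message ID, a 64-bit timestamp
+ * and then each argument in parameter order, and finally release the
+ * buffer so the packet becomes visible to timeline readers.
+ */
+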
+void __kbase_tlstream_tl_jd_submit_atom_start( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_SUBMIT_ATOM_START; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_submit_atom_end( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_SUBMIT_ATOM_END; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_done_no_lock_start( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_DONE_NO_LOCK_START; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_done_no_lock_end( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_DONE_NO_LOCK_END; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_done_start( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_DONE_START; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_done_end( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_DONE_END; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = 
kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_jd_atom_complete( + struct kbase_tlstream *stream, + const void *atom) +{ + const u32 msg_id = KBASE_TL_JD_ATOM_COMPLETE; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_run_atom_start( + struct kbase_tlstream *stream, + const void *atom, + u32 atom_nr) +{ + const u32 msg_id = KBASE_TL_RUN_ATOM_START; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + + sizeof(atom_nr) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + pos = kbasep_serialize_bytes(buffer, + pos, &atom_nr, sizeof(atom_nr)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_run_atom_end( + struct kbase_tlstream *stream, + const void *atom, + u32 atom_nr) +{ + const u32 msg_id = KBASE_TL_RUN_ATOM_END; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(atom) + + sizeof(atom_nr) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &atom, sizeof(atom)); + pos = kbasep_serialize_bytes(buffer, + pos, &atom_nr, sizeof(atom_nr)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + /* clang-format on */ diff --git a/mali_kbase/tl/mali_kbase_tracepoints.h b/mali_kbase/tl/mali_kbase_tracepoints.h index 887a1aa..3fc871c 100644 --- a/mali_kbase/tl/mali_kbase_tracepoints.h +++ b/mali_kbase/tl/mali_kbase_tracepoints.h @@ -296,21 +296,35 @@ void __kbase_tlstream_aux_event_job_slot( u32 slot_nr, u32 atom_nr, u32 event); +void __kbase_tlstream_aux_mmu_command( + struct kbase_tlstream *stream, + u32 kernel_ctx_id, + u32 mmu_cmd_id, + u32 mmu_synchronicity, + u64 mmu_lock_addr, + u32 mmu_lock_page_num); void __kbase_tlstream_tl_kbase_new_device( struct kbase_tlstream *stream, u32 kbase_device_id, u32 kbase_device_gpu_core_count, u32 kbase_device_max_num_csgs, - u32 kbase_device_as_count); + u32 kbase_device_as_count, + u32 kbase_device_sb_entry_count, + u32 kbase_device_has_cross_stream_sync); void __kbase_tlstream_tl_kbase_device_program_csg( struct kbase_tlstream *stream, u32 kbase_device_id, + u32 kernel_ctx_id, u32 gpu_cmdq_grp_handle, u32 kbase_device_csg_slot_index); void __kbase_tlstream_tl_kbase_device_deprogram_csg( struct kbase_tlstream *stream, u32 kbase_device_id, u32 kbase_device_csg_slot_index); +void __kbase_tlstream_tl_kbase_device_halt_csg( + struct kbase_tlstream *stream, + u32 kbase_device_id, + u32 kbase_device_csg_slot_index); void __kbase_tlstream_tl_kbase_new_ctx( struct kbase_tlstream *stream, u32 kernel_ctx_id, @@ -491,6 +505,41 @@ void 
__kbase_tlstream_tl_kbase_csffw_tlstream_overflow( void __kbase_tlstream_tl_kbase_csffw_reset( struct kbase_tlstream *stream, u64 csffw_cycle); +void __kbase_tlstream_tl_js_sched_start( + struct kbase_tlstream *stream, + u32 dummy); +void __kbase_tlstream_tl_js_sched_end( + struct kbase_tlstream *stream, + u32 dummy); +void __kbase_tlstream_tl_jd_submit_atom_start( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_submit_atom_end( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_done_no_lock_start( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_done_no_lock_end( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_done_start( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_done_end( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_jd_atom_complete( + struct kbase_tlstream *stream, + const void *atom); +void __kbase_tlstream_tl_run_atom_start( + struct kbase_tlstream *stream, + const void *atom, + u32 atom_nr); +void __kbase_tlstream_tl_run_atom_end( + struct kbase_tlstream *stream, + const void *atom, + u32 atom_nr); struct kbase_tlstream; @@ -1593,14 +1642,48 @@ struct kbase_tlstream; } while (0) /** + * KBASE_TLSTREAM_AUX_MMU_COMMAND - + * mmu commands with synchronicity info + * + * @kbdev: Kbase device + * @kernel_ctx_id: Unique ID for the KBase Context + * @mmu_cmd_id: MMU Command ID (e.g AS_COMMAND_UPDATE) + * @mmu_synchronicity: Indicates whether the command is related to current running job + * that needs to be resolved to make it progress (synchronous, e.g. + * grow on page fault, JIT) or not (asynchronous, e.g. IOCTL calls + * from user-space). This param will be 0 if it is an asynchronous + * operation. 
+ * @mmu_lock_addr: start address of regions to be locked/unlocked/invalidated + * @mmu_lock_page_num: number of pages to be locked/unlocked/invalidated + */ +#define KBASE_TLSTREAM_AUX_MMU_COMMAND( \ + kbdev, \ + kernel_ctx_id, \ + mmu_cmd_id, \ + mmu_synchronicity, \ + mmu_lock_addr, \ + mmu_lock_page_num \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_aux_mmu_command( \ + __TL_DISPATCH_STREAM(kbdev, aux), \ + kernel_ctx_id, mmu_cmd_id, mmu_synchronicity, mmu_lock_addr, mmu_lock_page_num); \ + } while (0) + +/** * KBASE_TLSTREAM_TL_KBASE_NEW_DEVICE - * New KBase Device * * @kbdev: Kbase device - * @kbase_device_id: The id of the physical hardware + * @kbase_device_id: The ID of the physical hardware * @kbase_device_gpu_core_count: The number of gpu cores in the physical hardware * @kbase_device_max_num_csgs: The max number of CSGs the physical hardware supports * @kbase_device_as_count: The number of address spaces the physical hardware has available + * @kbase_device_sb_entry_count: The number of entries each scoreboard set in the + * physical hardware has available + * @kbase_device_has_cross_stream_sync: Whether cross-stream synchronization is supported */ #if MALI_USE_CSF #define KBASE_TLSTREAM_TL_KBASE_NEW_DEVICE( \ @@ -1608,14 +1691,16 @@ struct kbase_tlstream; kbase_device_id, \ kbase_device_gpu_core_count, \ kbase_device_max_num_csgs, \ - kbase_device_as_count \ + kbase_device_as_count, \ + kbase_device_sb_entry_count, \ + kbase_device_has_cross_stream_sync \ ) \ do { \ int enabled = atomic_read(&kbdev->timeline_flags); \ if (enabled & BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS) \ __kbase_tlstream_tl_kbase_new_device( \ __TL_DISPATCH_STREAM(kbdev, obj), \ - kbase_device_id, kbase_device_gpu_core_count, kbase_device_max_num_csgs, kbase_device_as_count); \ + kbase_device_id, kbase_device_gpu_core_count, kbase_device_max_num_csgs, kbase_device_as_count, kbase_device_sb_entry_count, kbase_device_has_cross_stream_sync); \ } while (0) #else #define KBASE_TLSTREAM_TL_KBASE_NEW_DEVICE( \ @@ -1623,7 +1708,9 @@ struct kbase_tlstream; kbase_device_id, \ kbase_device_gpu_core_count, \ kbase_device_max_num_csgs, \ - kbase_device_as_count \ + kbase_device_as_count, \ + kbase_device_sb_entry_count, \ + kbase_device_has_cross_stream_sync \ ) \ do { } while (0) #endif /* MALI_USE_CSF */ @@ -1633,7 +1720,8 @@ struct kbase_tlstream; * CSG is programmed to a slot * * @kbdev: Kbase device - * @kbase_device_id: The id of the physical hardware + * @kbase_device_id: The ID of the physical hardware + * @kernel_ctx_id: Unique ID for the KBase Context * @gpu_cmdq_grp_handle: GPU Command Queue Group handle which will match userspace * @kbase_device_csg_slot_index: The index of the slot in the scheduler being programmed */ @@ -1641,6 +1729,7 @@ struct kbase_tlstream; #define KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG( \ kbdev, \ kbase_device_id, \ + kernel_ctx_id, \ gpu_cmdq_grp_handle, \ kbase_device_csg_slot_index \ ) \ @@ -1649,12 +1738,13 @@ struct kbase_tlstream; if (enabled & BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS) \ __kbase_tlstream_tl_kbase_device_program_csg( \ __TL_DISPATCH_STREAM(kbdev, obj), \ - kbase_device_id, gpu_cmdq_grp_handle, kbase_device_csg_slot_index); \ + kbase_device_id, kernel_ctx_id, gpu_cmdq_grp_handle, kbase_device_csg_slot_index); \ } while (0) #else #define KBASE_TLSTREAM_TL_KBASE_DEVICE_PROGRAM_CSG( \ kbdev, \ kbase_device_id, \ + kernel_ctx_id, \ gpu_cmdq_grp_handle, \ kbase_device_csg_slot_index \ ) \ @@ 
-1666,7 +1756,7 @@ struct kbase_tlstream; * CSG is deprogrammed from a slot * * @kbdev: Kbase device - * @kbase_device_id: The id of the physical hardware + * @kbase_device_id: The ID of the physical hardware * @kbase_device_csg_slot_index: The index of the slot in the scheduler being programmed */ #if MALI_USE_CSF @@ -1692,12 +1782,33 @@ struct kbase_tlstream; #endif /* MALI_USE_CSF */ /** + * KBASE_TLSTREAM_TL_KBASE_DEVICE_HALT_CSG - + * CSG is halted + * + * @kbdev: Kbase device + * @kbase_device_id: The ID of the physical hardware + * @kbase_device_csg_slot_index: The index of the slot in the scheduler being programmed + */ +#define KBASE_TLSTREAM_TL_KBASE_DEVICE_HALT_CSG( \ + kbdev, \ + kbase_device_id, \ + kbase_device_csg_slot_index \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_kbase_device_halt_csg( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + kbase_device_id, kbase_device_csg_slot_index); \ + } while (0) + +/** * KBASE_TLSTREAM_TL_KBASE_NEW_CTX - * New KBase Context * * @kbdev: Kbase device * @kernel_ctx_id: Unique ID for the KBase Context - * @kbase_device_id: The id of the physical hardware + * @kbase_device_id: The ID of the physical hardware */ #if MALI_USE_CSF #define KBASE_TLSTREAM_TL_KBASE_NEW_CTX( \ @@ -1935,7 +2046,7 @@ struct kbase_tlstream; * @cqs_obj_gpu_addr: CQS Object GPU pointer * @cqs_obj_compare_value: Semaphore value that should be exceeded * for the WAIT to pass - * @cqs_obj_inherit_error: Indicates the error state should be inherited into the queue or not + * @cqs_obj_inherit_error: Flag which indicates if the CQS object error state should be inherited by the queue */ #if MALI_USE_CSF #define KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_CQS_WAIT( \ @@ -3091,6 +3202,219 @@ struct kbase_tlstream; do { } while (0) #endif /* MALI_USE_CSF */ +/** + * KBASE_TLSTREAM_TL_JS_SCHED_START - + * Scheduling starts + * + * @kbdev: Kbase device + * @dummy: dummy argument + */ +#define KBASE_TLSTREAM_TL_JS_SCHED_START( \ + kbdev, \ + dummy \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_js_sched_start( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + dummy); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JS_SCHED_END - + * Scheduling ends + * + * @kbdev: Kbase device + * @dummy: dummy argument + */ +#define KBASE_TLSTREAM_TL_JS_SCHED_END( \ + kbdev, \ + dummy \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_js_sched_end( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + dummy); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_START - + * Submitting an atom starts + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_START( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_submit_atom_start( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_END - + * Submitting an atom ends + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_SUBMIT_ATOM_END( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_submit_atom_end( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * 
KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_START - + * Within function jd_done_nolock + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_START( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_done_no_lock_start( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_END - + * Within function jd_done_nolock - end + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_DONE_NO_LOCK_END( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_done_no_lock_end( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_DONE_START - + * Start of kbase_jd_done + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_DONE_START( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_done_start( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_DONE_END - + * End of kbase_jd_done + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_DONE_END( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_done_end( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_JD_ATOM_COMPLETE - + * Atom marked complete + * + * @kbdev: Kbase device + * @atom: Atom identifier + */ +#define KBASE_TLSTREAM_TL_JD_ATOM_COMPLETE( \ + kbdev, \ + atom \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_jd_atom_complete( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_RUN_ATOM_START - + * Running of atom starts + * + * @kbdev: Kbase device + * @atom: Atom identifier + * @atom_nr: Sequential number of an atom + */ +#define KBASE_TLSTREAM_TL_RUN_ATOM_START( \ + kbdev, \ + atom, \ + atom_nr \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_run_atom_start( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom, atom_nr); \ + } while (0) + +/** + * KBASE_TLSTREAM_TL_RUN_ATOM_END - + * Running of atom ends + * + * @kbdev: Kbase device + * @atom: Atom identifier + * @atom_nr: Sequential number of an atom + */ +#define KBASE_TLSTREAM_TL_RUN_ATOM_END( \ + kbdev, \ + atom, \ + atom_nr \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & TLSTREAM_ENABLED) \ + __kbase_tlstream_tl_run_atom_end( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + atom, atom_nr); \ + } while (0) + /* Gator tracepoints are hooked into TLSTREAM interface. * When the following tracepoints are called, corresponding |
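For reference, a minimal sketch (not part of the patch above) of how the new run-atom tracepoints could be emitted around the execution of an atom. The helper name run_atom_traced, the atom_nr value and the assumption that mali_kbase.h makes the KBASE_TLSTREAM_* macros visible are illustrative; the actual call sites inside kbase may differ.

    #include <mali_kbase.h>  /* assumed to pull in the KBASE_TLSTREAM_* macros */

    /* Illustrative only: bracket the run of one atom with the start/end
     * tracepoints added by this patch.
     */
    static void run_atom_traced(struct kbase_device *kbdev,
                                struct kbase_jd_atom *katom, u32 atom_nr)
    {
            KBASE_TLSTREAM_TL_RUN_ATOM_START(kbdev, katom, atom_nr);

            /* ... submit the atom to hardware and wait for completion ... */

            KBASE_TLSTREAM_TL_RUN_ATOM_END(kbdev, katom, atom_nr);
    }

At runtime the event is only written when timeline capture is enabled (TLSTREAM_ENABLED set in kbdev->timeline_flags), so the call sites remain cheap when no timeline client is attached.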