// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
/*
 *
 * (C) COPYRIGHT 2019-2023 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
 * Foundation, and any use by you of this program is subject to the terms
 * of such GNU license.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 */

/*
 * Implementation of the dummy job execution workaround for the GPU hang issue.
 */

#include <mali_kbase.h>
#include <device/mali_kbase_device.h>
#include <mali_kbase_dummy_job_wa.h>

#include <linux/firmware.h>
#include <linux/delay.h>

#define DUMMY_JOB_WA_BINARY_NAME "valhall-1691526.wa"

struct wa_header {
	u16 signature;
	u16 version;
	u32 info_offset;
} __packed;

struct wa_v2_info {
	u64 jc;
	u32 js;
	u32 blob_offset;
	u64 flags;
} __packed;

struct wa_blob {
	u64 base;
	u32 size;
	u32 map_flags;
	u32 payload_offset;
	u32 blob_offset;
} __packed;

static bool within_range(const u8 *base, const u8 *end, off_t off, size_t sz)
{
	return !((size_t)(end - base - off) < sz);
}

static u32 wait_any(struct kbase_device *kbdev, off_t offset, u32 bits)
{
	int loop;
	const int timeout = 100;
	u32 val;

	for (loop = 0; loop < timeout; loop++) {
		val = kbase_reg_read32(kbdev, offset);
		if (val & bits)
			break;
		udelay(10);
	}

	if (loop == timeout) {
		dev_err(kbdev->dev,
			"Timeout reading register 0x%lx, bits 0x%lx, last read was 0x%lx\n",
			(unsigned long)offset, (unsigned long)bits, (unsigned long)val);
	}

	return (val & bits);
}

static inline int run_job(struct kbase_device *kbdev, int as, u32 slot, u64 cores, u64 jc)
{
	u32 done;

	/* setup job */
	kbase_reg_write64(kbdev, JOB_SLOT_OFFSET(slot, HEAD_NEXT), jc);
	kbase_reg_write64(kbdev, JOB_SLOT_OFFSET(slot, AFFINITY_NEXT), cores);
	kbase_reg_write32(kbdev, JOB_SLOT_OFFSET(slot, CONFIG_NEXT),
			  JS_CONFIG_DISABLE_DESCRIPTOR_WR_BK | (unsigned int)as);

	/* go */
	kbase_reg_write32(kbdev, JOB_SLOT_OFFSET(slot, COMMAND_NEXT), JS_COMMAND_START);

	/* wait for the slot to finish (done, error) */
	done = wait_any(kbdev, JOB_CONTROL_ENUM(JOB_IRQ_RAWSTAT),
			(1ul << (16 + slot)) | (1ul << slot));
	kbase_reg_write32(kbdev, JOB_CONTROL_ENUM(JOB_IRQ_CLEAR), done);

	if (done != (1ul << slot)) {
		dev_err(kbdev->dev, "Failed to run WA job on slot %u cores 0x%llx: done 0x%lx\n",
			slot, (unsigned long long)cores, (unsigned long)done);
		dev_err(kbdev->dev, "JS_STATUS on failure: 0x%x\n",
			kbase_reg_read32(kbdev, JOB_SLOT_OFFSET(slot, STATUS)));

		return -EFAULT;
	} else {
		return 0;
	}
}

/* To be called after power up & MMU init, but before everything else */
int kbase_dummy_job_wa_execute(struct kbase_device *kbdev, u64 cores)
{
	int as;
	u32 slot;
	u64 jc;
	int failed = 0;
	int runs = 0;
	u32 old_gpu_mask;
	u32 old_job_mask;
	u64 val;
	const u32 timeout_us = 10000;

	if (!kbdev)
		return -EFAULT;

	if (!kbdev->dummy_job_wa.kctx)
		return -EFAULT;

	as = kbdev->dummy_job_wa.kctx->as_nr;
	slot = kbdev->dummy_job_wa.slot;
	jc = kbdev->dummy_job_wa.jc;

	/* mask off all but MMU IRQs */
	old_gpu_mask = kbase_reg_read32(kbdev, GPU_CONTROL_ENUM(GPU_IRQ_MASK));
	old_job_mask = kbase_reg_read32(kbdev, JOB_CONTROL_ENUM(JOB_IRQ_MASK));
	kbase_reg_write32(kbdev, GPU_CONTROL_ENUM(GPU_IRQ_MASK), 0);
	kbase_reg_write32(kbdev, JOB_CONTROL_ENUM(JOB_IRQ_MASK), 0);
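
	/* With the GPU and JOB IRQ masks zeroed, job completion is detected by
	 * polling JOB_IRQ_RAWSTAT in run_job() rather than via the interrupt
	 * handlers.
	 */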

	/* power up requested cores */
	kbase_reg_write64(kbdev, GPU_CONTROL_ENUM(SHADER_PWRON), cores);

	if (kbdev->dummy_job_wa.flags & KBASE_DUMMY_JOB_WA_FLAG_WAIT_POWERUP) {
		/* wait for power-ups */
		kbase_reg_poll64_timeout(kbdev, GPU_CONTROL_ENUM(SHADER_READY), val,
					 (val & cores) == cores, 10, timeout_us, false);
	}

	if (kbdev->dummy_job_wa.flags & KBASE_DUMMY_JOB_WA_FLAG_SERIALIZE) {
		size_t i;

		/* do for each requested core */
		for (i = 0; i < sizeof(cores) * 8; i++) {
			u64 affinity;

			affinity = 1ull << i;

			if (!(cores & affinity))
				continue;

			if (run_job(kbdev, as, slot, affinity, jc))
				failed++;
			runs++;
		}
	} else {
		if (run_job(kbdev, as, slot, cores, jc))
			failed++;
		runs++;
	}

	if (kbdev->dummy_job_wa.flags & KBASE_DUMMY_JOB_WA_FLAG_LOGICAL_SHADER_POWER) {
		/* power off shader cores (to reduce any dynamic leakage) */
		kbase_reg_write64(kbdev, GPU_CONTROL_ENUM(SHADER_PWROFF), cores);

		/* wait for power off complete */
		kbase_reg_poll64_timeout(kbdev, GPU_CONTROL_ENUM(SHADER_READY), val,
					 !(val & cores), 10, timeout_us, false);
		kbase_reg_poll64_timeout(kbdev, GPU_CONTROL_ENUM(SHADER_PWRTRANS), val,
					 !(val & cores), 10, timeout_us, false);
		kbase_reg_write32(kbdev, GPU_CONTROL_ENUM(GPU_IRQ_CLEAR), U32_MAX);
	}

	/* restore IRQ masks */
	kbase_reg_write32(kbdev, GPU_CONTROL_ENUM(GPU_IRQ_MASK), old_gpu_mask);
	kbase_reg_write32(kbdev, JOB_CONTROL_ENUM(JOB_IRQ_MASK), old_job_mask);

	if (failed)
		dev_err(kbdev->dev, "WA complete with %d failures out of %d runs\n", failed, runs);

	return failed ? -EFAULT : 0;
}

static ssize_t dummy_job_wa_info_show(struct device *const dev,
				      struct device_attribute *const attr, char *const buf)
{
	struct kbase_device *const kbdev = dev_get_drvdata(dev);
	int err;

	CSTD_UNUSED(attr);

	if (!kbdev || !kbdev->dummy_job_wa.kctx)
		return -ENODEV;

	err = scnprintf(buf, PAGE_SIZE, "slot %u flags %llx\n", kbdev->dummy_job_wa.slot,
			kbdev->dummy_job_wa.flags);

	return err;
}

static DEVICE_ATTR_RO(dummy_job_wa_info);

static bool wa_blob_load_needed(struct kbase_device *kbdev)
{
	/* The Arm Juno platform does not need the workaround blob */
	if (of_machine_is_compatible("arm,juno"))
		return false;

	return true;
}

int kbase_dummy_job_wa_load(struct kbase_device *kbdev)
{
	const struct firmware *firmware;
	static const char wa_name[] = DUMMY_JOB_WA_BINARY_NAME;
	const u32 signature = 0x4157;
	const u32 version = 2;
	const u8 *fw_end;
	const u8 *fw;
	const struct wa_header *header;
	const struct wa_v2_info *v2_info;
	u32 blob_offset;
	int err;
	struct kbase_context *kctx;

	/* Calls to this function are inherently asynchronous, with respect to
	 * MMU operations.
	 */
	const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC;

	lockdep_assert_held(&kbdev->fw_load_lock);

	if (!wa_blob_load_needed(kbdev))
		return 0;
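
	/* The workaround image is expected to start with a wa_header, whose
	 * info_offset points at a wa_v2_info block; that in turn points at a
	 * chain of wa_blob entries linked through blob_offset, where a
	 * blob_offset of 0 terminates the chain.
	 */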

	/* load the wa */
	err = request_firmware(&firmware, wa_name, kbdev->dev);
	if (err) {
		dev_err(kbdev->dev,
			"WA blob missing. Please refer to the Arm Mali DDK Valhall Release Notes, "
			"Part number DC-06002 or contact support-mali@arm.com - driver probe will fail\n");
		return -ENODEV;
	}

	kctx = kbase_create_context(kbdev, true, BASE_CONTEXT_CREATE_FLAG_NONE, 0, NULL);

	if (!kctx) {
		dev_err(kbdev->dev, "Failed to create WA context\n");
		goto no_ctx;
	}

	fw = firmware->data;
	fw_end = fw + firmware->size;

	dev_dbg(kbdev->dev, "Loaded firmware of size %zu bytes\n", firmware->size);

	if (!within_range(fw, fw_end, 0, sizeof(*header))) {
		dev_err(kbdev->dev, "WA too small\n");
		goto bad_fw;
	}

	header = (const struct wa_header *)(fw + 0);

	if (header->signature != signature) {
		dev_err(kbdev->dev, "WA signature failure: 0x%lx\n",
			(unsigned long)header->signature);
		goto bad_fw;
	}

	if (header->version != version) {
		dev_err(kbdev->dev, "WA version 0x%lx not supported\n",
			(unsigned long)header->version);
		goto bad_fw;
	}

	if (!within_range(fw, fw_end, header->info_offset, sizeof(*v2_info))) {
		dev_err(kbdev->dev, "WA info offset out of bounds\n");
		goto bad_fw;
	}

	v2_info = (const struct wa_v2_info *)(fw + header->info_offset);

	if (v2_info->flags & ~KBASE_DUMMY_JOB_WA_FLAGS) {
		dev_err(kbdev->dev, "Unsupported WA flag(s): 0x%llx\n",
			(unsigned long long)v2_info->flags);
		goto bad_fw;
	}

	kbdev->dummy_job_wa.slot = v2_info->js;
	kbdev->dummy_job_wa.jc = v2_info->jc;
	kbdev->dummy_job_wa.flags = v2_info->flags;

	blob_offset = v2_info->blob_offset;

	while (blob_offset) {
		const struct wa_blob *blob;
		size_t nr_pages;
		u64 flags;
		u64 gpu_va;
		struct kbase_va_region *va_region;

		if (!within_range(fw, fw_end, blob_offset, sizeof(*blob))) {
			dev_err(kbdev->dev, "Blob offset out-of-range: 0x%lx\n",
				(unsigned long)blob_offset);
			goto bad_fw;
		}

		blob = (const struct wa_blob *)(fw + blob_offset);
		if (!within_range(fw, fw_end, blob->payload_offset, blob->size)) {
			dev_err(kbdev->dev, "Payload out-of-bounds\n");
			goto bad_fw;
		}

		gpu_va = blob->base;
		if (PAGE_ALIGN(gpu_va) != gpu_va) {
			dev_err(kbdev->dev, "blob not page aligned\n");
			goto bad_fw;
		}
		nr_pages = PFN_UP(blob->size);
		flags = blob->map_flags | BASE_MEM_FLAG_MAP_FIXED;

		va_region = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, &gpu_va,
					    mmu_sync_info);

		if (!va_region) {
			dev_err(kbdev->dev, "Failed to allocate for blob\n");
		} else {
			struct kbase_vmap_struct vmap = { 0 };
			const u8 *payload;
			void *dst;

			/* copy the payload */
			payload = fw + blob->payload_offset;

			dst = kbase_vmap(kctx, va_region->start_pfn << PAGE_SHIFT,
					 nr_pages << PAGE_SHIFT, &vmap);

			if (dst) {
				memcpy(dst, payload, blob->size);
				kbase_vunmap(kctx, &vmap);
			} else {
				dev_err(kbdev->dev, "Failed to copy payload\n");
			}
		}
		blob_offset = blob->blob_offset; /* follow chain */
	}

	release_firmware(firmware);

	kbasep_js_schedule_privileged_ctx(kbdev, kctx);

	kbdev->dummy_job_wa.kctx = kctx;
	/* allow kbase_dummy_job_wa_cleanup() to tear this down on unload */
	kbdev->dummy_job_wa_loaded = true;

	err = sysfs_create_file(&kbdev->dev->kobj, &dev_attr_dummy_job_wa_info.attr);
	if (err)
		dev_err(kbdev->dev, "SysFS file creation for dummy job wa failed\n");

	return 0;

bad_fw:
	kbase_destroy_context(kctx);
no_ctx:
	release_firmware(firmware);
	return -EFAULT;
}

void kbase_dummy_job_wa_cleanup(struct kbase_device *kbdev)
{
	struct kbase_context *wa_kctx;

	/* return if the dummy job has not been loaded */
	if (kbdev->dummy_job_wa_loaded == false)
		return;

	/* Can be safely called even if the file wasn't created on probe */
	sysfs_remove_file(&kbdev->dev->kobj, &dev_attr_dummy_job_wa_info.attr);

	wa_kctx = READ_ONCE(kbdev->dummy_job_wa.kctx);
	WRITE_ONCE(kbdev->dummy_job_wa.kctx, NULL);
	/* make this write visible before we tear down the ctx */
	smp_mb();
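
	/* Drop the privileged scheduling reference taken by
	 * kbasep_js_schedule_privileged_ctx() in kbase_dummy_job_wa_load()
	 * before destroying the context.
	 */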
	if (wa_kctx) {
		kbasep_js_release_privileged_ctx(kbdev, wa_kctx);
		kbase_destroy_context(wa_kctx);
	}
}