diff options
author | Whi copybara merger <whitechapel-automerger@google.com> | 2021-07-09 00:36:34 +0000 |
---|---|---|
committer | Sharad Bagri <sharadbagri@google.com> | 2021-07-09 10:16:18 -0700 |
commit | f20eac91b6e63ee33793ee4aae418cf9fc84b1cf (patch) | |
tree | d58afe1091bb328b143692eb9317fb613d44cd9c | |
parent | 1aecc588849fb53db0cbc0e8d405e5d720c62524 (diff) | |
download | abrolhos-f20eac91b6e63ee33793ee4aae418cf9fc84b1cf.tar.gz |
[Copybara Auto Merge] Merge branch 'whitechapel' into android-gs-pixel-5.10
edgetpu: add force_reset flag when restarting firmware
When CPU reset is controlled externally (like mobile platforms relying on
GSA), we want the critical wake-up path to be as short as possible. At
the same time, when the device goes to a bad state, we want to make sure
a CPU reset is issued.
Add a flag to edgetpu_firmware_restart_locked, propagate it to the
restart handler in edgetpu_firmware_chip_data, and use it on Abrolhos to
send a GSA_TPU_SHUTDOWN command in order to make sure the CPU is reset.
Bug: 190871324
(cherry picked from commit 88bae702e99cd57b7962043ec76a97939765f6d7)
Revert "edgetpu: abrolhos attempt FW restart on power down"
This reverts commit d5487306b13e81d89435778dd487541ac341759b.
(cherry picked from commit 963540540ce495cd1c0bd6123aea470f26dbbc75)
edgetpu: abrolhos attempt FW restart on power down
Bug: 190871324
edgetpu: KCI early leave if the device state is bad
edgetpu: check device pointer in logging functions
edgetpu: abrolhos: bcl: Renaming gs101_bcl to google_bcl
Bug: 192683348
Bug: 193049438
edgetpu: abrolhos: return 0 when TZ mailbox is not acquired
Bug: 192808637
edgetpu: log pids when acquire fails
edgetpu: increase reverse KCI buffer size
edgetpu: abrolhos fix thermal suspend condition
edgetpu: move default firmware run and firmware_name param to common
edgetpu: create edgetpu_firmware_chip_data
edgetpu: move default firmware run and firmware_name param to common
edgetpu: create edgetpu_firmware_chip_data
edgetpu: watchdog bite when OPEN_DEVICE KCI timed out
edgetpu: remove edgetpu_chip_firmware_run
edgetpu: detach mailbox when activation failed
edgetpu: reduce IOMMU fault reports severity
edgetpu: allow buffer unmapping on errored groups
GitOrigin-RevId: 0f4b817fa17e1df8270cb303eab0aceec4e8bf50
Change-Id: I1e7092b9f8f25ee72b559c6246d705bca24c1cdb
-rw-r--r-- | drivers/edgetpu/abrolhos-device.c | 4 | ||||
-rw-r--r-- | drivers/edgetpu/abrolhos-firmware.c | 18 | ||||
-rw-r--r-- | drivers/edgetpu/abrolhos-pm.c | 14 | ||||
-rw-r--r-- | drivers/edgetpu/abrolhos-thermal.c | 6 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-device-group.c | 19 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-dmabuf.c | 5 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-firmware.c | 105 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-firmware.h | 46 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-fs.c | 8 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-google-iommu.c | 28 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-internal.h | 20 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-kci.c | 14 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-kci.h | 2 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-mailbox.c | 8 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-mmu.h | 5 | ||||
-rw-r--r-- | drivers/edgetpu/mobile-firmware.h | 2 |
16 files changed, 183 insertions, 121 deletions
diff --git a/drivers/edgetpu/abrolhos-device.c b/drivers/edgetpu/abrolhos-device.c index 1277199..e38c508 100644 --- a/drivers/edgetpu/abrolhos-device.c +++ b/drivers/edgetpu/abrolhos-device.c @@ -225,9 +225,9 @@ int edgetpu_chip_release_ext_mailbox(struct edgetpu_client *client, mutex_lock(&apdev->tz_mailbox_lock); if (!apdev->secure_client) { - etdev_err(client->etdev, "TZ mailbox already released\n"); + etdev_warn(client->etdev, "TZ mailbox already released\n"); mutex_unlock(&apdev->tz_mailbox_lock); - return -ENODEV; + return 0; } if (apdev->secure_client != client) { etdev_err(client->etdev, diff --git a/drivers/edgetpu/abrolhos-firmware.c b/drivers/edgetpu/abrolhos-firmware.c index 9acc0d2..5a8cd2e 100644 --- a/drivers/edgetpu/abrolhos-firmware.c +++ b/drivers/edgetpu/abrolhos-firmware.c @@ -64,12 +64,17 @@ static void abrolhos_firmware_teardown_buffer( { } -static int abrolhos_firmware_restart(struct edgetpu_firmware *et_fw) +static int abrolhos_firmware_restart(struct edgetpu_firmware *et_fw, + bool force_reset) { struct edgetpu_dev *etdev = et_fw->etdev; struct abrolhos_platform_dev *edgetpu_pdev = to_abrolhos_dev(etdev); int tpu_state; + /* We are in a bad state, send shutdown command and hope the device recovers */ + if (force_reset) + gsa_send_tpu_cmd(edgetpu_pdev->gsa_dev, GSA_TPU_SHUTDOWN); + tpu_state = gsa_send_tpu_cmd(edgetpu_pdev->gsa_dev, GSA_TPU_START); if (tpu_state < 0) { @@ -199,7 +204,8 @@ out_unmap: return ret; } -static const struct edgetpu_firmware_handlers abrolhos_firmware_handlers = { +static const struct edgetpu_firmware_chip_data abrolhos_firmware_chip_data = { + .default_firmware_name = EDGETPU_DEFAULT_FIRMWARE_NAME, .alloc_buffer = abrolhos_firmware_alloc_buffer, .free_buffer = abrolhos_firmware_free_buffer, .setup_buffer = abrolhos_firmware_setup_buffer, @@ -210,7 +216,7 @@ static const struct edgetpu_firmware_handlers abrolhos_firmware_handlers = { int mobile_edgetpu_firmware_create(struct edgetpu_dev *etdev) { - return 
edgetpu_firmware_create(etdev, &abrolhos_firmware_handlers); + return edgetpu_firmware_create(etdev, &abrolhos_firmware_chip_data); } void mobile_edgetpu_firmware_destroy(struct edgetpu_dev *etdev) @@ -218,12 +224,6 @@ void mobile_edgetpu_firmware_destroy(struct edgetpu_dev *etdev) edgetpu_firmware_destroy(etdev); } -int edgetpu_chip_firmware_run(struct edgetpu_dev *etdev, const char *name, - enum edgetpu_firmware_flags flags) -{ - return edgetpu_firmware_run(etdev, name, flags); -} - unsigned long edgetpu_chip_firmware_iova(struct edgetpu_dev *etdev) { /* diff --git a/drivers/edgetpu/abrolhos-pm.c b/drivers/edgetpu/abrolhos-pm.c index 651a831..87a82d8 100644 --- a/drivers/edgetpu/abrolhos-pm.c +++ b/drivers/edgetpu/abrolhos-pm.c @@ -16,7 +16,6 @@ #include "abrolhos-platform.h" #include "abrolhos-pm.h" -#include "edgetpu-config.h" #include "edgetpu-firmware.h" #include "edgetpu-internal.h" #include "edgetpu-kci.h" @@ -435,7 +434,9 @@ static void abrolhos_power_down(struct edgetpu_pm *etpm); static int abrolhos_power_up(struct edgetpu_pm *etpm) { struct edgetpu_dev *etdev = etpm->etdev; +#if IS_ENABLED(CONFIG_GOOGLE_BCL) struct abrolhos_platform_dev *abpdev = to_abrolhos_dev(etdev); +#endif int ret = abrolhos_pwr_state_set( etpm->etdev, abrolhos_get_initial_pwr_state(etdev->dev)); @@ -479,12 +480,10 @@ static int abrolhos_power_up(struct edgetpu_pm *etpm) /* attempt firmware run */ switch (edgetpu_firmware_status_locked(etdev)) { case FW_VALID: - ret = edgetpu_firmware_restart_locked(etdev); + ret = edgetpu_firmware_restart_locked(etdev, false); break; case FW_INVALID: - ret = edgetpu_firmware_run_locked(etdev->firmware, - EDGETPU_DEFAULT_FIRMWARE_NAME, - FW_DEFAULT); + ret = edgetpu_firmware_run_default_locked(etdev); break; default: break; @@ -511,6 +510,11 @@ abrolhos_pm_shutdown_firmware(struct abrolhos_platform_dev *etpdev, return; etdev_warn(etdev, "Firmware shutdown request failed!\n"); + etdev_warn(etdev, "Attempting firmware restart\n"); + if 
(!edgetpu_firmware_restart_locked(etdev, true) && + !edgetpu_pchannel_power_down(etdev, false)) + return; + etdev_warn(etdev, "Requesting early GSA reset\n"); /* diff --git a/drivers/edgetpu/abrolhos-thermal.c b/drivers/edgetpu/abrolhos-thermal.c index d8123b6..67016c6 100644 --- a/drivers/edgetpu/abrolhos-thermal.c +++ b/drivers/edgetpu/abrolhos-thermal.c @@ -77,9 +77,9 @@ static int edgetpu_set_cur_state(struct thermal_cooling_device *cdev, dev_err(dev, "error setting tpu policy: %d\n", ret); goto out; } - if (state_original == 0) + if (pwr_state == TPU_OFF) cooling->thermal_suspended = true; - else if (cooling->cooling_state == 0) + else if (state_pwr_map[cooling->cooling_state].state == TPU_OFF) cooling->thermal_suspended = false; cooling->cooling_state = state_original; ret = edgetpu_kci_notify_throttling(etdev, pwr_state); @@ -258,7 +258,7 @@ static int tpu_thermal_parse_dvfs_table(struct edgetpu_thermal *thermal) for (i = 0; i < row_size; ++i) { int idx = col_size * i; state_pwr_map[i].state = of_data_int_array[idx]; - state_pwr_map[i].power = of_data_int_array[idx+1]; + state_pwr_map[i].power = of_data_int_array[idx + 1]; } return 0; diff --git a/drivers/edgetpu/edgetpu-device-group.c b/drivers/edgetpu/edgetpu-device-group.c index 53c8ca2..6172b2c 100644 --- a/drivers/edgetpu/edgetpu-device-group.c +++ b/drivers/edgetpu/edgetpu-device-group.c @@ -1493,8 +1493,8 @@ int edgetpu_device_group_unmap(struct edgetpu_device_group *group, int ret = 0; mutex_lock(&group->lock); - if (!edgetpu_device_group_is_finalized(group)) { - ret = edgetpu_group_errno(group); + if (!is_finalized_or_errored(group)) { + ret = -EINVAL; goto unlock_group; } @@ -1788,11 +1788,16 @@ int edgetpu_group_attach_and_open_mailbox(struct edgetpu_device_group *group) * Only attaching mailbox for finalized groups. * Don't attach mailbox for errored groups. 
*/ - if (edgetpu_device_group_is_finalized(group)) { - ret = edgetpu_group_attach_mailbox_locked(group); - if (!ret) - ret = edgetpu_group_activate(group); - } + if (!edgetpu_device_group_is_finalized(group)) + goto out_unlock; + ret = edgetpu_group_attach_mailbox_locked(group); + if (ret) + goto out_unlock; + ret = edgetpu_group_activate(group); + if (ret) + edgetpu_group_detach_mailbox_locked(group); + +out_unlock: mutex_unlock(&group->lock); return ret; } diff --git a/drivers/edgetpu/edgetpu-dmabuf.c b/drivers/edgetpu/edgetpu-dmabuf.c index 5259650..1c89178 100644 --- a/drivers/edgetpu/edgetpu-dmabuf.c +++ b/drivers/edgetpu/edgetpu-dmabuf.c @@ -744,8 +744,9 @@ int edgetpu_unmap_dmabuf(struct edgetpu_device_group *group, u32 die_index, int ret = -EINVAL; mutex_lock(&group->lock); - if (!edgetpu_device_group_is_finalized(group)) { - ret = edgetpu_group_errno(group); + /* allows unmapping on errored groups */ + if (!edgetpu_device_group_is_finalized(group) && !edgetpu_device_group_is_errored(group)) { + ret = -EINVAL; goto out_unlock; } edgetpu_mapping_lock(mappings); diff --git a/drivers/edgetpu/edgetpu-firmware.c b/drivers/edgetpu/edgetpu-firmware.c index 2a1e577..d23d00c 100644 --- a/drivers/edgetpu/edgetpu-firmware.c +++ b/drivers/edgetpu/edgetpu-firmware.c @@ -8,6 +8,7 @@ #include <linux/delay.h> #include <linux/device.h> #include <linux/firmware.h> +#include <linux/module.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -25,6 +26,9 @@ #include "edgetpu-sw-watchdog.h" #include "edgetpu-telemetry.h" +static char *firmware_name; +module_param(firmware_name, charp, 0660); + /* * Descriptor for loaded firmware, either in shared buffer mode or legacy mode * (non-shared, custom allocated memory). 
@@ -46,7 +50,7 @@ struct edgetpu_firmware_desc { }; struct edgetpu_firmware_private { - const struct edgetpu_firmware_handlers *handlers; + const struct edgetpu_firmware_chip_data *chip_fw; void *data; /* for edgetpu_firmware_(set/get)_data */ struct mutex fw_desc_lock; @@ -172,14 +176,14 @@ static int edgetpu_firmware_load_locked( struct edgetpu_firmware_desc *fw_desc, const char *name, enum edgetpu_firmware_flags flags) { - const struct edgetpu_firmware_handlers *handlers = et_fw->p->handlers; + const struct edgetpu_firmware_chip_data *chip_fw = et_fw->p->chip_fw; struct edgetpu_dev *etdev = et_fw->etdev; int ret; fw_desc->buf.flags = flags; - if (handlers && handlers->alloc_buffer) { - ret = handlers->alloc_buffer(et_fw, &fw_desc->buf); + if (chip_fw->alloc_buffer) { + ret = chip_fw->alloc_buffer(et_fw, &fw_desc->buf); if (ret) { etdev_err(etdev, "handler alloc_buffer failed: %d\n", ret); @@ -193,8 +197,8 @@ static int edgetpu_firmware_load_locked( goto out_free_buffer; } - if (handlers && handlers->setup_buffer) { - ret = handlers->setup_buffer(et_fw, &fw_desc->buf); + if (chip_fw->setup_buffer) { + ret = chip_fw->setup_buffer(et_fw, &fw_desc->buf); if (ret) { etdev_err(etdev, "handler setup_buffer failed: %d\n", ret); @@ -207,8 +211,8 @@ static int edgetpu_firmware_load_locked( out_do_unload_locked: edgetpu_firmware_do_unload_locked(et_fw, fw_desc); out_free_buffer: - if (handlers && handlers->free_buffer) - handlers->free_buffer(et_fw, &fw_desc->buf); + if (chip_fw->free_buffer) + chip_fw->free_buffer(et_fw, &fw_desc->buf); return ret; } @@ -216,19 +220,19 @@ static void edgetpu_firmware_unload_locked( struct edgetpu_firmware *et_fw, struct edgetpu_firmware_desc *fw_desc) { - const struct edgetpu_firmware_handlers *handlers = et_fw->p->handlers; + const struct edgetpu_firmware_chip_data *chip_fw = et_fw->p->chip_fw; /* * Platform specific implementation for cleaning up allocated buffer. 
*/ - if (handlers && handlers->teardown_buffer) - handlers->teardown_buffer(et_fw, &fw_desc->buf); + if (chip_fw->teardown_buffer) + chip_fw->teardown_buffer(et_fw, &fw_desc->buf); edgetpu_firmware_do_unload_locked(et_fw, fw_desc); /* * Platform specific implementation for freeing allocated buffer. */ - if (handlers && handlers->free_buffer) - handlers->free_buffer(et_fw, &fw_desc->buf); + if (chip_fw->free_buffer) + chip_fw->free_buffer(et_fw, &fw_desc->buf); } static char *fw_flavor_str(enum edgetpu_fw_flavor fw_flavor) @@ -445,7 +449,7 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw, const char *name, enum edgetpu_firmware_flags flags) { - const struct edgetpu_firmware_handlers *handlers = et_fw->p->handlers; + const struct edgetpu_firmware_chip_data *chip_fw = et_fw->p->chip_fw; struct edgetpu_firmware_desc new_fw_desc; int ret; bool is_bl1_run = (flags & FW_BL1); @@ -460,9 +464,9 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw, goto out_failed; etdev_dbg(et_fw->etdev, "run fw %s flags=0x%x", name, flags); - if (handlers && handlers->prepare_run) { + if (chip_fw->prepare_run) { /* Note this may recursively call us to run BL1 */ - ret = handlers->prepare_run(et_fw, &new_fw_desc.buf); + ret = chip_fw->prepare_run(et_fw, &new_fw_desc.buf); if (ret) goto out_unload_new_fw; } @@ -486,18 +490,18 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw, if (!ret && !is_bl1_run && et_fw->p->fw_info.fw_flavor != FW_FLAVOR_BL1) edgetpu_sw_wdt_start(et_fw->etdev); - if (!ret && !is_bl1_run && handlers && handlers->launch_complete) - handlers->launch_complete(et_fw); - else if (ret && handlers && handlers->launch_failed) - handlers->launch_failed(et_fw, ret); + if (!ret && !is_bl1_run && chip_fw->launch_complete) + chip_fw->launch_complete(et_fw); + else if (ret && chip_fw->launch_failed) + chip_fw->launch_failed(et_fw, ret); edgetpu_firmware_set_state(et_fw, ret); return ret; out_unload_new_fw: 
edgetpu_firmware_unload_locked(et_fw, &new_fw_desc); out_failed: - if (handlers && handlers->launch_failed) - handlers->launch_failed(et_fw, ret); + if (chip_fw->launch_failed) + chip_fw->launch_failed(et_fw, ret); edgetpu_firmware_set_state(et_fw, ret); return ret; } @@ -528,6 +532,31 @@ int edgetpu_firmware_run(struct edgetpu_dev *etdev, const char *name, return ret; } +int edgetpu_firmware_run_default_locked(struct edgetpu_dev *etdev) +{ + struct edgetpu_firmware *et_fw = etdev->firmware; + const char *run_firmware_name = + et_fw->p->chip_fw->default_firmware_name; + + if (firmware_name && *firmware_name) + run_firmware_name = firmware_name; + + return edgetpu_firmware_run_locked(etdev->firmware, run_firmware_name, + FW_DEFAULT); +} + +int edgetpu_firmware_run_default(struct edgetpu_dev *etdev) +{ + struct edgetpu_firmware *et_fw = etdev->firmware; + const char *run_firmware_name = + et_fw->p->chip_fw->default_firmware_name; + + if (firmware_name && *firmware_name) + run_firmware_name = firmware_name; + + return edgetpu_firmware_run(etdev, run_firmware_name, FW_DEFAULT); +} + bool edgetpu_firmware_is_loading(struct edgetpu_dev *etdev) { struct edgetpu_firmware *et_fw = etdev->firmware; @@ -558,10 +587,10 @@ edgetpu_firmware_set_status_locked(struct edgetpu_dev *etdev, } /* Caller must hold firmware lock for loading. */ -int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev) +int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev, bool force_reset) { struct edgetpu_firmware *et_fw = etdev->firmware; - const struct edgetpu_firmware_handlers *handlers = et_fw->p->handlers; + const struct edgetpu_firmware_chip_data *chip_fw = et_fw->p->chip_fw; int ret = -1; edgetpu_firmware_set_loading(et_fw); @@ -570,10 +599,10 @@ int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev) * Try restarting the firmware first, fall back to normal firmware start * if this fails. 
*/ - if (handlers && handlers->restart) - ret = handlers->restart(et_fw); - if (ret && handlers && handlers->prepare_run) { - ret = handlers->prepare_run(et_fw, &et_fw->p->fw_desc.buf); + if (chip_fw->restart) + ret = chip_fw->restart(et_fw, force_reset); + if (ret && chip_fw->prepare_run) { + ret = chip_fw->prepare_run(et_fw, &et_fw->p->fw_desc.buf); if (ret) goto out; } @@ -637,7 +666,7 @@ static ssize_t load_firmware_store( return PTR_ERR(name); etdev_info(etdev, "loading firmware %s\n", name); - ret = edgetpu_chip_firmware_run(etdev, name, 0); + ret = edgetpu_firmware_run(etdev, name, 0); kfree(name); @@ -726,14 +755,14 @@ static void edgetpu_firmware_wdt_timeout_action(void *data) ret = edgetpu_firmware_pm_get(et_fw); if (!ret) { - ret = edgetpu_firmware_restart_locked(etdev); + ret = edgetpu_firmware_restart_locked(etdev, true); edgetpu_pm_put(etdev->pm); } edgetpu_firmware_unlock(etdev); } int edgetpu_firmware_create(struct edgetpu_dev *etdev, - const struct edgetpu_firmware_handlers *handlers) + const struct edgetpu_firmware_chip_data *chip_fw) { struct edgetpu_firmware *et_fw; int ret; @@ -751,7 +780,7 @@ int edgetpu_firmware_create(struct edgetpu_dev *etdev, ret = -ENOMEM; goto out_kfree_et_fw; } - et_fw->p->handlers = handlers; + et_fw->p->chip_fw = chip_fw; mutex_init(&et_fw->p->fw_desc_lock); @@ -759,8 +788,8 @@ int edgetpu_firmware_create(struct edgetpu_dev *etdev, if (ret) goto out_kfree_et_fw_p; - if (handlers && handlers->after_create) { - ret = handlers->after_create(et_fw); + if (chip_fw->after_create) { + ret = chip_fw->after_create(et_fw); if (ret) { etdev_dbg(etdev, "%s: after create handler failed: %d\n", @@ -791,20 +820,20 @@ out_kfree_et_fw: void edgetpu_firmware_destroy(struct edgetpu_dev *etdev) { struct edgetpu_firmware *et_fw = etdev->firmware; - const struct edgetpu_firmware_handlers *handlers; + const struct edgetpu_firmware_chip_data *chip_fw; if (!et_fw) return; edgetpu_sw_wdt_destroy(etdev); if (et_fw->p) { - handlers = 
et_fw->p->handlers; + chip_fw = et_fw->p->chip_fw; /* * Platform specific implementation, which includes stop * running firmware. */ - if (handlers && handlers->before_destroy) - handlers->before_destroy(et_fw); + if (chip_fw->before_destroy) + chip_fw->before_destroy(et_fw); } device_remove_group(etdev->dev, &edgetpu_firmware_attr_group); diff --git a/drivers/edgetpu/edgetpu-firmware.h b/drivers/edgetpu/edgetpu-firmware.h index 3b784c5..0d3e1d4 100644 --- a/drivers/edgetpu/edgetpu-firmware.h +++ b/drivers/edgetpu/edgetpu-firmware.h @@ -94,11 +94,15 @@ struct edgetpu_firmware_buffer { const char *name; /* the name of this firmware */ }; -/* - * Each handler returns 0 to indicate success, non-zero value to - * indicate error. - */ -struct edgetpu_firmware_handlers { +struct edgetpu_firmware_chip_data { + /* Name of default firmware image for this chip. */ + const char *default_firmware_name; + + /* + * Chip handlers called by common firmware processing. + * Each handler returns 0 to indicate success, non-zero value to + * indicate error. + */ int (*after_create)(struct edgetpu_firmware *et_fw); /* * Release resource used in platform specific implementation, @@ -154,20 +158,10 @@ struct edgetpu_firmware_handlers { * Optional platform-specific handler to restart an already loaded * firmware. */ - int (*restart)(struct edgetpu_firmware *et_fw); + int (*restart)(struct edgetpu_firmware *et_fw, bool force_reset); }; /* - * Top-level chip-specific run firmware routine. - * Calls edgetpu_firmware_run() one or more times as appropriate for chip- - * specific one- or two-stage bootloader processing. - * - * @name: the name passed into underlying request_firmware API - * @flags: edgetpu_firmware_flags for the image - */ -int edgetpu_chip_firmware_run(struct edgetpu_dev *etdev, const char *name, - enum edgetpu_firmware_flags flags); -/* * Returns the chip-specific IOVA where the firmware is mapped. * * Debug purpose only. 
@@ -175,13 +169,20 @@ int edgetpu_chip_firmware_run(struct edgetpu_dev *etdev, const char *name, unsigned long edgetpu_chip_firmware_iova(struct edgetpu_dev *etdev); /* - * Load and run firmware. Called by edgetpu_chip_firmware_run(). + * Load and run firmware. * @name: the name passed into underlying request_firmware API * @flags: edgetpu_firmware_flags for the image + * Used internally by the sysfs load interface and by unit tests. */ int edgetpu_firmware_run(struct edgetpu_dev *etdev, const char *name, enum edgetpu_firmware_flags flags); +/* Load and run the default firmware name for the chip. */ +int edgetpu_firmware_run_default(struct edgetpu_dev *etdev); + +/* Runs default firmware for the chip, caller holds FW/PM locks */ +int edgetpu_firmware_run_default_locked(struct edgetpu_dev *etdev); + /* * Private data set and used by handlers. It is expected to * allocate and set the data on after_create() and release on @@ -191,7 +192,7 @@ void edgetpu_firmware_set_data(struct edgetpu_firmware *et_fw, void *data); void *edgetpu_firmware_get_data(struct edgetpu_firmware *et_fw); int edgetpu_firmware_create(struct edgetpu_dev *etdev, - const struct edgetpu_firmware_handlers *handlers); + const struct edgetpu_firmware_chip_data *chip_fw); void edgetpu_firmware_destroy(struct edgetpu_dev *etdev); void edgetpu_firmware_mappings_show(struct edgetpu_dev *etdev, struct seq_file *s); @@ -223,15 +224,16 @@ edgetpu_firmware_set_status_locked(struct edgetpu_dev *etdev, /* * Restarts the last firmware image loaded * Intended for power managed devices to re-run the firmware without a full - * reload from the file system + * reload from the file system. + * Optionally, force a CPU reset to recover from a bad firmware state. */ -int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev); +int edgetpu_firmware_restart_locked(struct edgetpu_dev *etdev, + bool force_reset); /* * Loads and runs the specified firmware assuming the required locks have been - * acquired + * acquired. 
Used to run second-stage bootloader. */ - int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw, const char *name, enum edgetpu_firmware_flags flags); diff --git a/drivers/edgetpu/edgetpu-fs.c b/drivers/edgetpu/edgetpu-fs.c index 6fbd642..1b6e039 100644 --- a/drivers/edgetpu/edgetpu-fs.c +++ b/drivers/edgetpu/edgetpu-fs.c @@ -625,6 +625,7 @@ static int edgetpu_ioctl_acquire_wakelock(struct edgetpu_client *client) error_release: edgetpu_wakelock_release(client->wakelock); edgetpu_wakelock_unlock(client->wakelock); + etdev_err(client->etdev, "PID: %d failed to acquire wakelock", client->pid); return ret; } @@ -646,11 +647,15 @@ edgetpu_ioctl_acquire_ext_mailbox(struct edgetpu_client *client, struct edgetpu_ext_mailbox_ioctl __user *argp) { struct edgetpu_ext_mailbox_ioctl ext_mailbox; + int ret; if (copy_from_user(&ext_mailbox, argp, sizeof(ext_mailbox))) return -EFAULT; - return edgetpu_chip_acquire_ext_mailbox(client, &ext_mailbox); + ret = edgetpu_chip_acquire_ext_mailbox(client, &ext_mailbox); + if (ret) + etdev_err(client->etdev, "PID: %d failed to acquire ext mailbox", client->pid); + return ret; } static int @@ -1055,6 +1060,7 @@ void edgetpu_fs_remove(struct edgetpu_dev *etdev) { device_remove_group(etdev->dev, &edgetpu_attr_group); device_destroy(edgetpu_class, etdev->devno); + etdev->etcdev = NULL; cdev_del(&etdev->cdev); debugfs_remove_recursive(etdev->d_entry); } diff --git a/drivers/edgetpu/edgetpu-google-iommu.c b/drivers/edgetpu/edgetpu-google-iommu.c index 9d28949..851a326 100644 --- a/drivers/edgetpu/edgetpu-google-iommu.c +++ b/drivers/edgetpu/edgetpu-google-iommu.c @@ -101,21 +101,21 @@ static int edgetpu_iommu_dev_fault_handler(struct iommu_fault *fault, struct edgetpu_dev *etdev = (struct edgetpu_dev *)token; if (fault->type == IOMMU_FAULT_DMA_UNRECOV) { - etdev_err(etdev, "Unrecoverable IOMMU fault!\n"); - etdev_err(etdev, "Reason = %08X\n", fault->event.reason); - etdev_err(etdev, "flags = %08X\n", fault->event.flags); - 
etdev_err(etdev, "pasid = %08X\n", fault->event.pasid); - etdev_err(etdev, "perms = %08X\n", fault->event.perm); - etdev_err(etdev, "addr = %llX\n", fault->event.addr); - etdev_err(etdev, "fetch_addr = %llX\n", + etdev_warn(etdev, "Unrecoverable IOMMU fault!\n"); + etdev_warn(etdev, "Reason = %08X\n", fault->event.reason); + etdev_warn(etdev, "flags = %08X\n", fault->event.flags); + etdev_warn(etdev, "pasid = %08X\n", fault->event.pasid); + etdev_warn(etdev, "perms = %08X\n", fault->event.perm); + etdev_warn(etdev, "addr = %llX\n", fault->event.addr); + etdev_warn(etdev, "fetch_addr = %llX\n", fault->event.fetch_addr); } else if (fault->type == IOMMU_FAULT_PAGE_REQ) { - etdev_err(etdev, "IOMMU page request fault!\n"); - etdev_err(etdev, "flags = %08X\n", fault->prm.flags); - etdev_err(etdev, "pasid = %08X\n", fault->prm.pasid); - etdev_err(etdev, "grpid = %08X\n", fault->prm.grpid); - etdev_err(etdev, "perms = %08X\n", fault->prm.perm); - etdev_err(etdev, "addr = %llX\n", fault->prm.addr); + etdev_dbg(etdev, "IOMMU page request fault!\n"); + etdev_dbg(etdev, "flags = %08X\n", fault->prm.flags); + etdev_dbg(etdev, "pasid = %08X\n", fault->prm.pasid); + etdev_dbg(etdev, "grpid = %08X\n", fault->prm.grpid); + etdev_dbg(etdev, "perms = %08X\n", fault->prm.perm); + etdev_dbg(etdev, "addr = %llX\n", fault->prm.addr); } // Tell the IOMMU driver to carry on return -EAGAIN; @@ -168,7 +168,7 @@ static int edgetpu_iommu_fault_handler(struct iommu_domain *domain, struct edgetpu_iommu_domain *etdomain = (struct edgetpu_iommu_domain *)token; - dev_err(dev, "IOMMU fault on address %08lX. PASID = %u flags = %08X", + dev_dbg(dev, "IOMMU fault on address %08lX. 
PASID = %u flags = %08X", iova, etdomain->pasid, flags); // Tell the IOMMU driver we are OK with this fault return 0; diff --git a/drivers/edgetpu/edgetpu-internal.h b/drivers/edgetpu/edgetpu-internal.h index 7c4966e..23e0c12 100644 --- a/drivers/edgetpu/edgetpu-internal.h +++ b/drivers/edgetpu/edgetpu-internal.h @@ -35,22 +35,24 @@ #include "edgetpu-thermal.h" #include "edgetpu-usage-stats.h" -#define etdev_err(etdev, fmt, ...) dev_err((etdev)->etcdev, fmt, ##__VA_ARGS__) +#define get_dev_for_logging(etdev) ((etdev)->etcdev ? (etdev)->etcdev : (etdev)->dev) + +#define etdev_err(etdev, fmt, ...) dev_err(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__) #define etdev_warn(etdev, fmt, ...) \ - dev_warn((etdev)->etcdev, fmt, ##__VA_ARGS__) + dev_warn(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__) #define etdev_info(etdev, fmt, ...) \ - dev_info((etdev)->etcdev, fmt, ##__VA_ARGS__) -#define etdev_dbg(etdev, fmt, ...) dev_dbg((etdev)->etcdev, fmt, ##__VA_ARGS__) + dev_info(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__) +#define etdev_dbg(etdev, fmt, ...) dev_dbg(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__) #define etdev_err_ratelimited(etdev, fmt, ...) \ - dev_err_ratelimited((etdev)->etcdev, fmt, ##__VA_ARGS__) + dev_err_ratelimited(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__) #define etdev_warn_ratelimited(etdev, fmt, ...) \ - dev_warn_ratelimited((etdev)->etcdev, fmt, ##__VA_ARGS__) + dev_warn_ratelimited(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__) #define etdev_info_ratelimited(etdev, fmt, ...) \ - dev_info_ratelimited((etdev)->etcdev, fmt, ##__VA_ARGS__) + dev_info_ratelimited(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__) #define etdev_dbg_ratelimited(etdev, fmt, ...) \ - dev_dbg_ratelimited((etdev)->etcdev, fmt, ##__VA_ARGS__) + dev_dbg_ratelimited(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__) #define etdev_warn_once(etdev, fmt, ...) 
\ - dev_warn_once((etdev)->etcdev, fmt, ##__VA_ARGS__) + dev_warn_once(get_dev_for_logging(etdev), fmt, ##__VA_ARGS__) /* The number of TPU tiles in an edgetpu chip */ #ifdef CONFIG_EDGETPU_FPGA diff --git a/drivers/edgetpu/edgetpu-kci.c b/drivers/edgetpu/edgetpu-kci.c index 73a47cc..1b467e2 100644 --- a/drivers/edgetpu/edgetpu-kci.c +++ b/drivers/edgetpu/edgetpu-kci.c @@ -16,8 +16,8 @@ #include "edgetpu-firmware.h" #include "edgetpu-internal.h" -#include "edgetpu-kci.h" #include "edgetpu-iremap-pool.h" +#include "edgetpu-kci.h" #include "edgetpu-mmu.h" #include "edgetpu-telemetry.h" #include "edgetpu-usage-stats.h" @@ -40,6 +40,14 @@ #define KCI_TIMEOUT (5000) #endif +/* A macro for KCIs to leave early when the device state is known to be bad. */ +#define RETURN_ERRNO_IF_ETDEV_NOT_GOOD(kci) \ + do { \ + int ret = edgetpu_get_state_errno_locked(kci->mailbox->etdev); \ + if (ret) \ + return ret; \ + } while (0) + static inline u32 edgetpu_kci_queue_element_size(enum mailbox_queue_type type) { if (type == MAILBOX_CMD_QUEUE) @@ -781,6 +789,7 @@ int edgetpu_kci_join_group(struct edgetpu_kci *kci, u8 n_dies, u8 vid) if (!kci) return -ENODEV; + RETURN_ERRNO_IF_ETDEV_NOT_GOOD(kci); return edgetpu_kci_send_cmd_with_data(kci, &cmd, &detail, sizeof(detail)); } @@ -792,6 +801,7 @@ int edgetpu_kci_leave_group(struct edgetpu_kci *kci) if (!kci) return -ENODEV; + RETURN_ERRNO_IF_ETDEV_NOT_GOOD(kci); return edgetpu_kci_send_cmd(kci, &cmd); } @@ -1001,6 +1011,7 @@ int edgetpu_kci_open_device(struct edgetpu_kci *kci, u32 mailbox_id, s16 vcid, b if (!kci) return -ENODEV; + RETURN_ERRNO_IF_ETDEV_NOT_GOOD(kci); if (vcid < 0) return edgetpu_kci_send_cmd(kci, &cmd); return edgetpu_kci_send_cmd_with_data(kci, &cmd, &detail, sizeof(detail)); @@ -1017,6 +1028,7 @@ int edgetpu_kci_close_device(struct edgetpu_kci *kci, u32 mailbox_id) if (!kci) return -ENODEV; + RETURN_ERRNO_IF_ETDEV_NOT_GOOD(kci); return edgetpu_kci_send_cmd(kci, &cmd); } diff --git a/drivers/edgetpu/edgetpu-kci.h 
b/drivers/edgetpu/edgetpu-kci.h index deb258d..2893f20 100644 --- a/drivers/edgetpu/edgetpu-kci.h +++ b/drivers/edgetpu/edgetpu-kci.h @@ -24,7 +24,7 @@ * Maximum number of outstanding KCI requests from firmware * This is used to size a circular buffer, so it must be a power of 2 */ -#define REVERSE_KCI_BUFFER_SIZE (8) +#define REVERSE_KCI_BUFFER_SIZE (32) /* * The status field in a firmware response is set to this by us when the diff --git a/drivers/edgetpu/edgetpu-mailbox.c b/drivers/edgetpu/edgetpu-mailbox.c index cf996f7..eedde54 100644 --- a/drivers/edgetpu/edgetpu-mailbox.c +++ b/drivers/edgetpu/edgetpu-mailbox.c @@ -18,6 +18,7 @@ #include "edgetpu-kci.h" #include "edgetpu-mailbox.h" #include "edgetpu-mmu.h" +#include "edgetpu-sw-watchdog.h" #include "edgetpu-wakelock.h" #include "edgetpu.h" @@ -1132,6 +1133,13 @@ int edgetpu_mailbox_activate(struct edgetpu_dev *etdev, u32 mailbox_id, s16 vcid eh->fw_state |= bit; } mutex_unlock(&eh->lock); + /* + * We are observing OPEN_DEVICE KCI fails while other KCIs (usage update / shutdown) still + * succeed and no firmware crash is reported. Kick off the firmware restart when we are + * facing this and hope this can rescue the device from the bad state. 
+ */ + if (ret == -ETIMEDOUT) + edgetpu_watchdog_bite(etdev, false); return ret; } diff --git a/drivers/edgetpu/edgetpu-mmu.h b/drivers/edgetpu/edgetpu-mmu.h index 094f14d..7cc9ffa 100644 --- a/drivers/edgetpu/edgetpu-mmu.h +++ b/drivers/edgetpu/edgetpu-mmu.h @@ -16,11 +16,6 @@ #include "edgetpu-internal.h" #include "edgetpu.h" -/* TODO(b/153947157): remove this */ -#if IS_ENABLED(CONFIG_EDGETPU_TEST) -#include <linux/iommu-ext.h> -#endif - #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 2, 0) #ifndef IOMMU_PASID_INVALID #define IOMMU_PASID_INVALID (-1U) diff --git a/drivers/edgetpu/mobile-firmware.h b/drivers/edgetpu/mobile-firmware.h index e0c8dd8..691eaf5 100644 --- a/drivers/edgetpu/mobile-firmware.h +++ b/drivers/edgetpu/mobile-firmware.h @@ -49,6 +49,4 @@ struct mobile_image_header { int mobile_edgetpu_firmware_create(struct edgetpu_dev *etdev); void mobile_edgetpu_firmware_destroy(struct edgetpu_dev *etdev); -int mobile_edgetpu_firmware_run_default(struct edgetpu_dev *etdev); - #endif /* __MOBILE_FIRMWARE_H__ */ |