diff options
author | Whi copybara merger <whitechapel-automerger@google.com> | 2023-03-27 09:57:12 -0700 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2023-03-28 14:25:37 -0700 |
commit | 7d226040fcdb03c939b20f5824617a7de9c00962 (patch) | |
tree | 847a809ff974de780cd19777cd97be8275934b93 | |
parent | 10294976775d8ffa64640346f8eb66b1da7f40f0 (diff) | |
download | abrolhos-7d226040fcdb03c939b20f5824617a7de9c00962.tar.gz |
[Copybara Auto Merge] Merge branch whitechapel into partner-android
edgetpu: fix -Wcast-function-type-strict
Building this driver with the newer AOSP Clang 17.0.0 produces the
following error:
private/google-modules/edgetpu/abrolhos/drivers/edgetpu/edgetpu-device-group.c:1007:4:
error: cast from 'int (*)(struct iova_mapping_worker_param *)' to
'edgetpu_async_job_t' (aka 'int (*)(void *)') converts to incompatible
function type [-Werror,-Wcast-function-type-strict]
(edgetpu_async_job_t)edgetpu_map_iova_sgt_worker);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Fix this by declaring edgetpu_map_iova_sgt_worker with the correct
signature, casting the parameter inside edgetpu_map_iova_sgt_worker, and
removing the function pointer cast. This warning was added to clang to help
catch CFI failures at compile time rather than at runtime.
Bug: 264965700
edgetpu: sync with darwinn-2.0
9364e47c8 edgetpu: Make group required for creating fence
a257e3575 edgetpu: create debugfs symlinks for non-default device names
b25b06c56 Revert "edgetpu: Continue powering up if the block is still on"
5dea12ffd edgetpu: Continue powering up if the block is still on
af318f68d edgetpu: retry and sleep until block down
b4cf36957 edgetpu: Add etdev as private data for syncfences
ede90cae7 edgetpu: usage_stats add cluster reconfigurations counters
77dae3e48 edgetpu: usage_stats: process metrics v2 data
3b2bc8d98 edgetpu: pm: reject power up if thermal suspended
5f19efb5b edgetpu: usage_stats send metrics v2 requests with v1 fallback
52c262671 edgetpu: usage stats: sync additional metrics v2 changes
e5cc5696b edgetpu: Only call .power_up if needed
d06a8f889 edgetpu: Downgrade warning on external mailbox alloc
52fe2ac98 edgetpu: usage stats add field definitions for metrics v2
28bbb7446 edgetpu: usage stats ignore metric fields beyond known size
Bug: 271372136
Bug: 271374892
Bug: 269476405
Bug: 272701322
Bug: 264971968
Bug: 258868303
GitOrigin-RevId: 03a5015bf5a6601295e1967dbbedbe242192c30c
Change-Id: I2fa4b73fd554d559734f8803c7893570ae9fad8d
-rw-r--r-- | drivers/edgetpu/abrolhos/config.h | 3 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-device-group.c | 8 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-dmabuf.c | 20 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-external.c | 4 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-firmware.c | 10 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-fs.c | 50 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-internal.h | 1 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-kci.c | 14 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-kci.h | 5 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-pm.c | 11 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-thermal.h | 22 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-usage-stats.c | 380 | ||||
-rw-r--r-- | drivers/edgetpu/edgetpu-usage-stats.h | 79 | ||||
-rw-r--r-- | drivers/edgetpu/mobile-pm.c | 22 |
14 files changed, 372 insertions, 257 deletions
diff --git a/drivers/edgetpu/abrolhos/config.h b/drivers/edgetpu/abrolhos/config.h index 9cb5ffc..b1db928 100644 --- a/drivers/edgetpu/abrolhos/config.h +++ b/drivers/edgetpu/abrolhos/config.h @@ -34,6 +34,9 @@ */ #define EDGETPU_HAS_REMAPPED_DATA +/* Metrics are reported for a single default "cluster" component. */ +#define EDGETPU_TPU_CLUSTER_COUNT 1 + /* * The TPU VA where the firmware is located. * diff --git a/drivers/edgetpu/edgetpu-device-group.c b/drivers/edgetpu/edgetpu-device-group.c index 7734f33..2b13cf0 100644 --- a/drivers/edgetpu/edgetpu-device-group.c +++ b/drivers/edgetpu/edgetpu-device-group.c @@ -949,8 +949,9 @@ struct iova_mapping_worker_param { uint idx; }; -static int edgetpu_map_iova_sgt_worker(struct iova_mapping_worker_param *param) +static int edgetpu_map_iova_sgt_worker(void *p) { + struct iova_mapping_worker_param *param = p; struct edgetpu_device_group *group = param->group; uint i = param->idx; struct edgetpu_host_map *hmap = param->hmap; @@ -1006,9 +1007,8 @@ static int edgetpu_device_group_map_iova_sgt(struct edgetpu_device_group *group, params[i].hmap = hmap; params[i].group = group; params[i].idx = i + 1; - ret = edgetpu_async_add_job( - ctx, &params[i], - (edgetpu_async_job_t)edgetpu_map_iova_sgt_worker); + ret = edgetpu_async_add_job(ctx, &params[i], + edgetpu_map_iova_sgt_worker); if (ret) goto out_free; } diff --git a/drivers/edgetpu/edgetpu-dmabuf.c b/drivers/edgetpu/edgetpu-dmabuf.c index 72072f6..fbc9f48 100644 --- a/drivers/edgetpu/edgetpu-dmabuf.c +++ b/drivers/edgetpu/edgetpu-dmabuf.c @@ -888,7 +888,7 @@ static void edgetpu_dma_fence_release(struct dma_fence *fence) list_del(&etfence->etfence_list); spin_unlock_irqrestore(&etfence_list_lock, flags); - /* TODO(b/258868303): Don't remove this check when group required, might not yet be set. */ + /* group might not yet be set if error at init time. 
*/ group = etfence->group; if (group) { mutex_lock(&group->lock); @@ -963,15 +963,10 @@ int edgetpu_sync_fence_create(struct edgetpu_device_group *group, spin_lock_irqsave(&etfence_list_lock, flags); list_add_tail(&etfence->etfence_list, &etfence_list_head); spin_unlock_irqrestore(&etfence_list_lock, flags); - - /* TODO(b/258868303): Make group required, disallow creating fence we can't track. */ - if (group) { - etfence->group = edgetpu_device_group_get(group); - mutex_lock(&group->lock); - list_add_tail(&etfence->group_list, &group->dma_fence_list); - mutex_unlock(&group->lock); - } - + etfence->group = edgetpu_device_group_get(group); + mutex_lock(&group->lock); + list_add_tail(&etfence->group_list, &group->dma_fence_list); + mutex_unlock(&group->lock); fd_install(fd, sync_file->file); datap->fence = fd; return 0; @@ -1098,10 +1093,7 @@ int edgetpu_sync_fence_debugfs_show(struct seq_file *s, void *unused) if (fence->error) seq_printf(s, " err=%d", fence->error); - /* TODO(b/258868303): Remove check when group is required. 
*/ - if (etfence->group) - seq_printf(s, " group=%u", etfence->group->workload_id); - seq_putc(s, '\n'); + seq_printf(s, " group=%u\n", etfence->group->workload_id); spin_unlock_irq(&etfence->lock); } diff --git a/drivers/edgetpu/edgetpu-external.c b/drivers/edgetpu/edgetpu-external.c index 4b86e13..b954844 100644 --- a/drivers/edgetpu/edgetpu-external.c +++ b/drivers/edgetpu/edgetpu-external.c @@ -95,8 +95,8 @@ static int edgetpu_external_mailbox_alloc(struct device *edgetpu_dev, if (copy_from_user(&req.attr, (void __user *)client_info->attr, sizeof(req.attr))) { if (!client_info->attr) - etdev_warn(client->etdev, - "Illegal mailbox attributes, using VII mailbox attrs\n"); + etdev_dbg(client->etdev, + "Using VII mailbox attrs for external mailbox\n"); req.attr = group->mbox_attr; } diff --git a/drivers/edgetpu/edgetpu-firmware.c b/drivers/edgetpu/edgetpu-firmware.c index cf9009b..ad27ec9 100644 --- a/drivers/edgetpu/edgetpu-firmware.c +++ b/drivers/edgetpu/edgetpu-firmware.c @@ -478,20 +478,21 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw, enum edgetpu_firmware_flags flags) { const struct edgetpu_firmware_chip_data *chip_fw = et_fw->p->chip_fw; + struct edgetpu_dev *etdev = et_fw->etdev; struct edgetpu_firmware_desc new_fw_desc; int ret; bool is_bl1_run = (flags & FW_BL1); edgetpu_firmware_set_loading(et_fw); if (!is_bl1_run) - edgetpu_sw_wdt_stop(et_fw->etdev); + edgetpu_sw_wdt_stop(etdev); memset(&new_fw_desc, 0, sizeof(new_fw_desc)); ret = edgetpu_firmware_load_locked(et_fw, &new_fw_desc, name, flags); if (ret) goto out_failed; - etdev_dbg(et_fw->etdev, "run fw %s flags=%#x", name, flags); + etdev_dbg(etdev, "run fw %s flags=%#x", name, flags); if (chip_fw->prepare_run) { /* Note this may recursively call us to run BL1 */ ret = chip_fw->prepare_run(et_fw, &new_fw_desc.buf); @@ -516,13 +517,16 @@ int edgetpu_firmware_run_locked(struct edgetpu_firmware *et_fw, /* Don't start wdt if loaded firmware is second stage bootloader. 
*/ if (!ret && !is_bl1_run && et_fw->p->fw_info.fw_flavor != FW_FLAVOR_BL1) - edgetpu_sw_wdt_start(et_fw->etdev); + edgetpu_sw_wdt_start(etdev); if (!ret && !is_bl1_run && chip_fw->launch_complete) chip_fw->launch_complete(et_fw); else if (ret && chip_fw->launch_failed) chip_fw->launch_failed(et_fw, ret); edgetpu_firmware_set_state(et_fw, ret); + /* If previous firmware was metrics v1-only reset that flag and probe this again. */ + if (etdev->usage_stats) + etdev->usage_stats->use_metrics_v1 = false; return ret; out_unload_new_fw: diff --git a/drivers/edgetpu/edgetpu-fs.c b/drivers/edgetpu/edgetpu-fs.c index 68ff32e..854c14c 100644 --- a/drivers/edgetpu/edgetpu-fs.c +++ b/drivers/edgetpu/edgetpu-fs.c @@ -441,10 +441,11 @@ static int edgetpu_ioctl_sync_fence_create( if (copy_from_user(&data, (void __user *)datap, sizeof(data))) return -EFAULT; LOCK(client); - if (!client->group) - /* TODO(b/258868303): Require a group, disallow creating a fence we can't track. */ - etdev_warn(client->etdev, - "client creating sync fence not joined to a device group"); + if (!client->group) { + etdev_err(client->etdev, "client creating sync fence not joined to a device group"); + UNLOCK(client); + return -EINVAL; + } ret = edgetpu_sync_fence_create(client->group, &data); UNLOCK(client); if (ret) @@ -597,17 +598,14 @@ static int edgetpu_ioctl_acquire_wakelock(struct edgetpu_client *client) */ client->pid = current->pid; client->tgid = current->tgid; - edgetpu_thermal_lock(thermal); if (edgetpu_thermal_is_suspended(thermal)) { /* TPU is thermal suspended, so fail acquiring wakelock */ ret = -EAGAIN; etdev_warn_ratelimited(client->etdev, - "wakelock acquire rejected due to thermal suspend"); - edgetpu_thermal_unlock(thermal); + "wakelock acquire rejected due to device thermal limit exceeded"); goto error_client_unlock; } else { ret = edgetpu_pm_get(client->etdev->pm); - edgetpu_thermal_unlock(thermal); if (ret) { etdev_warn(client->etdev, "%s: pm_get failed (%d)", __func__, ret); @@ 
-1030,6 +1028,19 @@ static const struct file_operations mappings_ops = { .release = single_release, }; +static int syncfences_open(struct inode *inode, struct file *file) +{ + return single_open(file, edgetpu_sync_fence_debugfs_show, inode->i_private); +} + +static const struct file_operations syncfences_ops = { + .open = syncfences_open, + .read = seq_read, + .llseek = seq_lseek, + .owner = THIS_MODULE, + .release = single_release, +}; + static int edgetpu_pm_debugfs_set_wakelock(void *data, u64 val) { struct edgetpu_dev *etdev = data; @@ -1054,6 +1065,7 @@ static void edgetpu_fs_setup_debugfs(struct edgetpu_dev *etdev) } debugfs_create_file("mappings", 0440, etdev->d_entry, etdev, &mappings_ops); + debugfs_create_file("syncfences", 0440, etdev->d_entry, etdev, &syncfences_ops); debugfs_create_file("wakelock", 0220, etdev->d_entry, etdev, &fops_wakelock); #ifndef EDGETPU_FEATURE_MOBILE @@ -1253,6 +1265,10 @@ static int edgeptu_fs_add_interface(struct edgetpu_dev *etdev, struct edgetpu_de return ret; } + if (etiparams->name) + etiface->d_entry = + debugfs_create_symlink(etiparams->name, edgetpu_debugfs_dir, + etdev->dev_name); return 0; } @@ -1288,6 +1304,7 @@ void edgetpu_fs_remove(struct edgetpu_dev *etdev) for (i = 0; i < etdev->num_ifaces; i++) { struct edgetpu_dev_iface *etiface = &etdev->etiface[i]; + debugfs_remove(etiface->d_entry); device_destroy(edgetpu_class, etiface->devno); etiface->etcdev = NULL; cdev_del(&etiface->cdev); @@ -1295,20 +1312,6 @@ void edgetpu_fs_remove(struct edgetpu_dev *etdev) debugfs_remove_recursive(etdev->d_entry); } -static int syncfences_open(struct inode *inode, struct file *file) -{ - return single_open(file, edgetpu_sync_fence_debugfs_show, - inode->i_private); -} - -static const struct file_operations syncfences_ops = { - .open = syncfences_open, - .read = seq_read, - .llseek = seq_lseek, - .owner = THIS_MODULE, - .release = single_release, -}; - static void edgetpu_debugfs_global_setup(void) { edgetpu_debugfs_dir = 
debugfs_create_dir("edgetpu", NULL); @@ -1316,9 +1319,6 @@ static void edgetpu_debugfs_global_setup(void) pr_warn(DRIVER_NAME " error creating edgetpu debugfs dir\n"); return; } - - debugfs_create_file("syncfences", 0440, edgetpu_debugfs_dir, NULL, - &syncfences_ops); } int __init edgetpu_fs_init(void) diff --git a/drivers/edgetpu/edgetpu-internal.h b/drivers/edgetpu/edgetpu-internal.h index 37ea27f..e8cbf9a 100644 --- a/drivers/edgetpu/edgetpu-internal.h +++ b/drivers/edgetpu/edgetpu-internal.h @@ -238,6 +238,7 @@ struct edgetpu_dev_iface { struct edgetpu_dev *etdev; /* Pointer to core device struct */ dev_t devno; /* char device dev_t */ const char *name; /* interface specific device name */ + struct dentry *d_entry; /* debugfs symlink if not default device name iface */ }; /* Firmware crash_type codes */ diff --git a/drivers/edgetpu/edgetpu-kci.c b/drivers/edgetpu/edgetpu-kci.c index 4bd62c3..fd52c3c 100644 --- a/drivers/edgetpu/edgetpu-kci.c +++ b/drivers/edgetpu/edgetpu-kci.c @@ -934,10 +934,11 @@ int edgetpu_kci_update_usage_locked(struct edgetpu_dev *etdev) { #define EDGETPU_USAGE_BUFFER_SIZE 4096 struct edgetpu_command_element cmd = { - .code = KCI_CODE_GET_USAGE, + .code = KCI_CODE_GET_USAGE_V2, .dma = { .address = 0, .size = 0, + .flags = EDGETPU_USAGE_METRIC_VERSION, }, }; struct edgetpu_coherent_mem mem; @@ -953,13 +954,22 @@ int edgetpu_kci_update_usage_locked(struct edgetpu_dev *etdev) return ret; } + /* TODO(b/271372136): remove v1 when v1 firmware no longer in use. 
*/ +retry_v1: + if (etdev->usage_stats && etdev->usage_stats->use_metrics_v1) + cmd.code = KCI_CODE_GET_USAGE_V1; cmd.dma.address = mem.tpu_addr; cmd.dma.size = EDGETPU_USAGE_BUFFER_SIZE; memset(mem.vaddr, 0, sizeof(struct edgetpu_usage_header)); ret = edgetpu_kci_send_cmd_return_resp(etdev->kci, &cmd, &resp); - if (ret == KCI_ERROR_UNIMPLEMENTED || ret == KCI_ERROR_UNAVAILABLE) + if (ret == KCI_ERROR_UNIMPLEMENTED || ret == KCI_ERROR_UNAVAILABLE) { + if (etdev->usage_stats && !etdev->usage_stats->use_metrics_v1) { + etdev->usage_stats->use_metrics_v1 = true; + goto retry_v1; + } etdev_dbg(etdev, "firmware does not report usage\n"); + } else if (ret == KCI_ERROR_OK) edgetpu_usage_stats_process_buffer(etdev, mem.vaddr); else if (ret != -ETIMEDOUT) diff --git a/drivers/edgetpu/edgetpu-kci.h b/drivers/edgetpu/edgetpu-kci.h index b32b097..4d2f4b0 100644 --- a/drivers/edgetpu/edgetpu-kci.h +++ b/drivers/edgetpu/edgetpu-kci.h @@ -112,11 +112,14 @@ enum edgetpu_kci_code { KCI_CODE_OPEN_DEVICE = 9, KCI_CODE_CLOSE_DEVICE = 10, KCI_CODE_FIRMWARE_INFO = 11, - KCI_CODE_GET_USAGE = 12, + /* TODO(b/271372136): remove v1 when v1 firmware no longer in use. 
*/ + KCI_CODE_GET_USAGE_V1 = 12, KCI_CODE_NOTIFY_THROTTLING = 13, KCI_CODE_BLOCK_BUS_SPEED_CONTROL = 14, /* 15..18 not implemented in this branch */ KCI_CODE_FIRMWARE_TRACING_LEVEL = 19, + /* 20 not implemented in this branch */ + KCI_CODE_GET_USAGE_V2 = 21, KCI_CODE_RKCI_ACK = 256, }; diff --git a/drivers/edgetpu/edgetpu-pm.c b/drivers/edgetpu/edgetpu-pm.c index a71232d..40d41ff 100644 --- a/drivers/edgetpu/edgetpu-pm.c +++ b/drivers/edgetpu/edgetpu-pm.c @@ -53,9 +53,13 @@ static int edgetpu_pm_get_locked(struct edgetpu_pm *etpm) int ret = 0; if (!power_up_count) { - ret = etpm->p->handlers->power_up(etpm); - if (!ret) - edgetpu_mailbox_restore_active_mailbox_queues(etpm->etdev); + if (etpm->p->power_down_pending) { + etpm->p->power_down_pending = false; + } else { + ret = etpm->p->handlers->power_up(etpm); + if (!ret) + edgetpu_mailbox_restore_active_mailbox_queues(etpm->etdev); + } } if (ret) etpm->p->power_up_count--; @@ -103,7 +107,6 @@ int edgetpu_pm_get(struct edgetpu_pm *etpm) return 0; mutex_lock(&etpm->p->lock); - etpm->p->power_down_pending = false; ret = edgetpu_pm_get_locked(etpm); mutex_unlock(&etpm->p->lock); diff --git a/drivers/edgetpu/edgetpu-thermal.h b/drivers/edgetpu/edgetpu-thermal.h index dbd283f..0c163e0 100644 --- a/drivers/edgetpu/edgetpu-thermal.h +++ b/drivers/edgetpu/edgetpu-thermal.h @@ -56,17 +56,6 @@ int edgetpu_thermal_suspend(struct device *dev); int edgetpu_thermal_resume(struct device *dev); /* - * Holds thermal->lock. - * - * Does nothing if the thermal management is not supported. - */ -static inline void edgetpu_thermal_lock(struct edgetpu_thermal *thermal) -{ - if (!IS_ERR_OR_NULL(thermal)) - mutex_lock(&thermal->lock); -} - -/* * Checks whether device is thermal suspended. * Returns false if the thermal management is not supported. */ @@ -77,15 +66,4 @@ static inline bool edgetpu_thermal_is_suspended(struct edgetpu_thermal *thermal) return false; } -/* - * Releases thermal->lock. 
- * - * Does nothing if the thermal management is not supported. - */ -static inline void edgetpu_thermal_unlock(struct edgetpu_thermal *thermal) -{ - if (!IS_ERR_OR_NULL(thermal)) - mutex_unlock(&thermal->lock); -} - #endif /* __EDGETPU_THERMAL_H__ */ diff --git a/drivers/edgetpu/edgetpu-usage-stats.c b/drivers/edgetpu/edgetpu-usage-stats.c index ba93d49..9934ca6 100644 --- a/drivers/edgetpu/edgetpu-usage-stats.c +++ b/drivers/edgetpu/edgetpu-usage-stats.c @@ -74,6 +74,7 @@ int edgetpu_usage_add(struct edgetpu_dev *etdev, struct tpu_usage *tpu_usage) if (!ustats) return 0; + /* Note: as of metrics v2 the cluster_id is always zero and is ignored. */ etdev_dbg(etdev, "%s: uid=%u state=%u dur=%u", __func__, tpu_usage->uid, tpu_usage->power_state, tpu_usage->duration_us); @@ -125,63 +126,78 @@ static void edgetpu_utilization_update( mutex_unlock(&ustats->usage_stats_lock); } -static void edgetpu_counter_update( - struct edgetpu_dev *etdev, - struct edgetpu_usage_counter *counter) +static void edgetpu_counter_update(struct edgetpu_dev *etdev, struct edgetpu_usage_counter *counter, + uint version) { struct edgetpu_usage_stats *ustats = etdev->usage_stats; + uint component = version > 1 ? 
counter->component_id : 0; if (!ustats) return; - etdev_dbg(etdev, "%s: type=%d value=%llu\n", __func__, - counter->type, counter->value); + etdev_dbg(etdev, "%s: type=%d value=%llu comp=%u\n", __func__, counter->type, + counter->value, component); mutex_lock(&ustats->usage_stats_lock); if (counter->type >= 0 && counter->type < EDGETPU_COUNTER_COUNT) - ustats->counter[counter->type] += counter->value; + ustats->counter[counter->type][component] += counter->value; mutex_unlock(&ustats->usage_stats_lock); } -static void edgetpu_counter_clear( - struct edgetpu_dev *etdev, - enum edgetpu_usage_counter_type counter_type) +static void edgetpu_counter_clear(struct edgetpu_dev *etdev, + enum edgetpu_usage_counter_type counter_type) { struct edgetpu_usage_stats *ustats = etdev->usage_stats; + int i; - if (!ustats) - return; if (counter_type >= EDGETPU_COUNTER_COUNT) return; mutex_lock(&ustats->usage_stats_lock); - ustats->counter[counter_type] = 0; + for (i = 0; i < EDGETPU_TPU_CLUSTER_COUNT; i++) + ustats->counter[counter_type][i] = 0; mutex_unlock(&ustats->usage_stats_lock); } -static void edgetpu_max_watermark_update( - struct edgetpu_dev *etdev, - struct edgetpu_usage_max_watermark *max_watermark) +static void edgetpu_max_watermark_update(struct edgetpu_dev *etdev, + struct edgetpu_usage_max_watermark *max_watermark, + uint version) { struct edgetpu_usage_stats *ustats = etdev->usage_stats; + uint component = version > 1 ? 
max_watermark->component_id : 0; if (!ustats) return; - etdev_dbg(etdev, "%s: type=%d value=%llu\n", __func__, - max_watermark->type, max_watermark->value); + etdev_dbg(etdev, "%s: type=%d value=%llu comp=%u\n", __func__, max_watermark->type, + max_watermark->value, component); if (max_watermark->type < 0 || max_watermark->type >= EDGETPU_MAX_WATERMARK_TYPE_COUNT) return; mutex_lock(&ustats->usage_stats_lock); - if (max_watermark->value > ustats->max_watermark[max_watermark->type]) - ustats->max_watermark[max_watermark->type] = + if (max_watermark->value > ustats->max_watermark[max_watermark->type][component]) + ustats->max_watermark[max_watermark->type][component] = max_watermark->value; mutex_unlock(&ustats->usage_stats_lock); } +static void edgetpu_max_watermark_clear(struct edgetpu_dev *etdev, + enum edgetpu_usage_max_watermark_type max_watermark_type) +{ + struct edgetpu_usage_stats *ustats = etdev->usage_stats; + int i; + + if (max_watermark_type < 0 || max_watermark_type >= EDGETPU_MAX_WATERMARK_TYPE_COUNT) + return; + + mutex_lock(&ustats->usage_stats_lock); + for (i = 0; i < EDGETPU_TPU_CLUSTER_COUNT; i++) + ustats->max_watermark[max_watermark_type][i] = 0; + mutex_unlock(&ustats->usage_stats_lock); +} + static void edgetpu_thread_stats_update( struct edgetpu_dev *etdev, struct edgetpu_thread_stats *thread_stats) @@ -241,20 +257,44 @@ out: void edgetpu_usage_stats_process_buffer(struct edgetpu_dev *etdev, void *buf) { - struct edgetpu_usage_header *header = buf; - struct edgetpu_usage_metric *metric = - (struct edgetpu_usage_metric *)(header + 1); + struct edgetpu_usage_stats *ustats = etdev->usage_stats; + struct edgetpu_usage_metric *metric; + uint metric_size; + uint num_metrics; + uint version; int i; - etdev_dbg(etdev, "%s: n=%u sz=%u", __func__, - header->num_metrics, header->metric_size); - if (header->metric_size != sizeof(struct edgetpu_usage_metric)) { - etdev_dbg(etdev, "%s: expected sz=%zu, discard", __func__, - sizeof(struct 
edgetpu_usage_metric)); + if (!ustats) + return; + + /* TODO(b/271372136): remove v1 when v1 firmware no longer in use. */ + if (ustats->use_metrics_v1) { + struct edgetpu_usage_header_v1 *header = buf; + + metric_size = header->metric_size; + num_metrics = header->num_metrics; + version = 1; + metric = (struct edgetpu_usage_metric *)(header + 1); + } else { + struct edgetpu_usage_header *header = buf; + + metric_size = header->metric_size; + num_metrics = header->num_metrics; + version = header->version; + metric = (struct edgetpu_usage_metric *)((char *)header + header->header_bytes); + } + + etdev_dbg(etdev, "%s: v=%u n=%u sz=%u", __func__, version, num_metrics, metric_size); + if (metric_size < EDGETPU_USAGE_METRIC_SIZE_V1) { + etdev_warn_once(etdev, "fw metric size %u less than minimum %u", + metric_size, EDGETPU_USAGE_METRIC_SIZE_V1); return; } - for (i = 0; i < header->num_metrics; i++) { + if (metric_size > sizeof(struct edgetpu_usage_metric)) + etdev_dbg(etdev, "fw metrics are later version with unknown fields"); + + for (i = 0; i < num_metrics; i++) { switch (metric->type) { case EDGETPU_METRIC_TYPE_TPU_USAGE: edgetpu_usage_add(etdev, &metric->tpu_usage); @@ -264,19 +304,16 @@ void edgetpu_usage_stats_process_buffer(struct edgetpu_dev *etdev, void *buf) etdev, &metric->component_activity); break; case EDGETPU_METRIC_TYPE_COUNTER: - edgetpu_counter_update(etdev, &metric->counter); + edgetpu_counter_update(etdev, &metric->counter, version); break; case EDGETPU_METRIC_TYPE_MAX_WATERMARK: - edgetpu_max_watermark_update( - etdev, &metric->max_watermark); + edgetpu_max_watermark_update(etdev, &metric->max_watermark, version); break; case EDGETPU_METRIC_TYPE_THREAD_STATS: - edgetpu_thread_stats_update( - etdev, &metric->thread_stats); + edgetpu_thread_stats_update(etdev, &metric->thread_stats); break; case EDGETPU_METRIC_TYPE_DVFS_FREQUENCY_INFO: - edgetpu_dvfs_frequency_update( - etdev, metric->dvfs_frequency_info); + edgetpu_dvfs_frequency_update(etdev, 
metric->dvfs_frequency_info); break; default: etdev_dbg(etdev, "%s: %d: skip unknown type=%u", @@ -284,7 +321,7 @@ void edgetpu_usage_stats_process_buffer(struct edgetpu_dev *etdev, void *buf) break; } - metric++; + metric = (struct edgetpu_usage_metric *)((char *)metric + metric_size); } } @@ -304,36 +341,72 @@ int edgetpu_usage_get_utilization(struct edgetpu_dev *etdev, return val; } -static int64_t edgetpu_usage_get_counter( - struct edgetpu_dev *etdev, - enum edgetpu_usage_counter_type counter_type) +/* + * Resyncs firmware stats and formats the requested counter in the supplied buffer. + * + * If @report_per_cluster is true, and if the firmware implements metrics V2 or higher, + * then one value is formatted per cluster (for chips with only one cluster only one value is + * formatted). + * + * Returns the number of bytes written to buf. + */ +static ssize_t edgetpu_usage_format_counter(struct edgetpu_dev *etdev, char *buf, + enum edgetpu_usage_counter_type counter_type, + bool report_per_cluster) { struct edgetpu_usage_stats *ustats = etdev->usage_stats; - int64_t val; + uint ncomponents = report_per_cluster && !etdev->usage_stats->use_metrics_v1 ? + EDGETPU_TPU_CLUSTER_COUNT : 1; + uint i; + ssize_t ret = 0; if (counter_type >= EDGETPU_COUNTER_COUNT) - return -1; + return 0; edgetpu_kci_update_usage(etdev); mutex_lock(&ustats->usage_stats_lock); - val = ustats->counter[counter_type]; + for (i = 0; i < ncomponents; i++) { + if (i) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, " "); + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%llu", + ustats->counter[counter_type][i]); + } mutex_unlock(&ustats->usage_stats_lock); - return val; + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; } -static int64_t edgetpu_usage_get_max_watermark( - struct edgetpu_dev *etdev, - enum edgetpu_usage_max_watermark_type max_watermark_type) +/* + * Resyncs firmware stats and formats the requested max watermark in the supplied buffer. 
+ * + * If @report_per_cluster is true, and if the firmware implements metrics V2 or higher, + * then one value is formatted per cluster (for chips with only one cluster only one value is + * formatted). + * + * Returns the number of bytes written to buf. + */ +static ssize_t edgetpu_usage_format_max_watermark( + struct edgetpu_dev *etdev, char *buf, + enum edgetpu_usage_max_watermark_type max_watermark_type, bool report_per_cluster) { struct edgetpu_usage_stats *ustats = etdev->usage_stats; - int64_t val; + uint ncomponents = report_per_cluster && !etdev->usage_stats->use_metrics_v1 ? + EDGETPU_TPU_CLUSTER_COUNT : 1; + uint i; + ssize_t ret = 0; if (max_watermark_type >= EDGETPU_MAX_WATERMARK_TYPE_COUNT) - return -1; + return 0; edgetpu_kci_update_usage(etdev); mutex_lock(&ustats->usage_stats_lock); - val = ustats->max_watermark[max_watermark_type]; + for (i = 0; i < ncomponents; i++) { + if (i) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, " "); + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%llu", + ustats->max_watermark[max_watermark_type][i]); + } mutex_unlock(&ustats->usage_stats_lock); - return val; + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; } static ssize_t tpu_usage_show(struct device *dev, @@ -447,11 +520,8 @@ static ssize_t tpu_active_cycle_count_show(struct device *dev, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_counter(etdev, - EDGETPU_COUNTER_TPU_ACTIVE_CYCLES); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_TPU_ACTIVE_CYCLES, false); } static ssize_t tpu_active_cycle_count_store(struct device *dev, @@ -472,11 +542,8 @@ static ssize_t tpu_throttle_stall_count_show(struct device *dev, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_counter(etdev, - EDGETPU_COUNTER_TPU_THROTTLE_STALLS); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + 
return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_TPU_THROTTLE_STALLS, false); } static ssize_t tpu_throttle_stall_count_store(struct device *dev, @@ -497,11 +564,8 @@ static ssize_t inference_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_counter(etdev, - EDGETPU_COUNTER_INFERENCES); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_INFERENCES, true); } static ssize_t inference_count_store(struct device *dev, @@ -517,21 +581,15 @@ static ssize_t inference_count_store(struct device *dev, static DEVICE_ATTR(inference_count, 0664, inference_count_show, inference_count_store); -static ssize_t tpu_op_count_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t tpu_op_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_counter(etdev, - EDGETPU_COUNTER_TPU_OPS); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_TPU_OPS, true); } -static ssize_t tpu_op_count_store(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) +static ssize_t tpu_op_count_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); @@ -540,22 +598,16 @@ static ssize_t tpu_op_count_store(struct device *dev, } static DEVICE_ATTR(tpu_op_count, 0664, tpu_op_count_show, tpu_op_count_store); -static ssize_t param_cache_hit_count_show(struct device *dev, - struct device_attribute *attr, +static ssize_t param_cache_hit_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = 
edgetpu_usage_get_counter(etdev, - EDGETPU_COUNTER_PARAM_CACHE_HITS); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_PARAM_CACHE_HITS, false); } -static ssize_t param_cache_hit_count_store(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) +static ssize_t param_cache_hit_count_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); @@ -565,22 +617,16 @@ static ssize_t param_cache_hit_count_store(struct device *dev, static DEVICE_ATTR(param_cache_hit_count, 0664, param_cache_hit_count_show, param_cache_hit_count_store); -static ssize_t param_cache_miss_count_show(struct device *dev, - struct device_attribute *attr, +static ssize_t param_cache_miss_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_counter(etdev, - EDGETPU_COUNTER_PARAM_CACHE_MISSES); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_PARAM_CACHE_MISSES, false); } -static ssize_t param_cache_miss_count_store(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) +static ssize_t param_cache_miss_count_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); @@ -590,22 +636,16 @@ static ssize_t param_cache_miss_count_store(struct device *dev, static DEVICE_ATTR(param_cache_miss_count, 0664, param_cache_miss_count_show, param_cache_miss_count_store); -static ssize_t context_preempt_count_show(struct device *dev, - struct device_attribute *attr, +static ssize_t context_preempt_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = 
edgetpu_usage_get_counter(etdev, - EDGETPU_COUNTER_CONTEXT_PREEMPTS); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_CONTEXT_PREEMPTS, true); } -static ssize_t context_preempt_count_store(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) +static ssize_t context_preempt_count_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); @@ -619,10 +659,8 @@ static ssize_t hardware_preempt_count_show(struct device *dev, struct device_att char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_counter(etdev, EDGETPU_COUNTER_HARDWARE_PREEMPTS); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_HARDWARE_PREEMPTS, true); } static ssize_t hardware_preempt_count_store(struct device *dev, struct device_attribute *attr, @@ -640,10 +678,9 @@ static ssize_t hardware_ctx_save_time_show(struct device *dev, struct device_att char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_counter(etdev, EDGETPU_COUNTER_HARDWARE_CTX_SAVE_TIME_US); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_HARDWARE_CTX_SAVE_TIME_US, + true); } static ssize_t hardware_ctx_save_time_store(struct device *dev, struct device_attribute *attr, @@ -661,10 +698,9 @@ static ssize_t scalar_fence_wait_time_show(struct device *dev, struct device_att char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_counter(etdev, EDGETPU_COUNTER_SCALAR_FENCE_WAIT_TIME_US); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_SCALAR_FENCE_WAIT_TIME_US, + true); } static ssize_t 
scalar_fence_wait_time_store(struct device *dev, struct device_attribute *attr, @@ -679,13 +715,11 @@ static DEVICE_ATTR(scalar_fence_wait_time, 0664, scalar_fence_wait_time_show, scalar_fence_wait_time_store); static ssize_t long_suspend_count_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_counter(etdev, EDGETPU_COUNTER_LONG_SUSPEND); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_LONG_SUSPEND, false); } static ssize_t long_suspend_count_store(struct device *dev, struct device_attribute *attr, @@ -699,15 +733,53 @@ static ssize_t long_suspend_count_store(struct device *dev, struct device_attrib static DEVICE_ATTR(long_suspend_count, 0664, long_suspend_count_show, long_suspend_count_store); +#if EDGETPU_TPU_CLUSTER_COUNT > 1 +static ssize_t reconfigurations_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct edgetpu_dev *etdev = dev_get_drvdata(dev); + + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_RECONFIGURATIONS, false); +} + +static ssize_t reconfigurations_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct edgetpu_dev *etdev = dev_get_drvdata(dev); + + edgetpu_counter_clear(etdev, EDGETPU_COUNTER_RECONFIGURATIONS); + return count; +} +static DEVICE_ATTR(reconfigurations, 0664, reconfigurations_show, reconfigurations_store); + +static ssize_t preempt_reconfigurations_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct edgetpu_dev *etdev = dev_get_drvdata(dev); + + return edgetpu_usage_format_counter(etdev, buf, EDGETPU_COUNTER_PREEMPT_RECONFIGURATIONS, + false); +} + +static ssize_t preempt_reconfigurations_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct edgetpu_dev *etdev = 
dev_get_drvdata(dev); + + edgetpu_counter_clear(etdev, EDGETPU_COUNTER_PREEMPT_RECONFIGURATIONS); + return count; +} +static DEVICE_ATTR(preempt_reconfigurations, 0664, preempt_reconfigurations_show, + preempt_reconfigurations_store); +#endif /* EDGETPU_TPU_CLUSTER_COUNT > 1 */ + + static ssize_t outstanding_commands_max_show( struct device *dev, struct device_attribute *attr, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_max_watermark( - etdev, EDGETPU_MAX_WATERMARK_OUT_CMDS); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_max_watermark(etdev, buf, EDGETPU_MAX_WATERMARK_OUT_CMDS, + false); } static ssize_t outstanding_commands_max_store( @@ -715,14 +787,8 @@ static ssize_t outstanding_commands_max_store( const char *buf, size_t count) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - struct edgetpu_usage_stats *ustats = etdev->usage_stats; - - if (ustats) { - mutex_lock(&ustats->usage_stats_lock); - ustats->max_watermark[EDGETPU_MAX_WATERMARK_OUT_CMDS] = 0; - mutex_unlock(&ustats->usage_stats_lock); - } + edgetpu_max_watermark_clear(etdev, EDGETPU_MAX_WATERMARK_OUT_CMDS); return count; } static DEVICE_ATTR(outstanding_commands_max, 0664, @@ -733,11 +799,9 @@ static ssize_t preempt_depth_max_show( struct device *dev, struct device_attribute *attr, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_max_watermark( - etdev, EDGETPU_MAX_WATERMARK_PREEMPT_DEPTH); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_max_watermark(etdev, buf, EDGETPU_MAX_WATERMARK_PREEMPT_DEPTH, + true); } static ssize_t preempt_depth_max_store( @@ -745,14 +809,8 @@ static ssize_t preempt_depth_max_store( const char *buf, size_t count) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - struct edgetpu_usage_stats *ustats = etdev->usage_stats; - - if (ustats) { - mutex_lock(&ustats->usage_stats_lock); - 
ustats->max_watermark[EDGETPU_MAX_WATERMARK_PREEMPT_DEPTH] = 0; - mutex_unlock(&ustats->usage_stats_lock); - } + edgetpu_max_watermark_clear(etdev, EDGETPU_MAX_WATERMARK_PREEMPT_DEPTH); return count; } static DEVICE_ATTR(preempt_depth_max, 0664, preempt_depth_max_show, @@ -762,11 +820,10 @@ static ssize_t hardware_ctx_save_time_max_show( struct device *dev, struct device_attribute *attr, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_max_watermark( - etdev, EDGETPU_MAX_WATERMARK_HARDWARE_CTX_SAVE_TIME_US); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_max_watermark(etdev, buf, + EDGETPU_MAX_WATERMARK_HARDWARE_CTX_SAVE_TIME_US, + true); } static ssize_t hardware_ctx_save_time_max_store( @@ -774,14 +831,8 @@ static ssize_t hardware_ctx_save_time_max_store( const char *buf, size_t count) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - struct edgetpu_usage_stats *ustats = etdev->usage_stats; - - if (ustats) { - mutex_lock(&ustats->usage_stats_lock); - ustats->max_watermark[EDGETPU_MAX_WATERMARK_HARDWARE_CTX_SAVE_TIME_US] = 0; - mutex_unlock(&ustats->usage_stats_lock); - } + edgetpu_max_watermark_clear(etdev, EDGETPU_MAX_WATERMARK_HARDWARE_CTX_SAVE_TIME_US); return count; } static DEVICE_ATTR(hardware_ctx_save_time_max, 0664, hardware_ctx_save_time_max_show, @@ -791,11 +842,9 @@ static ssize_t scalar_fence_wait_time_max_show( struct device *dev, struct device_attribute *attr, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_max_watermark( - etdev, EDGETPU_MAX_WATERMARK_SCALAR_FENCE_WAIT_TIME_US); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_max_watermark( + etdev, buf, EDGETPU_MAX_WATERMARK_SCALAR_FENCE_WAIT_TIME_US, true); } static ssize_t scalar_fence_wait_time_max_store( @@ -803,14 +852,8 @@ static ssize_t scalar_fence_wait_time_max_store( const char *buf, size_t count) { struct 
edgetpu_dev *etdev = dev_get_drvdata(dev); - struct edgetpu_usage_stats *ustats = etdev->usage_stats; - - if (ustats) { - mutex_lock(&ustats->usage_stats_lock); - ustats->max_watermark[EDGETPU_MAX_WATERMARK_SCALAR_FENCE_WAIT_TIME_US] = 0; - mutex_unlock(&ustats->usage_stats_lock); - } + edgetpu_max_watermark_clear(etdev, EDGETPU_MAX_WATERMARK_SCALAR_FENCE_WAIT_TIME_US); return count; } static DEVICE_ATTR(scalar_fence_wait_time_max, 0664, scalar_fence_wait_time_max_show, @@ -820,11 +863,9 @@ static ssize_t suspend_time_max_show( struct device *dev, struct device_attribute *attr, char *buf) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - int64_t val; - val = edgetpu_usage_get_max_watermark( - etdev, EDGETPU_MAX_WATERMARK_SUSPEND_TIME_US); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return edgetpu_usage_format_max_watermark(etdev, buf, EDGETPU_MAX_WATERMARK_SUSPEND_TIME_US, + false); } static ssize_t suspend_time_max_store( @@ -832,14 +873,8 @@ static ssize_t suspend_time_max_store( const char *buf, size_t count) { struct edgetpu_dev *etdev = dev_get_drvdata(dev); - struct edgetpu_usage_stats *ustats = etdev->usage_stats; - - if (ustats) { - mutex_lock(&ustats->usage_stats_lock); - ustats->max_watermark[EDGETPU_MAX_WATERMARK_SUSPEND_TIME_US] = 0; - mutex_unlock(&ustats->usage_stats_lock); - } + edgetpu_max_watermark_clear(etdev, EDGETPU_MAX_WATERMARK_SUSPEND_TIME_US); return count; } static DEVICE_ATTR(suspend_time_max, 0664, suspend_time_max_show, @@ -900,6 +935,10 @@ static struct attribute *usage_stats_dev_attrs[] = { &dev_attr_hardware_ctx_save_time.attr, &dev_attr_scalar_fence_wait_time.attr, &dev_attr_long_suspend_count.attr, +#if EDGETPU_TPU_CLUSTER_COUNT > 1 + &dev_attr_reconfigurations.attr, + &dev_attr_preempt_reconfigurations.attr, +#endif &dev_attr_outstanding_commands_max.attr, &dev_attr_preempt_depth_max.attr, &dev_attr_hardware_ctx_save_time_max.attr, @@ -912,6 +951,7 @@ static struct attribute *usage_stats_dev_attrs[] = { static const 
struct attribute_group usage_stats_attr_group = { .attrs = usage_stats_dev_attrs, }; + void edgetpu_usage_stats_init(struct edgetpu_dev *etdev) { struct edgetpu_usage_stats *ustats; diff --git a/drivers/edgetpu/edgetpu-usage-stats.h b/drivers/edgetpu/edgetpu-usage-stats.h index a60b107..2d97043 100644 --- a/drivers/edgetpu/edgetpu-usage-stats.h +++ b/drivers/edgetpu/edgetpu-usage-stats.h @@ -10,9 +10,31 @@ #include <linux/hashtable.h> #include <linux/mutex.h> +/* The highest version of usage metrics handled by this driver. */ +#define EDGETPU_USAGE_METRIC_VERSION 2 + +/* Max # of TPU clusters accounted for in the highest supported metrics version. */ +#define EDGETPU_USAGE_CLUSTERS_MAX 3 + +/* + * Size in bytes of usage metric v1. + * If fewer bytes than this are received then discard the invalid buffer. + * This size also identifies the fw response as v1; subsequent versions will add another field + * with the version number. + */ +#define EDGETPU_USAGE_METRIC_SIZE_V1 20 + +/* v1 metric header struct. */ +struct edgetpu_usage_header_v1 { + uint32_t num_metrics; /* Number of metrics being reported */ + uint32_t metric_size; /* Size of each metric struct */ +}; + /* Header struct in the metric buffer. */ /* Must be kept in sync with firmware struct UsageTrackerHeader */ struct edgetpu_usage_header { + uint16_t header_bytes; /* Number of bytes in this header */ + uint16_t version; /* Metrics version */ uint32_t num_metrics; /* Number of metrics being reported */ uint32_t metric_size; /* Size of each metric struct */ }; @@ -20,15 +42,25 @@ struct edgetpu_usage_header { /* * Encapsulate TPU core usage information of a specific application for a * specific power state. - * Must be kept in sync with firmware struct TpuUsage. + * Must be kept in sync with firmware struct CoreUsage. */ struct tpu_usage { /* Unique identifier of the application. */ int32_t uid; /* The power state of the device (values are chip dependent) */ + /* Now called operating_point in FW. 
*/ uint32_t power_state; /* Duration of usage in microseconds. */ uint32_t duration_us; + + /* Following fields are added in metrics v2 */ + + /* Compute Core: TPU cluster ID. */ + /* Called core_id in FW. */ + /* Note: as of metrics v2 the cluster_id is always zero and is ignored. */ + uint8_t cluster_id; + /* Reserved. Filling out the next 32-bit boundary. */ + uint8_t reserved[3]; }; /* @@ -38,9 +70,13 @@ struct tpu_usage { enum edgetpu_usage_component { /* The device as a whole */ EDGETPU_USAGE_COMPONENT_DEVICE = 0, - /* Just the TPU core */ + /* Just the TPU core (scalar core and tiles) */ EDGETPU_USAGE_COMPONENT_TPU = 1, - EDGETPU_USAGE_COMPONENT_COUNT = 2, /* number of components above */ + /* Control core (ARM Cortex-R52 CPU) */ + /* Note: this component is not reported as of metrics v2. */ + EDGETPU_USAGE_COMPONENT_CONTROLCORE = 2, + + EDGETPU_USAGE_COMPONENT_COUNT = 3, /* number of components above */ }; /* @@ -62,7 +98,7 @@ enum edgetpu_usage_counter_type { EDGETPU_COUNTER_TPU_ACTIVE_CYCLES = 0, /* Number of stalls caused by throttling. */ EDGETPU_COUNTER_TPU_THROTTLE_STALLS = 1, - /* Number of graph invocations. */ + /* Number of graph invocations. (Now called kWorkload in FW.) */ EDGETPU_COUNTER_INFERENCES = 2, /* Number of TPU offload op invocations. */ EDGETPU_COUNTER_TPU_OPS = 3, @@ -81,7 +117,18 @@ enum edgetpu_usage_counter_type { /* Number of times (firmware)suspend function takes longer than SLA time. */ EDGETPU_COUNTER_LONG_SUSPEND = 10, - EDGETPU_COUNTER_COUNT = 11, /* number of counters above */ + /* The following counters are added in metrics v2. */ + + /* Counter 11 not used on TPU. */ + EDGETPU_COUNTER_CONTEXT_SWITCHES = 11, + + /* Number of TPU Cluster Reconfigurations. */ + EDGETPU_COUNTER_RECONFIGURATIONS = 12, + + /* Number of TPU Cluster Reconfigurations motivated exclusively by a preemption. */ + EDGETPU_COUNTER_PREEMPT_RECONFIGURATIONS = 13, + + EDGETPU_COUNTER_COUNT = 14, /* number of counters above */ }; /* Generic counter. 
Only reported if it has a value larger than 0. */ @@ -91,6 +138,11 @@ struct __packed edgetpu_usage_counter { /* Accumulated value since last initialization. */ uint64_t value; + + /* Following fields are added in metrics v2 */ + + /* Reporting component. */ + uint8_t component_id; }; /* Defines different max watermarks we track. */ @@ -121,15 +173,22 @@ struct __packed edgetpu_usage_max_watermark { * non-mobile, firmware boot on mobile). */ uint64_t value; + + /* Following fields are added in metrics v2 */ + + /* Reporting component. */ + uint8_t component_id; }; /* An enum to identify the tracked firmware threads. */ /* Must be kept in sync with firmware enum class UsageTrackerThreadId. */ enum edgetpu_usage_threadid { - /* Individual thread IDs are not tracked. */ + /* Individual thread IDs do not have identifiers assigned. */ + + /* Thread ID 14 is not used for TPU */ /* Number of task identifiers. */ - EDGETPU_FW_THREAD_COUNT = 12, + EDGETPU_FW_THREAD_COUNT = 17, }; /* Statistics related to a single thread in firmware. 
*/ @@ -173,11 +232,13 @@ struct edgetpu_usage_metric { #define UID_HASH_BITS 3 struct edgetpu_usage_stats { + /* if true the current firmware only implements metrics V1 */ + bool use_metrics_v1; DECLARE_HASHTABLE(uid_hash_table, UID_HASH_BITS); /* component utilization values reported by firmware */ int32_t component_utilization[EDGETPU_USAGE_COMPONENT_COUNT]; - int64_t counter[EDGETPU_COUNTER_COUNT]; - int64_t max_watermark[EDGETPU_MAX_WATERMARK_TYPE_COUNT]; + int64_t counter[EDGETPU_COUNTER_COUNT][EDGETPU_USAGE_CLUSTERS_MAX]; + int64_t max_watermark[EDGETPU_MAX_WATERMARK_TYPE_COUNT][EDGETPU_USAGE_CLUSTERS_MAX]; int32_t thread_stack_max[EDGETPU_FW_THREAD_COUNT]; struct mutex usage_stats_lock; }; diff --git a/drivers/edgetpu/mobile-pm.c b/drivers/edgetpu/mobile-pm.c index 50c3866..2aafd4c 100644 --- a/drivers/edgetpu/mobile-pm.c +++ b/drivers/edgetpu/mobile-pm.c @@ -21,6 +21,7 @@ #include "edgetpu-mailbox.h" #include "edgetpu-mobile-platform.h" #include "edgetpu-pm.h" +#include "edgetpu-thermal.h" #include "mobile-firmware.h" #include "mobile-pm.h" @@ -40,6 +41,10 @@ module_param(power_state, int, 0660); #define MAX_VOLTAGE_VAL 1250000 +#define BLOCK_DOWN_RETRY_TIMES 50 +#define BLOCK_DOWN_MIN_DELAY_US 1000 +#define BLOCK_DOWN_MAX_DELAY_US 1500 + enum edgetpu_pwr_state edgetpu_active_states[EDGETPU_NUM_STATES] = { TPU_ACTIVE_UUD, TPU_ACTIVE_SUD, @@ -434,8 +439,23 @@ static int mobile_power_up(struct edgetpu_pm *etpm) struct edgetpu_mobile_platform_pwr *platform_pwr = &etmdev->platform_pwr; int ret; - if (platform_pwr->is_block_down && !platform_pwr->is_block_down(etdev)) + if (platform_pwr->is_block_down) { + int times = 0; + + do { + if (platform_pwr->is_block_down(etdev)) + break; + usleep_range(BLOCK_DOWN_MIN_DELAY_US, BLOCK_DOWN_MAX_DELAY_US); + } while (++times < BLOCK_DOWN_RETRY_TIMES); + if (times >= BLOCK_DOWN_RETRY_TIMES && !platform_pwr->is_block_down(etdev)) + return -EAGAIN; + } + + if (edgetpu_thermal_is_suspended(etdev->thermal)) { + 
etdev_warn_ratelimited(etdev, + "power up rejected due to device thermal limit exceeded"); return -EAGAIN; + } ret = mobile_pwr_state_set(etpm->etdev, mobile_get_initial_pwr_state(etdev->dev)); |