path: root/drivers/edgetpu/edgetpu-sw-watchdog.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Edge TPU software WDT interface.
 *
 * Copyright (C) 2020 Google, Inc.
 */

#include <asm/barrier.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

#include "edgetpu-device-group.h"
#include "edgetpu-internal.h"
#include "edgetpu-kci.h"
#include "edgetpu-sw-watchdog.h"

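/*
 * If set, software watchdogs created after this point are disabled
 * (the value is latched into is_wdt_disabled at create time).
 */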
static bool wdt_disable;
module_param(wdt_disable, bool, 0660);

/* Worker to handle sw watchdog timeout. */
static void sw_wdt_handler_work(struct work_struct *work)
{
	struct edgetpu_sw_wdt_action_work *et_action_work =
		container_of(work, struct edgetpu_sw_wdt_action_work, work);
	struct edgetpu_dev *etdev = et_action_work->etdev;

	etdev_err(etdev, "watchdog restart");
	etdev->watchdog_timeout_count++;
	edgetpu_fatal_error_notify(etdev, EDGETPU_ERROR_WATCHDOG_TIMEOUT);
	edgetpu_firmware_watchdog_restart(etdev);
}

static void sw_wdt_start(struct edgetpu_sw_wdt *wdt)
{
	if (wdt->is_wdt_disabled) {
		etdev_dbg(wdt->etdev, "sw wdt disabled by module param");
		return;
	}
	etdev_dbg(wdt->etdev, "sw wdt: started\n");
	schedule_delayed_work(&wdt->dwork, wdt->hrtbeat_jiffs);
}

static void sw_wdt_stop(struct edgetpu_sw_wdt *wdt)
{
	etdev_dbg(wdt->etdev, "sw wdt: stopped\n");
	cancel_delayed_work_sync(&wdt->dwork);
}

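/*
 * Switch the heartbeat interval and restart the delayed worker at the new
 * rate, unless the timeout handler is already pending.
 */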
static void sw_wdt_modify_rate(struct edgetpu_sw_wdt *wdt, unsigned long rate)
{
	if (rate == wdt->hrtbeat_jiffs)
		return;
	wdt->hrtbeat_jiffs = rate;
	/*
	 * Don't restart the work if we already encountered a firmware timeout.
	 */
	if (work_pending(&wdt->et_action_work.work))
		return;
	sw_wdt_stop(wdt);
	sw_wdt_start(wdt);
}

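/*
 * Explicitly trigger the watchdog timeout handling (a "bite"), for callers
 * that have detected an unresponsive firmware through other means.
 */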
void edgetpu_watchdog_bite(struct edgetpu_dev *etdev)
{
	if (!etdev->etdev_sw_wdt)
		return;
	/*
	 * Stop sw wdog delayed worker, to reduce chance this explicit call
	 * races with a sw wdog timeout.  May be in IRQ context, no sync,
	 * worker may already be active.
	 */
	cancel_delayed_work(&etdev->etdev_sw_wdt->dwork);
	schedule_work(&etdev->etdev_sw_wdt->et_action_work.work);
}

/*
 * Ping the f/w for a response. Reschedule the work for the next heartbeat
 * if the f/w responded, or schedule the action-callback worker on timeout.
 */
static void sw_wdt_work(struct work_struct *work)
{
	int ret;
	struct delayed_work *dwork = to_delayed_work(work);
	struct edgetpu_sw_wdt *etdev_sw_wdt =
		container_of(dwork, struct edgetpu_sw_wdt, dwork);
	struct edgetpu_dev *etdev = etdev_sw_wdt->etdev;

	/* Ping f/w, and grab updated usage stats while we're at it. */
	etdev_dbg(etdev, "sw wdt: pinging firmware\n");
	ret = edgetpu_kci_update_usage(etdev);
	if (ret)
		etdev_dbg(etdev, "sw-watchdog ping resp:%d\n", ret);
	if (ret == -ETIMEDOUT) {
		etdev_err(etdev, "sw-watchdog response timed out\n");
		schedule_work(&etdev_sw_wdt->et_action_work.work);
	} else {
	/* Firmware responded; reschedule for the next heartbeat. */
		schedule_delayed_work(dwork, etdev_sw_wdt->hrtbeat_jiffs);
	}
}

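/*
 * Allocate and initialize the software watchdog for @etdev.
 *
 * @active_ms and @dormant_ms are the heartbeat intervals, in milliseconds,
 * used while clients are active or idle respectively.  The watchdog starts
 * out at the dormant rate and is not scheduled until edgetpu_sw_wdt_start().
 *
 * Returns 0 on success or -ENOMEM on allocation failure.
 */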
int edgetpu_sw_wdt_create(struct edgetpu_dev *etdev, unsigned long active_ms,
			  unsigned long dormant_ms)
{
	struct edgetpu_sw_wdt *etdev_sw_wdt;

	etdev_sw_wdt = kzalloc(sizeof(*etdev_sw_wdt), GFP_KERNEL);
	if (!etdev_sw_wdt)
		return -ENOMEM;

	etdev_sw_wdt->etdev = etdev;
	etdev_sw_wdt->hrtbeat_active = msecs_to_jiffies(active_ms);
	etdev_sw_wdt->hrtbeat_dormant = msecs_to_jiffies(dormant_ms);
	atomic_set(&etdev_sw_wdt->active_counter, 0);
	/* init to dormant rate */
	etdev_sw_wdt->hrtbeat_jiffs = etdev_sw_wdt->hrtbeat_dormant;
	INIT_DELAYED_WORK(&etdev_sw_wdt->dwork, sw_wdt_work);
	INIT_WORK(&etdev_sw_wdt->et_action_work.work, sw_wdt_handler_work);
	etdev_sw_wdt->et_action_work.etdev = etdev;
	etdev_sw_wdt->is_wdt_disabled = wdt_disable;
	etdev->etdev_sw_wdt = etdev_sw_wdt;
	return 0;
}

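/*
 * Start the heartbeat worker.  Returns -EINVAL if the watchdog has not been
 * created or has already been destroyed.
 */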
int edgetpu_sw_wdt_start(struct edgetpu_dev *etdev)
{
	struct edgetpu_sw_wdt *wdt;

	/* Pairs with the smp_mb() in edgetpu_sw_wdt_destroy(). */
	smp_mb();
	wdt = etdev->etdev_sw_wdt;
	if (!wdt)
		return -EINVAL;
	sw_wdt_start(wdt);
	return 0;
}

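/* Stop the heartbeat worker, waiting for an in-flight ping to finish. */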
void edgetpu_sw_wdt_stop(struct edgetpu_dev *etdev)
{
	struct edgetpu_sw_wdt *wdt;

	/* Pairs with the smp_mb() in edgetpu_sw_wdt_destroy(). */
	smp_mb();
	wdt = etdev->etdev_sw_wdt;
	if (!wdt)
		return;
	sw_wdt_stop(wdt);
}

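/*
 * Tear down the watchdog: hide it from concurrent start/stop callers,
 * cancel all pending work, then free it.
 */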
void edgetpu_sw_wdt_destroy(struct edgetpu_dev *etdev)
{
	struct edgetpu_sw_wdt *wdt = etdev->etdev_sw_wdt;
	int counter;

	if (!wdt)
		return;
	etdev->etdev_sw_wdt = NULL;
	/*
	 * Ensure the NULL write to etdev->etdev_sw_wdt is visible to other
	 * processes, so concurrent edgetpu_sw_wdt_start() calls won't start
	 * the watchdog again.
	 */
	smp_mb();
	sw_wdt_stop(wdt);
	/*
	 * Cancel and flush any queued timeout-handler work (e.g. from a
	 * watchdog bite) to prevent use-after-free of wdt.
	 */
	cancel_work_sync(&wdt->et_action_work.work);
	counter = atomic_read(&wdt->active_counter);
	if (counter)
		etdev_warn(etdev, "Unbalanced WDT active counter: %d", counter);
	kfree(wdt);
}

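/*
 * Take an active reference on the watchdog; the first reference switches
 * the heartbeat to the faster active rate.
 */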
void edgetpu_sw_wdt_inc_active_ref(struct edgetpu_dev *etdev)
{
	struct edgetpu_sw_wdt *wdt = etdev->etdev_sw_wdt;

	if (!wdt)
		return;
	if (!atomic_fetch_inc(&wdt->active_counter))
		sw_wdt_modify_rate(wdt, wdt->hrtbeat_active);
}

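/*
 * Drop an active reference; dropping the last one switches the heartbeat
 * back to the dormant rate.
 */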
void edgetpu_sw_wdt_dec_active_ref(struct edgetpu_dev *etdev)
{
	struct edgetpu_sw_wdt *wdt = etdev->etdev_sw_wdt;

	if (!wdt)
		return;
	if (atomic_fetch_dec(&wdt->active_counter) == 1)
		sw_wdt_modify_rate(wdt, wdt->hrtbeat_dormant);
}
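
/*
 * Typical lifecycle, as an illustrative sketch only (the real call sites
 * live elsewhere in the driver, e.g. the probe/remove and client
 * open/close paths; the interval values below are hypothetical):
 *
 *	edgetpu_sw_wdt_create(etdev, 5000, 30000);
 *	edgetpu_sw_wdt_start(etdev);
 *	...
 *	edgetpu_sw_wdt_inc_active_ref(etdev);	// first active client
 *	edgetpu_sw_wdt_dec_active_ref(etdev);	// last client goes idle
 *	...
 *	edgetpu_sw_wdt_destroy(etdev);		// on device teardown
 */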