path: root/libbpf-tools/ksnoop.bpf.c
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
/* Copyright (c) 2021, Oracle and/or its affiliates. */

#include "vmlinux.h"

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#include "ksnoop.h"

/* For kretprobes, the instruction pointer in the struct pt_regs context
 * is the kretprobe_trampoline, not the traced function.  We recover the
 * real instruction pointer by pushing it onto a per-task function stack
 * on entry and popping it on return.
 *
 * bpf_get_func_ip() (used below where the kernel supports it) also solves
 * this on entry, but "stack mode" - where we specify functions "a", "b"
 * and "c" and only want to see a trace if "a" calls "b" and "b" calls
 * "c" - still relies on this stack to decide whether trace data should
 * be collected.
 */
#define FUNC_MAX_STACK_DEPTH	16
/* used to convince verifier we do not stray outside of array bounds */
#define FUNC_STACK_DEPTH_MASK	(FUNC_MAX_STACK_DEPTH - 1)
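/* note: the masking trick only works because FUNC_MAX_STACK_DEPTH is a
 * power of two.
 */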

#ifndef ENOSPC
#define ENOSPC			28
#endif

struct func_stack {
	__u64 task;				/* current task pointer; also the map key */
	__u64 ips[FUNC_MAX_STACK_DEPTH];	/* entry IPs of traced functions, bottom up */
	__u8 stack_depth;			/* next free slot in ips[] */
};

#define MAX_TASKS		2048

/* function call stack for the functions we are tracing, hashed on a
 * per-task key
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, MAX_TASKS);
	__type(key, __u64);
	__type(value, struct func_stack);
} ksnoop_func_stack SEC(".maps");
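/* entries are created on first use (BPF_NOEXIST below) and never deleted;
 * once MAX_TASKS distinct tasks have entries, updates for new tasks fail
 * and those tasks are simply not traced.
 */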

/* per-cpu trace info hashed on function address */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__uint(max_entries, MAX_FUNC_TRACES);
	__type(key, __u64);
	__type(value, struct trace);
} ksnoop_func_map SEC(".maps");
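/* note: values are per-CPU, so if a task migrates between a function's
 * entry and its return, the return kretprobe sees a different copy of
 * the trace (and any entry data stashed on the other CPU is not found).
 */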

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(value_size, sizeof(int));
	__uint(key_size, sizeof(int));
} ksnoop_perf_map SEC(".maps");
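/* standard perf event array: each element is a per-CPU perf event fd set
 * up by userspace, which reads completed trace records from the
 * associated perf buffer.
 */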

static void clear_trace(struct trace *trace)
{
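	/* reset only the per-event fields; the rest of struct trace is
	 * configuration written by userspace (what to trace, predicates,
	 * stack-mode links) and must survive across events.
	 */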
	__builtin_memset(&trace->trace_data, 0, sizeof(trace->trace_data));
	trace->data_flags = 0;
	trace->buf_len = 0;
}

static struct trace *get_trace(struct pt_regs *ctx, bool entry)
{
	__u8 stack_depth, last_stack_depth;
	struct func_stack *func_stack;
	__u64 ip, last_ip = 0, task;
	struct trace *trace;

	task = bpf_get_current_task();

	func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
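	/* create-if-absent: BPF_NOEXIST means a racing insert for the same
	 * key loses harmlessly, and the re-lookup below picks up whichever
	 * copy won.
	 */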
	if (!func_stack) {
		struct func_stack new_stack = { .task = task };

		bpf_map_update_elem(&ksnoop_func_stack, &task, &new_stack,
				    BPF_NOEXIST);
		func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
		if (!func_stack)
			return NULL;
	}

	stack_depth = func_stack->stack_depth;
	if (stack_depth > FUNC_MAX_STACK_DEPTH)
		return NULL;

	if (entry) {
		if (bpf_core_enum_value_exists(enum bpf_func_id,
					       BPF_FUNC_get_func_ip))
			ip = bpf_get_func_ip(ctx);
		else
			ip = KSNOOP_IP_FIX(PT_REGS_IP_CORE(ctx));
		if (stack_depth >= FUNC_MAX_STACK_DEPTH - 1)
			return NULL;
		/* verifier doesn't like using "stack_depth - 1" as array index
		 * directly.
		 */
		last_stack_depth = stack_depth - 1;
		/* get address of last function we called */
		if (last_stack_depth >= 0 &&
		    last_stack_depth < FUNC_MAX_STACK_DEPTH)
			last_ip = func_stack->ips[last_stack_depth];
		/* push ip onto stack. return will pop it. */
		func_stack->ips[stack_depth] = ip;
		/* mask used in case bounds checks are optimized out */
		stack_depth = (stack_depth + 1) & FUNC_STACK_DEPTH_MASK;
		func_stack->stack_depth = stack_depth;
		/* rather than zero stack entries on popping, we zero the
		 * (stack_depth + 1)'th entry when pushing the current
		 * entry.  The reason we take this approach is that
		 * when tracking the set of functions we returned from,
		 * we want the history of functions we returned from to
		 * be preserved.
		 */
		if (stack_depth < FUNC_MAX_STACK_DEPTH)
			func_stack->ips[stack_depth] = 0;
	} else {
		if (stack_depth == 0 || stack_depth >= FUNC_MAX_STACK_DEPTH)
			return NULL;
		last_stack_depth = stack_depth;
		/* get address of last function we returned from */
		if (last_stack_depth >= 0 &&
		    last_stack_depth < FUNC_MAX_STACK_DEPTH)
			last_ip = func_stack->ips[last_stack_depth];
		if (stack_depth > 0) {
			/* logical OR convinces verifier that we don't
			 * end up with a < 0 value, translating to 0xff
			 * and an outside of map element access.
			 */
			stack_depth = (stack_depth - 1) & FUNC_STACK_DEPTH_MASK;
		}
		/* retrieve ip from stack as IP in pt_regs is
		 * bpf kretprobe trampoline address.
		 */
		if (stack_depth >= 0 && stack_depth < FUNC_MAX_STACK_DEPTH)
			ip = func_stack->ips[stack_depth];
		if (stack_depth >= 0 && stack_depth < FUNC_MAX_STACK_DEPTH)
			func_stack->stack_depth = stack_depth;
	}

	trace = bpf_map_lookup_elem(&ksnoop_func_map, &ip);
	if (!trace)
		return NULL;

	/* we may stash data on entry since predicates are a mix
	 * of entry/return; in such cases, trace->flags specifies
	 * KSNOOP_F_STASH, and we will output stashed data on return.
	 * If returning, make sure we don't clear our stashed data.
	 */
	if (!entry && (trace->flags & KSNOOP_F_STASH)) {
		if (!(trace->data_flags & KSNOOP_F_STASHED)) {
			/* predicate must have failed */
			return NULL;
		}
		/* skip clearing trace data */
	} else {
		/* clear trace data before starting. */
		clear_trace(trace);
	}

	if (entry) {
		/* if in stack mode, check if previous fn matches */
		if (trace->prev_ip && trace->prev_ip != last_ip)
			return NULL;
		/* if tracing intermediate fn in stack of fns, stash data. */
		if (trace->next_ip)
			trace->data_flags |= KSNOOP_F_STASH;
		/* we may stash data on entry since predicates are a mix
		 * of entry/return; in such cases, trace->flags specifies
		 * KSNOOP_F_STASH, and we will output stashed data on return.
		 */
		if (trace->flags & KSNOOP_F_STASH)
			trace->data_flags |= KSNOOP_F_STASH;
		/* otherwise the data is output immediately (we've reached
		 * the last fn in the set of fns specified).
		 */
	} else {
		/* In stack mode, check if next fn matches the last fn
		 * we returned from; i.e. "a" called "b", and now
		 * we're at "a", was the last fn we returned from "b"?
		 * If so, stash data for later display (when we reach the
		 * first fn in the set of stack fns).
		 */
		if (trace->next_ip && trace->next_ip != last_ip)
			return NULL;
		if (trace->prev_ip)
			trace->data_flags |= KSNOOP_F_STASH;
		/* If there is no "prev" function, i.e. we are at the
		 * first function in a set of stack functions, the trace
		 * info is shown (along with any stashed info associated
		 * with callers).
		 */
	}
	trace->task = task;
	return trace;
}
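
/* Worked example of the bookkeeping above (a sketch; "a", "b", "c" stand
 * for any traced functions): in stack mode with a() -> b() -> c(),
 *
 *   entry a:  last_ip = 0,          ips[0] = a, stack_depth = 1
 *   entry b:  last_ip = ips[0] = a, ips[1] = b, stack_depth = 2
 *   entry c:  last_ip = ips[1] = b, ips[2] = c, stack_depth = 3
 *   return c: stack_depth = 2, ip = ips[2] = c
 *   return b: last_ip = ips[2] = c, stack_depth = 1, ip = ips[1] = b
 *   return a: last_ip = ips[1] = b, stack_depth = 0, ip = ips[0] = a
 *
 * "a" and "b" stash entry data (they have a next_ip); reaching "c" on
 * entry outputs those stashes.  "b" and "c" stash return data (they have
 * a prev_ip); returning from "a" outputs them in top-down order.
 */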

static void output_trace(struct pt_regs *ctx, struct trace *trace)
{
	__u16 trace_len;

	if (trace->buf_len == 0)
		goto skip;

	/* we may be simply stashing values, and will report later */
	if (trace->data_flags & KSNOOP_F_STASH) {
		trace->data_flags &= ~KSNOOP_F_STASH;
		trace->data_flags |= KSNOOP_F_STASHED;
		return;
	}
	/* we may be outputting earlier stashed data */
	if (trace->data_flags & KSNOOP_F_STASHED)
		trace->data_flags &= ~KSNOOP_F_STASHED;

	/* trim perf event size to only contain data we've recorded;
	 * sizeof(*trace) includes the full MAX_TRACE_BUF buffer, so
	 * subtracting the unused tail leaves the header plus buf_len bytes.
	 */
	trace_len = sizeof(*trace) + trace->buf_len - MAX_TRACE_BUF;

	if (trace_len <= sizeof(*trace))
		bpf_perf_event_output(ctx, &ksnoop_perf_map,
				      BPF_F_CURRENT_CPU,
				      trace, trace_len);
skip:
	clear_trace(trace);
}

static void output_stashed_traces(struct pt_regs *ctx,
				  struct trace *currtrace,
				  bool entry)
{
	struct func_stack *func_stack;
	struct trace *trace = NULL;
	__u8 i;
	__u64 task = 0;

	task = bpf_get_current_task();
	func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
	if (!func_stack)
		return;

	if (entry) {
		/* iterate from bottom to top of stack, outputting stashed
		 * data we find.  This corresponds to the set of functions
		 * we called before the current function.
		 */
		for (i = 0;
		     i < func_stack->stack_depth - 1 && i < FUNC_MAX_STACK_DEPTH;
		     i++) {
			trace = bpf_map_lookup_elem(&ksnoop_func_map,
						    &func_stack->ips[i]);
			if (!trace || !(trace->data_flags & KSNOOP_F_STASHED))
				break;
			if (trace->task != task)
				return;
			output_trace(ctx, trace);
		}
	} else {
		/* iterate from top to bottom of stack, outputting stashed
		 * data we find.  This corresponds to the set of functions
		 * that returned prior to the current returning function.
		 */
		/* valid entries live at indices 0..FUNC_MAX_STACK_DEPTH - 1;
		 * starting the walk at FUNC_MAX_STACK_DEPTH would read past
		 * the end of ips[].
		 */
		for (i = FUNC_MAX_STACK_DEPTH - 1; i > 0; i--) {
			__u64 ip;

			ip = func_stack->ips[i];
			if (!ip)
				continue;
			trace = bpf_map_lookup_elem(&ksnoop_func_map, &ip);
			if (!trace || !(trace->data_flags & KSNOOP_F_STASHED))
				break;
			if (trace->task != task)
				return;
			output_trace(ctx, trace);
		}
	}
	/* finally output the current trace info */
	output_trace(ctx, currtrace);
}

static __u64 get_arg(struct pt_regs *ctx, enum arg argnum)
{
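	/* only the first five register-passed parameters are supported;
	 * PT_REGS_PARMn_CORE reads them from pt_regs in a CO-RE-relocatable
	 * way so the object works across kernels without recompiling.
	 */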
	switch (argnum) {
	case KSNOOP_ARG1:
		return PT_REGS_PARM1_CORE(ctx);
	case KSNOOP_ARG2:
		return PT_REGS_PARM2_CORE(ctx);
	case KSNOOP_ARG3:
		return PT_REGS_PARM3_CORE(ctx);
	case KSNOOP_ARG4:
		return PT_REGS_PARM4_CORE(ctx);
	case KSNOOP_ARG5:
		return PT_REGS_PARM5_CORE(ctx);
	case KSNOOP_RETURN:
		return PT_REGS_RC_CORE(ctx);
	default:
		return 0;
	}
}

static int ksnoop(struct pt_regs *ctx, bool entry)
{
	void *data_ptr = NULL;
	struct trace *trace;
	__u64 data;
	__u32 currpid;
	int ret;
	__u8 i;

	trace = get_trace(ctx, entry);
	if (!trace)
		return 0;

	/* make sure we want events from this pid; the lower 32 bits of
	 * bpf_get_current_pid_tgid() are the thread id.
	 */
	currpid = bpf_get_current_pid_tgid();
	if (trace->filter_pid && trace->filter_pid != currpid)
		return 0;
	trace->pid = currpid;

	trace->cpu = bpf_get_smp_processor_id();
	trace->time = bpf_ktime_get_ns();

	trace->data_flags &= ~(KSNOOP_F_ENTRY | KSNOOP_F_RETURN);
	if (entry)
		trace->data_flags |= KSNOOP_F_ENTRY;
	else
		trace->data_flags |= KSNOOP_F_RETURN;
	for (i = 0; i < MAX_TRACES; i++) {
		struct trace_data *currdata;
		struct value *currtrace;
		char *buf_offset = NULL;
		__u32 tracesize;

		currdata = &trace->trace_data[i];
		currtrace = &trace->traces[i];

		if ((entry && !base_arg_is_entry(currtrace->base_arg)) ||
		    (!entry && base_arg_is_entry(currtrace->base_arg)))
			continue;

		/* skip void (unused) trace arguments, ensuring not to
		 * skip "void *".
		 */
		if (currtrace->type_id == 0 &&
		    !(currtrace->flags & KSNOOP_F_PTR))
			continue;

		data = get_arg(ctx, currtrace->base_arg);

		/* look up member value and read into data field. */
		if (currtrace->flags & KSNOOP_F_MEMBER) {
			if (currtrace->offset)
				data += currtrace->offset;

			/* member is a pointer; read it in */
			if (currtrace->flags & KSNOOP_F_PTR) {
				void *dataptr = (void *)data;

				ret = bpf_probe_read_kernel(&data, sizeof(data), dataptr);
				if (ret) {
					currdata->err_type_id = currtrace->type_id;
					currdata->err = ret;
					continue;
				}
				currdata->raw_value = data;
			} else if (currtrace->size <=
				   sizeof(currdata->raw_value)) {
				/* read member value for predicate comparison */
				bpf_probe_read_kernel(&currdata->raw_value,
						      currtrace->size,
						      (void *)data);
			}
		} else {
			currdata->raw_value = data;
		}

		/* simple predicate evaluation: the predicate flags set on one
		 * value are OR'ed together, so any matching comparison passes;
		 * if none match, skip all tracing for this function.
		 */
		if (currtrace->flags & KSNOOP_F_PREDICATE_MASK) {
			bool ok = false;

			if (currtrace->flags & KSNOOP_F_PREDICATE_EQ &&
			    currdata->raw_value == currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_NOTEQ &&
			    currdata->raw_value != currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_GT &&
			    currdata->raw_value > currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_LT &&
			    currdata->raw_value < currtrace->predicate_value)
				ok = true;

			if (!ok) {
				clear_trace(trace);
				return 0;
			}
		}

		/* if the value is a pointer or a member we dereferenced,
		 * "data" holds a kernel address to copy from; otherwise it
		 * holds the value itself, so copy from our local copy.
		 */
		if (currtrace->flags & (KSNOOP_F_PTR | KSNOOP_F_MEMBER))
			data_ptr = (void *)data;
		else
			data_ptr = &data;

		if (trace->buf_len + MAX_TRACE_DATA >= MAX_TRACE_BUF)
			break;

		buf_offset = &trace->buf[trace->buf_len];
		if (buf_offset > &trace->buf[MAX_TRACE_BUF]) {
			currdata->err_type_id = currtrace->type_id;
			currdata->err = -ENOSPC;
			continue;
		}
		currdata->buf_offset = trace->buf_len;

		tracesize = currtrace->size;
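		/* objects larger than MAX_TRACE_DATA are truncated */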
		if (tracesize > MAX_TRACE_DATA)
			tracesize = MAX_TRACE_DATA;
		ret = bpf_probe_read_kernel(buf_offset, tracesize, data_ptr);
		if (ret < 0) {
			currdata->err_type_id = currtrace->type_id;
			currdata->err = ret;
			continue;
		} else {
			currdata->buf_len = tracesize;
			trace->buf_len += tracesize;
		}
	}

	/* show accumulated stashed traces (if any) */
	if ((entry && trace->prev_ip && !trace->next_ip) ||
	    (!entry && trace->next_ip && !trace->prev_ip))
		output_stashed_traces(ctx, trace, entry);
	else
		output_trace(ctx, trace);

	return 0;
}

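/* the function name in these section definitions is a placeholder; the
 * ksnoop userspace loader attaches each program to whichever kernel
 * functions the user asked to trace.
 */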
SEC("kprobe/foo")
int BPF_KPROBE(kprobe_entry)
{
	return ksnoop(ctx, true);
}

SEC("kretprobe/foo")
int BPF_KRETPROBE(kprobe_return)
{
	return ksnoop(ctx, false);
}

char _license[] SEC("license") = "Dual BSD/GPL";