// Provides functionality for adding frames to traces and updating trace counts
#ifndef OPTI_TRACEMGMT_H
#define OPTI_TRACEMGMT_H
#include "bpfdefs.h"
#include "errors.h"
#include "extmaps.h"
#include "frametypes.h"
#include "types.h"
// MULTI_USE_FUNC generates perf event and kprobe eBPF programs
// for a given function.
#define MULTI_USE_FUNC(func_name) \
SEC("perf_event/" #func_name) \
int perf_##func_name(struct pt_regs *ctx) \
{ \
return func_name(ctx); \
} \
\
SEC("kprobe/" #func_name) \
int kprobe_##func_name(struct pt_regs *ctx) \
{ \
return func_name(ctx); \
}
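// Usage sketch (illustrative; unwind_native here is just an example handler name): given
//
//   static int unwind_native(struct pt_regs *ctx);
//   MULTI_USE_FUNC(unwind_native)
//
// the macro emits perf_unwind_native in section "perf_event/unwind_native" and
// kprobe_unwind_native in section "kprobe/unwind_native", both of which simply forward
// to unwind_native(), so the same unwinder can be attached from either hook type.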
// increment_metric increments the value of the given metricID by 1
static inline __attribute__((__always_inline__)) void increment_metric(u32 metricID)
{
u64 *count = bpf_map_lookup_elem(&metrics, &metricID);
if (count) {
++*count;
} else {
DEBUG_PRINT("Failed to lookup metrics map for metricID %d", metricID);
}
}
// Send immediate notifications for event triggers to Go.
// Notifications for GENERIC_PID and TRACES_FOR_SYMBOLIZATION will be
// automatically inhibited until HA resets the type.
static inline void event_send_trigger(struct pt_regs *ctx, u32 event_type)
{
int inhibit_key = event_type;
bool inhibit_value = true;
// GENERIC_PID is a global notification that triggers eBPF map iteration+processing in Go.
// To avoid redundant notifications while userspace processing for them is already taking
// place, we allow latch-like inhibition, where eBPF sets it and Go has to manually reset
// it, before new notifications are triggered.
if (event_type != EVENT_TYPE_GENERIC_PID) {
return;
}
if (bpf_map_update_elem(&inhibit_events, &inhibit_key, &inhibit_value, BPF_NOEXIST) < 0) {
DEBUG_PRINT("Event type %d inhibited", event_type);
return;
}
switch (event_type) {
case EVENT_TYPE_GENERIC_PID: increment_metric(metricID_NumGenericPID); break;
default:
// no action
break;
}
Event event = {.event_type = event_type};
int ret = bpf_perf_event_output(ctx, &report_events, BPF_F_CURRENT_CPU, &event, sizeof(event));
if (ret < 0) {
DEBUG_PRINT("event_send_trigger failed to send event %d: error %d", event_type, ret);
}
}
// Forward declaration
struct bpf_perf_event_data;
// pid_information_exists checks if the given pid exists in pid_page_to_mapping_info or not.
static inline __attribute__((__always_inline__)) bool pid_information_exists(void *ctx, int pid)
{
PIDPage key = {};
key.prefixLen = BIT_WIDTH_PID + BIT_WIDTH_PAGE;
key.pid = __constant_cpu_to_be32((u32)pid);
key.page = 0;
return bpf_map_lookup_elem(&pid_page_to_mapping_info, &key) != NULL;
}
// Reset the ratelimit cache
#define RATELIMIT_ACTION_RESET 0
// Use default timer
#define RATELIMIT_ACTION_DEFAULT 1
// Set PID to fast timer mode
#define RATELIMIT_ACTION_FAST 2
// pid_event_ratelimit determines if the PID event should be inhibited or not
// based on rate limiting rules.
static inline __attribute__((__always_inline__)) bool
pid_event_ratelimit(u32 pid, int ratelimit_action)
{
const u8 default_max_attempts = 8; // 25 seconds
const u8 fast_max_attempts = 4; // 1.6 seconds
const u8 fast_timer_flag = 0x10;
u64 *token_ptr = bpf_map_lookup_elem(&reported_pids, &pid);
u64 ts = bpf_ktime_get_ns();
u8 attempt = 0;
u8 fast_timer = (ratelimit_action == RATELIMIT_ACTION_FAST) ? fast_timer_flag : 0;
if (ratelimit_action == RATELIMIT_ACTION_RESET) {
return false;
}
if (token_ptr) {
u64 token = *token_ptr;
u64 diff_ts = ts - (token & ~0x1fULL);
attempt = token & 0xf;
fast_timer |= token & fast_timer_flag;
// Calculate the limit window size. 100ms << attempt.
u64 limit_window_ts = (100 * 1000000ULL) << attempt;
if (diff_ts < limit_window_ts) {
// Minimum event interval.
DEBUG_PRINT("PID %d event limited: too fast", pid);
return true;
}
if (diff_ts < limit_window_ts + (5000 * 1000000ULL)) {
// PID event within 5 seconds, increase limit window size if possible
if (attempt < (fast_timer ? fast_max_attempts : default_max_attempts)) {
attempt++;
}
} else {
// Silence for at least 5 seconds. Reset back to zero.
attempt = 0;
}
}
// Create new token:
// 59 bits - the high bits of timestamp of last event
// 1 bit - set if the PID should be in fast timer mode
// 4 bits - number of bursts left at event time
DEBUG_PRINT("PID %d event send, attempt=%d", pid, attempt);
u64 token = (ts & ~0x1fULL) | fast_timer | attempt;
// Update the map entry. Technically this is not SMP safe, but doing
// an atomic update would require EBPF atomics. At worst we send an
// extra sync event and the likelihood for this race is very low, so
// we can live with this.
int err = bpf_map_update_elem(&reported_pids, &pid, &token, BPF_ANY);
if (err != 0) {
// Should never happen
DEBUG_PRINT("Failed to report PID %d: %d", pid, err);
increment_metric(metricID_ReportedPIDsErr);
return true;
}
return false;
}
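// Worked example of the backoff above (derived from the code, not new policy):
// the limit window is 100ms << attempt, so consecutive reports for a PID are spaced at
// 100ms, 200ms, 400ms, ... up to 100ms << 8 = 25.6s in default mode, or up to
// 100ms << 4 = 1.6s when the PID is in fast timer mode. A PID that stays quiet for the
// window plus 5 seconds starts over at attempt 0.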
// report_pid informs userspace about a PID that needs to be processed.
// Unless ratelimit_action is RATELIMIT_ACTION_RESET, the PID is first checked
// against maps/reported_pids and reporting is aborted if the PID has been
// reported too recently.
// Returns true if the PID was successfully reported to user space.
static inline __attribute__((__always_inline__)) bool
report_pid(void *ctx, int pid, int ratelimit_action)
{
u32 key = (u32)pid;
if (pid_event_ratelimit(pid, ratelimit_action)) {
return false;
}
bool value = true;
int errNo = bpf_map_update_elem(&pid_events, &key, &value, BPF_ANY);
if (errNo != 0) {
DEBUG_PRINT("Failed to update pid_events with PID %d: %d", pid, errNo);
increment_metric(metricID_PIDEventsErr);
return false;
}
if (ratelimit_action == RATELIMIT_ACTION_RESET || errNo != 0) {
bpf_map_delete_elem(&reported_pids, &key);
}
// Notify userspace that there is a PID waiting to be processed.
// At this point, the PID was successfully written to maps/pid_events,
// therefore there is no need to track success/failure of event_send_trigger
// and we can simply return success.
event_send_trigger(ctx, EVENT_TYPE_GENERIC_PID);
return true;
}
// Return the per-cpu record.
// As each per-cpu array only has 1 entry, we hard-code 0 as the key.
// The return value of get_per_cpu_record() can never be NULL and return value checks only exist
// to pass the verifier. If the implementation of get_per_cpu_record() is changed so that NULL can
// be returned, also add an error metric.
static inline PerCPURecord *get_per_cpu_record(void)
{
int key0 = 0;
return bpf_map_lookup_elem(&per_cpu_records, &key0);
}
// Return the per-cpu record initialized with pristine values for state variables.
// The return value of get_pristine_per_cpu_record() can never be NULL and return value checks
// only exist to pass the verifier. If the implementation of get_pristine_per_cpu_record() is
// changed so that NULL can be returned, also add an error metric.
static inline PerCPURecord *get_pristine_per_cpu_record()
{
PerCPURecord *record = get_per_cpu_record();
if (!record)
return record;
record->state.pc = 0;
record->state.sp = 0;
record->state.fp = 0;
#if defined(__x86_64__)
record->state.r13 = 0;
#elif defined(__aarch64__)
record->state.lr = 0;
record->state.r22 = 0;
record->state.lr_invalid = false;
#endif
record->state.return_address = false;
record->state.error_metric = -1;
record->state.unwind_error = ERR_OK;
record->perlUnwindState.stackinfo = 0;
record->perlUnwindState.cop = 0;
record->pythonUnwindState.py_frame = 0;
record->phpUnwindState.zend_execute_data = 0;
record->rubyUnwindState.stack_ptr = 0;
record->rubyUnwindState.last_stack_frame = 0;
record->unwindersDone = 0;
record->tailCalls = 0;
record->ratelimitAction = RATELIMIT_ACTION_DEFAULT;
Trace *trace = &record->trace;
trace->kernel_stack_id = -1;
trace->stack_len = 0;
trace->pid = 0;
trace->tid = 0;
trace->apm_trace_id.as_int.hi = 0;
trace->apm_trace_id.as_int.lo = 0;
trace->apm_transaction_id.as_int = 0;
return record;
}
// unwinder_is_done checks if a given unwinder program is done for the trace
// extraction round.
static inline __attribute__((__always_inline__)) bool
unwinder_is_done(const PerCPURecord *record, int unwinder)
{
return (record->unwindersDone & (1U << unwinder)) != 0;
}
// unwinder_mark_done will mask out a given unwinder program so that it will
// not be called again for the same trace. Used when interpreter unwinder has
// extracted all interpreter frames it can extract.
static inline __attribute__((__always_inline__)) void
unwinder_mark_done(PerCPURecord *record, int unwinder)
{
record->unwindersDone |= 1U << unwinder;
}
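// Example flow (a sketch of how these two helpers interact): an interpreter unwinder
// that has emitted all frames it can for this trace calls
// unwinder_mark_done(record, <its PROG_* index>); get_next_interpreter() below then
// sees unwinder_is_done() return true for that index and falls back to
// PROG_UNWIND_NATIVE instead of re-entering the interpreter unwinder.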
// unwinder_mark_nonleaf_frame marks the current frame as a non-leaf
// frame from the perspective of the user-mode stack.
// That is, frames that are making a syscall (thus the leaf for the user-mode
// stack, though not the leaf for the entire logical stack) *are*
// considered leaf frames in this sense.
//
// On both x86 and aarch64, this means we need to subtract 1 from
// the address during later processing.
//
// Additionally, on aarch64, this means that we will not trust the current value of
// `lr` to be the return address for this frame.
static inline __attribute__((__always_inline__)) void
unwinder_mark_nonleaf_frame(UnwindState *state)
{
state->return_address = true;
#if defined(__aarch64__)
state->lr_invalid = true;
#endif
}
// Push the file ID, line number and frame type into FrameList with a user-defined
// maximum stack size.
//
// NOTE: The line argument is used for a lot of different purposes, depending on
// the frame type. For example error frames use it to store the error number,
// and hotspot puts a subtype and BCI indices, amongst other things (see
// calc_line). This should probably be renamed to something like "frame type
// specific data".
static inline __attribute__((__always_inline__)) ErrorCode _push_with_max_frames(
Trace *trace, u64 file, u64 line, u8 frame_type, u8 return_address, u32 max_frames)
{
if (trace->stack_len >= max_frames) {
DEBUG_PRINT("unable to push frame: stack is full");
increment_metric(metricID_UnwindErrStackLengthExceeded);
return ERR_STACK_LENGTH_EXCEEDED;
}
#ifdef TESTING_COREDUMP
// tools/coredump uses CGO to build the eBPF code. This dispatches
// the frame information directly to a helper implemented in ebpfhelpers.go.
int __push_frame(u64, u64, u64, u8, u8);
trace->stack_len++;
return __push_frame(__cgo_ctx->id, file, line, frame_type, return_address);
#else
trace->frames[trace->stack_len++] = (Frame){
.file_id = file,
.addr_or_line = line,
.kind = frame_type,
.return_address = return_address,
};
return ERR_OK;
#endif
}
// Push the file ID, line number and frame type into FrameList
static inline __attribute__((__always_inline__)) ErrorCode
_push_with_return_address(Trace *trace, u64 file, u64 line, u8 frame_type, bool return_address)
{
return _push_with_max_frames(
trace, file, line, frame_type, return_address, MAX_NON_ERROR_FRAME_UNWINDS);
}
// Push the file ID, line number and frame type into FrameList
static inline __attribute__((__always_inline__)) ErrorCode
_push(Trace *trace, u64 file, u64 line, u8 frame_type)
{
return _push_with_max_frames(trace, file, line, frame_type, 0, MAX_NON_ERROR_FRAME_UNWINDS);
}
// Push a critical error frame.
static inline __attribute__((__always_inline__)) ErrorCode push_error(Trace *trace, ErrorCode error)
{
return _push_with_max_frames(trace, 0, error, FRAME_MARKER_ABORT, 0, MAX_FRAME_UNWINDS);
}
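// Note on the two limits used above: _push() and _push_with_return_address() cap
// regular frames at MAX_NON_ERROR_FRAME_UNWINDS, while push_error() may go up to
// MAX_FRAME_UNWINDS. Assuming MAX_NON_ERROR_FRAME_UNWINDS is the smaller of the two
// (as the names suggest), this reserves headroom so that an abort marker such as
// push_error(trace, ERR_MAX_TAIL_CALLS) can still be recorded after the regular
// frame budget is exhausted.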
// Send a trace to user-land via the `trace_events` perf event buffer.
static inline __attribute__((__always_inline__)) void send_trace(void *ctx, Trace *trace)
{
const u64 num_empty_frames = (MAX_FRAME_UNWINDS - trace->stack_len);
const u64 send_size = sizeof(Trace) - sizeof(Frame) * num_empty_frames;
if (send_size > sizeof(Trace)) {
return; // unreachable
}
bpf_perf_event_output(ctx, &trace_events, BPF_F_CURRENT_CPU, trace, send_size);
}
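// Size arithmetic example (directly from the code above): with stack_len == 10 the
// perf event payload is sizeof(Trace) minus the space of the MAX_FRAME_UNWINDS - 10
// unused Frame slots, so only the populated prefix of trace->frames crosses the
// kernel/user boundary.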
// is_kernel_address checks if the given address looks like a virtual address to kernel memory.
static bool is_kernel_address(u64 addr)
{
return addr & 0xFF00000000000000UL;
}
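// For example, a typical x86_64 kernel text address such as 0xffffffff81000000 has its
// top byte set and is treated as a kernel address, while a user-space address such as
// 0x00007f1234567000 has a zero top byte and is not.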
// resolve_unwind_mapping decodes the current PC's mapping and prepares unwinding information.
// The state text_section_id and text_section_offset are updated accordingly. The unwinding program
// index that should be used is written to the given `unwinder` pointer.
static ErrorCode resolve_unwind_mapping(PerCPURecord *record, int *unwinder)
{
UnwindState *state = &record->state;
pid_t pid = record->trace.pid;
u64 pc = state->pc;
if (is_kernel_address(pc)) {
// This should not happen as we should only be unwinding usermode stacks.
// Seeing PC point to a kernel address indicates a bad unwind.
DEBUG_PRINT("PC value %lx is a kernel address", (unsigned long)pc);
state->error_metric = metricID_UnwindNativeErrKernelAddress;
return ERR_NATIVE_UNEXPECTED_KERNEL_ADDRESS;
}
if (pc < 0x1000) {
// The kernel will always return a start address for user space memory mappings that is
// above the value defined in /proc/sys/vm/mmap_min_addr.
// As such small PC values happen regularly (e.g. from handling or extracting the
// PC value incorrectly), we track them but don't proceed with unwinding.
DEBUG_PRINT("small pc value %lx, ignoring", (unsigned long)pc);
state->error_metric = metricID_UnwindNativeSmallPC;
return ERR_NATIVE_SMALL_PC;
}
PIDPage key = {};
key.prefixLen = BIT_WIDTH_PID + BIT_WIDTH_PAGE;
key.pid = __constant_cpu_to_be32((u32)pid);
key.page = __constant_cpu_to_be64(pc);
// Check if we have the data for this virtual address
PIDPageMappingInfo *val = bpf_map_lookup_elem(&pid_page_to_mapping_info, &key);
if (!val) {
DEBUG_PRINT("Failure to look up interval memory mapping for PC 0x%lx", (unsigned long)pc);
state->error_metric = metricID_UnwindNativeErrWrongTextSection;
return ERR_NATIVE_NO_PID_PAGE_MAPPING;
}
decode_bias_and_unwind_program(val->bias_and_unwind_program, &state->text_section_bias, unwinder);
state->text_section_id = val->file_id;
state->text_section_offset = pc - state->text_section_bias;
DEBUG_PRINT(
"Text section id for PC %lx is %llx (unwinder %d)",
(unsigned long)pc,
state->text_section_id,
*unwinder);
DEBUG_PRINT(
"Text section bias is %llx, and offset is %llx",
state->text_section_bias,
state->text_section_offset);
return ERR_OK;
}
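// Illustrative numbers (hypothetical values, shown only to clarify the math): if
// decode_bias_and_unwind_program() yields text_section_bias = 0x55d2c0000000 and the
// sampled pc is 0x55d2c0001234, then text_section_offset = 0x1234, i.e. the
// module-relative address that is paired with text_section_id in later unwind lookups.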
// get_next_interpreter tries to get the next interpreter unwinder from the section id.
// If the section id happens to be within the range of a known interpreter, it
// returns the interpreter unwinder; otherwise the native unwinder.
static inline int get_next_interpreter(PerCPURecord *record)
{
UnwindState *state = &record->state;
u64 section_id = state->text_section_id;
u64 section_offset = state->text_section_offset;
// Check if the section id happens to be in the interpreter map.
OffsetRange *range = bpf_map_lookup_elem(&interpreter_offsets, &section_id);
if (range != 0) {
if ((section_offset >= range->lower_offset) && (section_offset <= range->upper_offset)) {
DEBUG_PRINT("interpreter_offsets match %d", range->program_index);
if (!unwinder_is_done(record, range->program_index)) {
increment_metric(metricID_UnwindCallInterpreter);
return range->program_index;
}
DEBUG_PRINT("interpreter unwinder done");
}
}
return PROG_UNWIND_NATIVE;
}
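// Example (hedged, with made-up numbers): if text_section_id matches the file ID of an
// interpreter binary registered in interpreter_offsets with lower_offset = 0x1000 and
// upper_offset = 0x2000, then a section_offset of 0x1800 selects range->program_index
// (the interpreter unwinder), whereas 0x3000 falls outside the range and native
// unwinding continues.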
// get_next_unwinder_after_native_frame determines the next unwinder program to run
// after a native stack frame has been unwound.
static inline __attribute__((__always_inline__)) ErrorCode
get_next_unwinder_after_native_frame(PerCPURecord *record, int *unwinder)
{
UnwindState *state = &record->state;
*unwinder = PROG_UNWIND_STOP;
if (state->pc == 0) {
DEBUG_PRINT("Stopping unwind due to unwind failure (PC == 0)");
state->error_metric = metricID_UnwindErrZeroPC;
return ERR_NATIVE_ZERO_PC;
}
DEBUG_PRINT("==== Resolve next frame unwinder: frame %d ====", record->trace.stack_len);
ErrorCode error = resolve_unwind_mapping(record, unwinder);
if (error) {
return error;
}
if (*unwinder == PROG_UNWIND_NATIVE) {
*unwinder = get_next_interpreter(record);
}
return ERR_OK;
}
// get_next_unwinder_after_interpreter determines the next unwinder program to run
// after an interpreter (non-native) frame sequence has been unwound.
static inline __attribute__((__always_inline__)) int
get_next_unwinder_after_interpreter(const PerCPURecord *record)
{
// Since interpreter-only frame decoding is no longer supported, this
// currently equals to just resuming native unwinding.
return PROG_UNWIND_NATIVE;
}
// tail_call is a wrapper around bpf_tail_call() and ensures that the number of tail calls is not
// reached while unwinding the stack.
static inline __attribute__((__always_inline__)) void tail_call(void *ctx, int next)
{
PerCPURecord *record = get_per_cpu_record();
if (!record) {
bpf_tail_call(ctx, &perf_progs, PROG_UNWIND_STOP);
// In theory bpf_tail_call() should never return. But due to instruction reordering by the
// compiler we have to place return here to bribe the verifier to accept this.
return;
}
if (record->tailCalls >= 29) {
// The maximum tail call count we need to support on older kernels is 32. At this point
// continuing to unwind the stack would risk using up the remaining tail calls, and we
// could lose the unwound stack because no tail call would be left to report it to user
// space. To avoid that, we stop unwinding here and report what we have to userspace.
next = PROG_UNWIND_STOP;
record->state.unwind_error = ERR_MAX_TAIL_CALLS;
increment_metric(metricID_MaxTailCalls);
}
record->tailCalls += 1;
bpf_tail_call(ctx, &perf_progs, next);
}
#ifndef __USER32_CS
// defined in arch/x86/include/asm/segment.h
#define GDT_ENTRY_DEFAULT_USER32_CS 4
#define GDT_ENTRY_DEFAULT_USER_DS 5
#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
#endif
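// With the GDT entries above this evaluates to __USER32_CS == 4 * 8 + 3 == 0x23 and
// __USER_DS == 5 * 8 + 3 == 0x2b, the selector values compared against regs->cs and
// regs->ss further below.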
#ifdef __aarch64__
// Strips the PAC tag from a pointer.
//
// While all pointers can contain PAC tags, we only apply this function to code pointers, because
// that's where normalization is required to make the stack delta lookups work. Note that if that
// should ever change, we'd need a different mask for the data pointers, because it might diverge
// from the mask for code pointers.
static inline u64 normalize_pac_ptr(u64 ptr)
{
// Retrieve PAC mask from the system config.
u32 key = 0;
SystemConfig *syscfg = bpf_map_lookup_elem(&system_config, &key);
if (!syscfg) {
// Unreachable: array maps are always fully initialized.
return ptr;
}
// Mask off PAC bits. Since we're always applying this to usermode pointers that should have all
// the high bits set to 0, we don't need to consider the case of having to fill up the resulting
// hole with 1s (like we'd have to for kernel ptrs).
ptr &= syscfg->inverse_pac_mask;
return ptr;
}
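// Example (with an assumed mask, purely illustrative): if the kernel reports an inverse
// PAC mask of 0x0000ffffffffffff, a signed return address like 0x001c00aabbccdde0
// normalizes to 0x000000aabbccdde0, restoring the plain user-space code pointer
// expected by the unwind lookups.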
#endif
// Initialize state from pt_regs
static inline ErrorCode
copy_state_regs(UnwindState *state, struct pt_regs *regs, bool interrupted_kernelmode)
{
#if defined(__x86_64__)
// Check if the process is running in 32-bit mode on the x86_64 system.
// This check follows the Linux kernel implementation of user_64bit_mode() in
// arch/x86/include/asm/ptrace.h.
if (regs->cs == __USER32_CS) {
return ERR_NATIVE_X64_32BIT_COMPAT_MODE;
}
state->pc = regs->ip;
state->sp = regs->sp;
state->fp = regs->bp;
state->rax = regs->ax;
state->r9 = regs->r9;
state->r11 = regs->r11;
state->r13 = regs->r13;
state->r15 = regs->r15;
// Treat syscalls as return addresses, but not IRQ handling, page faults, etc..
// https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/include/asm/syscall.h#L31-L39
// https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/entry/entry_64.S#L847
state->return_address = interrupted_kernelmode && regs->orig_ax != -1;
#elif defined(__aarch64__)
// For backwards compatibility aarch64 can run 32-bit code.
// Check if the process is running in this 32-bit compat mode.
if (regs->pstate & PSR_MODE32_BIT) {
return ERR_NATIVE_AARCH64_32BIT_COMPAT_MODE;
}
state->pc = normalize_pac_ptr(regs->pc);
state->sp = regs->sp;
state->fp = regs->regs[29];
state->lr = normalize_pac_ptr(regs->regs[30]);
state->r22 = regs->regs[22];
// Treat syscalls as return addresses, but not IRQ handling, page faults, etc..
// https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L118
// https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L206-L209
//
// Note: We do not use `unwinder_mark_nonleaf_frame` here,
// because the frame is a leaf frame from the perspective of the user stack,
// regardless of whether we are in a syscall.
state->return_address = interrupted_kernelmode && regs->syscallno != -1;
state->lr_invalid = false;
#endif
return ERR_OK;
}
#ifndef TESTING_COREDUMP
// Read the task's entry stack pt_regs. This has functionality identical to
// bpf_task_pt_regs(), which is emulated here to support older kernels.
// Once the kernel requirement is raised to 5.15, this can be replaced with
// the bpf_task_pt_regs() helper.
static inline long get_task_pt_regs(struct task_struct *task, SystemConfig *syscfg)
{
u64 stack_ptr = (u64)task + syscfg->task_stack_offset;
long stack_base;
if (bpf_probe_read_kernel(&stack_base, sizeof(stack_base), (void *)stack_ptr)) {
return 0;
}
return stack_base + syscfg->stack_ptregs_offset;
}
// Determine whether the given pt_regs are from user-mode register context.
// This also needs to detect invalid pt_regs in case we hit a kernel thread stack
// without valid user mode pt_regs, so is_kernel_address(pc) alone is not enough.
static inline bool ptregs_is_usermode(struct pt_regs *regs)
{
#if defined(__x86_64__)
// On x86_64 the user mode SS should always be __USER_DS.
if (regs->ss != __USER_DS) {
return false;
}
return true;
#elif defined(__aarch64__)
// Check if the processor state is EL0t, which Linux uses for usermode.
if ((regs->pstate & PSR_MODE_MASK) != PSR_MODE_EL0t) {
return false;
}
return true;
#else
#error add support for new architecture
#endif
}
// Extract the usermode pt_regs for the current task. Use the context-given pt_regs
// if they are usermode regs, or resolve them via struct task_struct.
//
// State registers are not touched (get_pristine_per_cpu_record already reset them)
// if something fails. has_usermode_regs is set to true if a user-mode register
// context was found: not every thread that we interrupt will actually have
// a user-mode context (e.g. kernel worker threads won't).
static inline ErrorCode
get_usermode_regs(struct pt_regs *ctx, UnwindState *state, bool *has_usermode_regs)
{
ErrorCode error;
if (!ptregs_is_usermode(ctx)) {
u32 key = 0;
SystemConfig *syscfg = bpf_map_lookup_elem(&system_config, &key);
if (!syscfg) {
// Unreachable: array maps are always fully initialized.
return ERR_UNREACHABLE;
}
// Use the current task's entry pt_regs
struct task_struct *task = (struct task_struct *)bpf_get_current_task();
long ptregs_addr = get_task_pt_regs(task, syscfg);
struct pt_regs regs;
if (!ptregs_addr || bpf_probe_read_kernel(&regs, sizeof(regs), (void *)ptregs_addr)) {
increment_metric(metricID_UnwindNativeErrReadKernelModeRegs);
return ERR_NATIVE_READ_KERNELMODE_REGS;
}
if (!ptregs_is_usermode(&regs)) {
// No usermode registers context found.
return ERR_OK;
}
error = copy_state_regs(state, &regs, true);
} else {
// User mode code interrupted, registers are available via the ebpf context.
error = copy_state_regs(state, ctx, false);
}
if (error == ERR_OK) {
DEBUG_PRINT("Read regs: pc: %llx sp: %llx fp: %llx", state->pc, state->sp, state->fp);
*has_usermode_regs = true;
}
return error;
}
#else // TESTING_COREDUMP
static inline ErrorCode
get_usermode_regs(struct pt_regs *ctx, UnwindState *state, bool *has_usermode_regs)
{
// Coredumps always provide usermode pt_regs directly.
ErrorCode error = copy_state_regs(state, ctx, false);
if (error == ERR_OK) {
*has_usermode_regs = true;
}
return error;
}
#endif // TESTING_COREDUMP
static inline int collect_trace(
struct pt_regs *ctx, TraceOrigin origin, u32 pid, u32 tid, u64 trace_timestamp, u64 off_cpu_time)
{
// The trace is reused on each call to this function so we have to reset the
// variables used to maintain state.
DEBUG_PRINT("Resetting CPU record");
PerCPURecord *record = get_pristine_per_cpu_record();
if (!record) {
return -1;
}
Trace *trace = &record->trace;
trace->origin = origin;
trace->pid = pid;
trace->tid = tid;
trace->ktime = trace_timestamp;
trace->offtime = off_cpu_time;
if (bpf_get_current_comm(&(trace->comm), sizeof(trace->comm)) < 0) {
increment_metric(metricID_ErrBPFCurrentComm);
}
// Get the kernel mode stack trace first
trace->kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID);
DEBUG_PRINT("kernel stack id = %d", trace->kernel_stack_id);
// Recursive unwind frames
int unwinder = PROG_UNWIND_STOP;
bool has_usermode_regs = false;
ErrorCode error = get_usermode_regs(ctx, &record->state, &has_usermode_regs);
if (error || !has_usermode_regs) {
goto exit;
}
if (!pid_information_exists(ctx, pid)) {
if (report_pid(ctx, pid, RATELIMIT_ACTION_DEFAULT)) {
increment_metric(metricID_NumProcNew);
}
return 0;
}
error = get_next_unwinder_after_native_frame(record, &unwinder);
exit:
record->state.unwind_error = error;
tail_call(ctx, unwinder);
DEBUG_PRINT("bpf_tail call failed for %d in native_tracer_entry", unwinder);
return -1;
}
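// Illustrative caller (a sketch only, not the entry point defined in this header;
// TRACE_SAMPLING is assumed to be a TraceOrigin constant from types.h):
//
//   SEC("perf_event/native_tracer_entry")
//   int native_tracer_entry(struct bpf_perf_event_data *ctx)
//   {
//     u64 id  = bpf_get_current_pid_tgid();
//     u32 pid = id >> 32;
//     u32 tid = (u32)id;
//     if (!pid)
//       return 0; // skip the idle task (pid 0)
//     return collect_trace(
//       (struct pt_regs *)&ctx->regs, TRACE_SAMPLING, pid, tid, bpf_ktime_get_ns(), 0);
//   }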
#endif