From e8c4f6ee8eeed8e02800bed6afb9aa22fc3476a1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Aug 2025 12:38:59 +0200 Subject: [PATCH 01/66] perf: Remove redundant condition for AUX buffer size It is already checked whether the VMA size is the same as nr_pages * PAGE_SIZE, so later checking both: aux_size == vma_size && aux_size == nr_pages * PAGE_SIZE is redundant. Remove the vma_size check as nr_pages is what is actually used in the allocation function. That prepares for splitting out the buffer allocation into separate functions, so that only nr_pages needs to be handed in. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104018.424519320@infradead.org --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 8060c2857bb2b..eea3a7d6c61d7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7043,7 +7043,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) goto aux_unlock; - if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) + if (aux_size != nr_pages * PAGE_SIZE) goto aux_unlock; /* already mapped with a different size */ From 81e026ca47b386e4213c1beff069038a3ba8bb76 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Aug 2025 12:39:00 +0200 Subject: [PATCH 02/66] perf: Split out mlock limit handling To prepare for splitting the buffer allocation out into separate functions for the ring buffer and the AUX buffer, split out mlock limit handling into a helper function, which can be called from both. No functional change intended. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104018.541975109@infradead.org --- kernel/events/core.c | 75 ++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index eea3a7d6c61d7..f6299012ed734 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6927,17 +6927,49 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) return err; } +static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra) +{ + unsigned long user_locked, user_lock_limit, locked, lock_limit; + struct user_struct *user = current_user(); + + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + /* Increase the limit linearly with more CPUs */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm); + + /* + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit + */ + if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += *user_extra; + + if (user_locked > user_lock_limit) { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ + *extra = user_locked - user_lock_limit; + *user_extra -= *extra; + } + + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra; + + return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); +} + static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; - unsigned long user_locked, user_lock_limit; struct 
user_struct *user = current_user(); + unsigned long vma_size, nr_pages; + long user_extra = 0, extra = 0; struct mutex *aux_mutex = NULL; struct perf_buffer *rb = NULL; - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra = 0, extra = 0; int ret, flags = 0; mapped_f mapped; @@ -7063,38 +7095,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } } - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm); - - /* - * sysctl_perf_event_mlock may have changed, so that - * user->locked_vm > user_lock_limit - */ - if (user_locked > user_lock_limit) - user_locked = user_lock_limit; - user_locked += user_extra; - - if (user_locked > user_lock_limit) { - /* - * charge locked_vm until it hits user_lock_limit; - * charge the rest from pinned_vm - */ - extra = user_locked - user_lock_limit; - user_extra -= extra; - } - - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; - - if ((locked > lock_limit) && perf_is_paranoid() && - !capable(CAP_IPC_LOCK)) { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { ret = -EPERM; goto unlock; } From 1ea3e3b0dadc06c5e6c1bdf5312e70ee861b1ba0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Aug 2025 12:39:01 +0200 Subject: [PATCH 03/66] perf: Split out VM accounting Similarly to the mlock limit calculation the VM accounting is required for both the ringbuffer and the AUX buffer allocations. To prepare for splitting them out into separate functions, move the accounting into a helper function. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104018.660347811@infradead.org --- kernel/events/core.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f6299012ed734..f90847101ade9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6962,10 +6962,17 @@ static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); } +static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra) +{ + struct user_struct *user = current_user(); + + atomic_long_add(user_extra, &user->locked_vm); + atomic64_add(extra, &vma->vm_mm->pinned_vm); +} + static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; - struct user_struct *user = current_user(); unsigned long vma_size, nr_pages; long user_extra = 0, extra = 0; struct mutex *aux_mutex = NULL; @@ -7136,9 +7143,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unlock: if (!ret) { - atomic_long_add(user_extra, &user->locked_vm); - atomic64_add(extra, &vma->vm_mm->pinned_vm); - + perf_mmap_account(vma, user_extra, extra); atomic_inc(&event->mmap_count); } else if (rb) { /* AUX allocation failed */ From 86a0a7c59845e7093c9c73a7115c9d86349499d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:02 +0200 Subject: [PATCH 04/66] perf: Move perf_mmap_calc_limits() into both rb and aux branches if (cond) { A; } else { B; } C; into if (cond) { A; C; } else { B; C; } Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104018.781244099@infradead.org --- kernel/events/core.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f90847101ade9..9f19c612c80f7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7054,6 +7054,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ring_buffer_attach(event, NULL); } + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(!rb && event->rb); + + if (vma->vm_flags & VM_WRITE) + flags |= RING_BUFFER_WRITABLE; + } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -7100,17 +7110,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; goto unlock; } - } - if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - ret = -EPERM; - goto unlock; - } + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + ret = -EPERM; + goto unlock; + } - WARN_ON(!rb && event->rb); + WARN_ON(!rb && event->rb); - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; + if (vma->vm_flags & VM_WRITE) + flags |= RING_BUFFER_WRITABLE; + } if (!rb) { rb = rb_alloc(nr_pages, From 3821f258686691cf12bbfc636ab22fa2b049dc86 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:03 +0200 Subject: [PATCH 05/66] perf: Merge consecutive conditionals in perf_mmap() if (cond) { A; } else { B; } if (cond) { C; } else { D; } into: if (cond) { A; C; } else { B; D; } Notably the conditions are not identical in form, but are equivalent. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104018.900078502@infradead.org --- kernel/events/core.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 9f19c612c80f7..085f36f611375 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7064,6 +7064,25 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, flags); + + if (!rb) { + ret = -ENOMEM; + goto unlock; + } + + atomic_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; + + ring_buffer_attach(event, rb); + + perf_event_update_time(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + ret = 0; } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -7120,29 +7139,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; - } - if (!rb) { - rb = rb_alloc(nr_pages, - event->attr.watermark ? 
event->attr.wakeup_watermark : 0, - event->cpu, flags); - - if (!rb) { - ret = -ENOMEM; - goto unlock; - } - - atomic_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); - rb->mmap_locked = extra; - - ring_buffer_attach(event, rb); - - perf_event_update_time(event); - perf_event_init_userpage(event); - perf_event_update_userpage(event); - ret = 0; - } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); if (!ret) { From 4118994b33bb628dd9aeb941c5af6f950f1dea90 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:04 +0200 Subject: [PATCH 06/66] perf: Move common code into both rb and aux branches if (cond) { A; } else { B; } C; into if (cond) { A; C; } else { B; C; } Notably C has a success branch and both A and B have two places for success. For A (rb case), duplicate the success case because later patches will result in them no longer being identical. For B (aux case), share using goto (cleaned up later). Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.016252852@infradead.org --- kernel/events/core.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 085f36f611375..dfe09b0332739 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7043,6 +7043,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; /* We need the rb to map pages. */ rb = event->rb; + perf_mmap_account(vma, user_extra, extra); + atomic_inc(&event->mmap_count); goto unlock; } @@ -7083,6 +7085,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) perf_event_init_userpage(event); perf_event_update_userpage(event); ret = 0; + + perf_mmap_account(vma, user_extra, extra); + atomic_inc(&event->mmap_count); } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -7127,11 +7132,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); ret = 0; - goto unlock; + goto aux_success; } if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { ret = -EPERM; + atomic_dec(&rb->mmap_count); goto unlock; } @@ -7142,20 +7148,19 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); - if (!ret) { - atomic_set(&rb->aux_mmap_count, 1); - rb->aux_mmap_locked = extra; + if (ret) { + atomic_dec(&rb->mmap_count); + goto unlock; } - } -unlock: - if (!ret) { + atomic_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; +aux_success: perf_mmap_account(vma, user_extra, extra); atomic_inc(&event->mmap_count); - } else if (rb) { - /* AUX allocation failed */ - atomic_dec(&rb->mmap_count); } + +unlock: aux_unlock: if (aux_mutex) mutex_unlock(aux_mutex); From 41b80e1d74bdef5e48ea63d186244b9f6f82a4da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:05 +0200 Subject: [PATCH 07/66] perf: Remove redundant aux_unlock label unlock and aux_unlock are now identical, remove the aux_unlock one. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.131293512@infradead.org --- kernel/events/core.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index dfe09b0332739..89fb069913d07 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7098,7 +7098,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) rb = event->rb; if (!rb) - goto aux_unlock; + goto unlock; aux_mutex = &rb->aux_mutex; mutex_lock(aux_mutex); @@ -7107,27 +7107,27 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) - goto aux_unlock; + goto unlock; if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) - goto aux_unlock; + goto unlock; /* already mapped with a different offset */ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) - goto aux_unlock; + goto unlock; if (aux_size != nr_pages * PAGE_SIZE) - goto aux_unlock; + goto unlock; /* already mapped with a different size */ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) - goto aux_unlock; + goto unlock; if (!is_power_of_2(nr_pages)) - goto aux_unlock; + goto unlock; if (!atomic_inc_not_zero(&rb->mmap_count)) - goto aux_unlock; + goto unlock; if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); @@ -7161,7 +7161,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } unlock: -aux_unlock: if (aux_mutex) mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); From b33a51564e3eb6c468979f9f08d9b4ad8451bed7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:06 +0200 Subject: [PATCH 08/66] perf: Use guard() for aux_mutex in perf_mmap() After duplicating the common code into the rb/aux branches it is possible to use a simple guard() for the aux_mutex. Making the aux branch self-contained. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.246250452@infradead.org --- kernel/events/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 89fb069913d07..236c60adde883 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6975,7 +6975,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) struct perf_event *event = file->private_data; unsigned long vma_size, nr_pages; long user_extra = 0, extra = 0; - struct mutex *aux_mutex = NULL; struct perf_buffer *rb = NULL; int ret, flags = 0; mapped_f mapped; @@ -7100,8 +7099,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!rb) goto unlock; - aux_mutex = &rb->aux_mutex; - mutex_lock(aux_mutex); + guard(mutex)(&rb->aux_mutex); aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); @@ -7161,8 +7159,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } unlock: - if (aux_mutex) - mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); if (ret) From 8558dca9fbdf825edf30b5fb74fbbbf3e6ba5dce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:07 +0200 Subject: [PATCH 09/66] perf: Reflow to get rid of aux_success label Mostly re-indent noise needed to get rid of that label.
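For reference, the guard() taken on the aux_mutex in PATCH 08/66 comes from the scope-based cleanup helpers in <linux/cleanup.h>: it acquires the lock and releases it automatically when the enclosing scope is left, so early returns need no unlock label. A minimal sketch of the pattern, independent of the perf code:

	#include <linux/cleanup.h>
	#include <linux/errno.h>
	#include <linux/mutex.h>

	/* Sketch only: scope-based locking with guard(). */
	static int update_value_locked(struct mutex *lock, int *val)
	{
		guard(mutex)(lock);	/* mutex_lock(lock); unlock runs at scope exit */

		if (*val < 0)
			return -EINVAL;	/* early return: the guard unlocks here */

		(*val)++;
		return 0;		/* normal return: the guard unlocks here too */
	}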
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.362581570@infradead.org --- kernel/events/core.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 236c60adde883..5bbea8127bb6c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7130,30 +7130,29 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); ret = 0; - goto aux_success; - } - if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - ret = -EPERM; - atomic_dec(&rb->mmap_count); - goto unlock; - } + } else { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + ret = -EPERM; + atomic_dec(&rb->mmap_count); + goto unlock; + } - WARN_ON(!rb && event->rb); + WARN_ON(!rb && event->rb); - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; + if (vma->vm_flags & VM_WRITE) + flags |= RING_BUFFER_WRITABLE; - ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, - event->attr.aux_watermark, flags); - if (ret) { - atomic_dec(&rb->mmap_count); - goto unlock; - } + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, + event->attr.aux_watermark, flags); + if (ret) { + atomic_dec(&rb->mmap_count); + goto unlock; + } - atomic_set(&rb->aux_mmap_count, 1); - rb->aux_mmap_locked = extra; -aux_success: + atomic_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; + } perf_mmap_account(vma, user_extra, extra); atomic_inc(&event->mmap_count); } From 2aee37682391332d26c01e703170e0d9358c7252 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:08 +0200 Subject: [PATCH 10/66] perf: Split out the AUX buffer allocation Move the AUX buffer allocation branch into its own function. Originally-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.494205648@infradead.org --- kernel/events/core.c | 144 +++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5bbea8127bb6c..e76afd9c17592 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6970,6 +6970,82 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long atomic64_add(extra, &vma->vm_mm->pinned_vm); } +static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + u64 aux_offset, aux_size; + struct perf_buffer *rb; + int ret, rb_flags = 0; + + rb = event->rb; + if (!rb) + return -EINVAL; + + guard(mutex)(&rb->aux_mutex); + + /* + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. 
+ */ + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); + + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + return -EINVAL; + + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + return -EINVAL; + + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + return -EINVAL; + + if (aux_size != nr_pages * PAGE_SIZE) + return -EINVAL; + + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + return -EINVAL; + + if (!is_power_of_2(nr_pages)) + return -EINVAL; + + if (!atomic_inc_not_zero(&rb->mmap_count)) + return -EINVAL; + + if (rb_has_aux(rb)) { + atomic_inc(&rb->aux_mmap_count); + + } else { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + atomic_dec(&rb->mmap_count); + return -EPERM; + } + + WARN_ON(!rb && event->rb); + + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; + + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, + event->attr.aux_watermark, rb_flags); + if (ret) { + atomic_dec(&rb->mmap_count); + return ret; + } + + atomic_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; + } + + perf_mmap_account(vma, user_extra, extra); + atomic_inc(&event->mmap_count); + + return 0; +} + static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; @@ -7088,73 +7164,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) perf_mmap_account(vma, user_extra, extra); atomic_inc(&event->mmap_count); } else { - /* - * AUX area mapping: if rb->aux_nr_pages != 0, it's already - * mapped, all subsequent mappings should have the same size - * and offset. Must be above the normal perf buffer. - */ - u64 aux_offset, aux_size; - - rb = event->rb; - if (!rb) - goto unlock; - - guard(mutex)(&rb->aux_mutex); - - aux_offset = READ_ONCE(rb->user_page->aux_offset); - aux_size = READ_ONCE(rb->user_page->aux_size); - - if (aux_offset < perf_data_size(rb) + PAGE_SIZE) - goto unlock; - - if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) - goto unlock; - - /* already mapped with a different offset */ - if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) - goto unlock; - - if (aux_size != nr_pages * PAGE_SIZE) - goto unlock; - - /* already mapped with a different size */ - if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) - goto unlock; - - if (!is_power_of_2(nr_pages)) - goto unlock; - - if (!atomic_inc_not_zero(&rb->mmap_count)) - goto unlock; - - if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); - ret = 0; - - } else { - if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - ret = -EPERM; - atomic_dec(&rb->mmap_count); - goto unlock; - } - - WARN_ON(!rb && event->rb); - - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; - - ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, - event->attr.aux_watermark, flags); - if (ret) { - atomic_dec(&rb->mmap_count); - goto unlock; - } - - atomic_set(&rb->aux_mmap_count, 1); - rb->aux_mmap_locked = extra; - } - perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + ret = perf_mmap_aux(vma, event, nr_pages); } unlock: From 191759e5ea9f6995171ed2ffcc41a2377f946a3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:09 +0200 Subject: [PATCH 11/66] perf: Make RB allocation branch self sufficient Ensure @rb usage doesn't extend out of the branch block. 
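The inc-not-zero idiom that perf_mmap_aux() relies on deserves a note: a reference is taken only while the count is still live, and it is rolled back on any later failure, so a buffer that a concurrent perf_mmap_close() is tearing down can never be resurrected. A simplified sketch of the shape, not the perf code itself (do_setup() is a hypothetical stand-in for rb_alloc_aux() and friends):

	#include <linux/atomic.h>
	#include <linux/errno.h>

	static int do_setup(void);	/* hypothetical stand-in for the real work */

	static int take_ref_and_setup(atomic_t *refcount)
	{
		int err;

		if (!atomic_inc_not_zero(refcount))
			return -EINVAL;		/* count already hit zero: object is dying */

		err = do_setup();
		if (err) {
			atomic_dec(refcount);	/* undo the reference on failure */
			return err;
		}

		return 0;			/* success: the reference is kept */
	}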
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.605285302@infradead.org --- kernel/events/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index e76afd9c17592..875c27b28e9b9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7116,8 +7116,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * multiple times. */ ret = 0; - /* We need the rb to map pages. */ - rb = event->rb; perf_mmap_account(vma, user_extra, extra); atomic_inc(&event->mmap_count); goto unlock; @@ -7136,8 +7134,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(!rb && event->rb); - if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; @@ -7190,7 +7186,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * full cleanup in this case and therefore does not invoke * vmops::close(). */ - ret = map_range(rb, vma); + ret = map_range(event->rb, vma); if (ret) perf_mmap_close(vma); From 5d299897f1e36025400ca84fd36c15925a383b03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:10 +0200 Subject: [PATCH 12/66] perf: Split out the RB allocation Move the RB buffer allocation branch into its own function. Originally-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.722214699@infradead.org --- kernel/events/core.c | 145 ++++++++++++++++++++++--------------------- 1 file changed, 73 insertions(+), 72 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 875c27b28e9b9..3a5fd2b802e4f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6970,6 +6970,75 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long atomic64_add(extra, &vma->vm_mm->pinned_vm); } +static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + struct perf_buffer *rb; + int rb_flags = 0; + + nr_pages -= 1; + + /* + * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + return -EINVAL; + + if (atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Success -- managed to mmap() the same buffer + * multiple times. + */ + perf_mmap_account(vma, user_extra, extra); + atomic_inc(&event->mmap_count); + return 0; + } + + /* + * Raced against perf_mmap_close()'s + * atomic_dec_and_mutex_lock() remove the + * event and continue as if !event->rb + */ + ring_buffer_attach(event, NULL); + } + + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) + return -EPERM; + + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; + + rb = rb_alloc(nr_pages, + event->attr.watermark ? 
event->attr.wakeup_watermark : 0, + event->cpu, rb_flags); + + if (!rb) + return -ENOMEM; + + atomic_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; + + ring_buffer_attach(event, rb); + + perf_event_update_time(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + + perf_mmap_account(vma, user_extra, extra); + atomic_inc(&event->mmap_count); + + return 0; +} + static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, unsigned long nr_pages) { @@ -7050,10 +7119,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; unsigned long vma_size, nr_pages; - long user_extra = 0, extra = 0; - struct perf_buffer *rb = NULL; - int ret, flags = 0; mapped_f mapped; + int ret; /* * Don't allow mmap() of inherited per-task counters. This would @@ -7079,8 +7146,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_size != PAGE_SIZE * nr_pages) return -EINVAL; - user_extra = nr_pages; - mutex_lock(&event->mmap_mutex); ret = -EINVAL; @@ -7094,74 +7159,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - if (vma->vm_pgoff == 0) { - nr_pages -= 1; - - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - goto unlock; - - WARN_ON_ONCE(event->ctx->parent_ctx); - - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) - goto unlock; - - if (atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Success -- managed to mmap() the same buffer - * multiple times. - */ - ret = 0; - perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); - goto unlock; - } - - /* - * Raced against perf_mmap_close()'s - * atomic_dec_and_mutex_lock() remove the - * event and continue as if !event->rb - */ - ring_buffer_attach(event, NULL); - } - - if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - ret = -EPERM; - goto unlock; - } - - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; - - rb = rb_alloc(nr_pages, - event->attr.watermark ? event->attr.wakeup_watermark : 0, - event->cpu, flags); - - if (!rb) { - ret = -ENOMEM; - goto unlock; - } - - atomic_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); - rb->mmap_locked = extra; - - ring_buffer_attach(event, rb); - - perf_event_update_time(event); - perf_event_init_userpage(event); - perf_event_update_userpage(event); - ret = 0; - - perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); - } else { + if (vma->vm_pgoff == 0) + ret = perf_mmap_rb(vma, event, nr_pages); + else ret = perf_mmap_aux(vma, event, nr_pages); - } unlock: mutex_unlock(&event->mmap_mutex); From d23a6dbc0a71741eb7b141fdc04e31360fba46ef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:11 +0200 Subject: [PATCH 13/66] perf: Use scoped_guard() for mmap_mutex in perf_mmap() Mostly just re-indent noise. 
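scoped_guard(), also from <linux/cleanup.h>, is the block-scoped variant of guard(): the lock is held only for the attached statement block and is dropped automatically on every exit path, including returns from inside the block. A minimal sketch, separate from the perf code:

	#include <linux/cleanup.h>
	#include <linux/errno.h>
	#include <linux/mutex.h>

	static int checked_update(struct mutex *lock, int *val)
	{
		scoped_guard (mutex, lock) {
			if (*val < 0)
				return -ENODEV;	/* unlocked automatically before returning */
			(*val)++;
		}
		/* the lock is already released at this point */
		return 0;
	}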
Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104019.838047976@infradead.org --- kernel/events/core.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 3a5fd2b802e4f..41941dfadfcbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7146,30 +7146,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_size != PAGE_SIZE * nr_pages) return -EINVAL; - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; + scoped_guard (mutex, &event->mmap_mutex) { + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. + */ + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; - /* - * This relies on __pmu_detach_event() taking mmap_mutex after marking - * the event REVOKED. Either we observe the state, or __pmu_detach_event() - * will detach the rb created here. - */ - if (event->state <= PERF_EVENT_STATE_REVOKED) { - ret = -ENODEV; - goto unlock; + if (vma->vm_pgoff == 0) + ret = perf_mmap_rb(vma, event, nr_pages); + else + ret = perf_mmap_aux(vma, event, nr_pages); + if (ret) + return ret; } - if (vma->vm_pgoff == 0) - ret = perf_mmap_rb(vma, event, nr_pages); - else - ret = perf_mmap_aux(vma, event, nr_pages); - -unlock: - mutex_unlock(&event->mmap_mutex); - - if (ret) - return ret; - /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. From 59741451b49ce9964a9758c19d6f7df2a1255c75 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Aug 2025 12:39:12 +0200 Subject: [PATCH 14/66] perf: Identify the 0->1 transition for event::mmap_count Needed because refcount_inc() doesn't allow the 0->1 transition. Specifically, this is the case where we've created the RB, this means there was no RB, and as such there could not have been an mmap. Additionally we hold mmap_mutex to serialize everything. This must be the first. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250812104019.956479989@infradead.org --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 41941dfadfcbc..f6211ab185036 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7034,7 +7034,7 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, perf_event_update_userpage(event); perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + atomic_set(&event->mmap_count, 1); return 0; } From 448f97fba9013ffa13f5dd82febd18836b189499 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Aug 2025 12:39:13 +0200 Subject: [PATCH 15/66] perf: Convert mmap() refcounts to refcount_t The recently fixed reference count leaks could have been detected by using refcount_t and refcount_t would have mitigated the potential overflow at least. Now that the code is properly structured, convert the mmap() related mmap_count variants over to refcount_t. No functional change intended. 
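The conversion leans on two refcount_t properties from <linux/refcount.h>: increments saturate instead of wrapping, which turns a counting bug into a warning rather than a use-after-free, and refcount_inc() WARNs on a zero count, which is why the previous patch had to mark the 0->1 transition explicitly. A small sketch of the resulting discipline:

	#include <linux/refcount.h>

	static void on_first_mmap(refcount_t *count)
	{
		refcount_set(count, 1);		/* 0 -> 1 is only legal at creation */
	}

	static bool on_repeat_mmap(refcount_t *count)
	{
		/* fails instead of resurrecting a count that already hit zero */
		return refcount_inc_not_zero(count);
	}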
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104020.071507932@infradead.org --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 40 ++++++++++++++++++------------------- kernel/events/internal.h | 4 ++-- kernel/events/ring_buffer.c | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ec9d960256839..bfbf9ea53f259 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,7 +859,7 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; - atomic_t mmap_count; + refcount_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; diff --git a/kernel/events/core.c b/kernel/events/core.c index f6211ab185036..ea357044d7806 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3968,7 +3968,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, */ static inline bool event_update_userpage(struct perf_event *event) { - if (likely(!atomic_read(&event->mmap_count))) + if (likely(!refcount_read(&event->mmap_count))) return false; perf_event_update_time(event); @@ -6704,11 +6704,11 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; mapped_f mapped = get_mapped(event, event_mapped); - atomic_inc(&event->mmap_count); - atomic_inc(&event->rb->mmap_count); + refcount_inc(&event->mmap_count); + refcount_inc(&event->rb->mmap_count); if (vma->vm_pgoff) - atomic_inc(&event->rb->aux_mmap_count); + refcount_inc(&event->rb->aux_mmap_count); if (mapped) mapped(event, vma->vm_mm); @@ -6743,7 +6743,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) * to avoid complications. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { + refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -6763,10 +6763,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&rb->aux_mutex); } - if (atomic_dec_and_test(&rb->mmap_count)) + if (refcount_dec_and_test(&rb->mmap_count)) detach_rest = true; - if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); @@ -6992,19 +6992,19 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (data_page_nr(event->rb) != nr_pages) return -EINVAL; - if (atomic_inc_not_zero(&event->rb->mmap_count)) { + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* * Success -- managed to mmap() the same buffer * multiple times. 
*/ perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + refcount_inc(&event->mmap_count); return 0; } /* * Raced against perf_mmap_close()'s - * atomic_dec_and_mutex_lock() remove the + * refcount_dec_and_mutex_lock() remove the * event and continue as if !event->rb */ ring_buffer_attach(event, NULL); @@ -7023,7 +7023,7 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (!rb) return -ENOMEM; - atomic_set(&rb->mmap_count, 1); + refcount_set(&rb->mmap_count, 1); rb->mmap_user = get_current_user(); rb->mmap_locked = extra; @@ -7034,7 +7034,7 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, perf_event_update_userpage(event); perf_mmap_account(vma, user_extra, extra); - atomic_set(&event->mmap_count, 1); + refcount_set(&event->mmap_count, 1); return 0; } @@ -7081,15 +7081,15 @@ static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, if (!is_power_of_2(nr_pages)) return -EINVAL; - if (!atomic_inc_not_zero(&rb->mmap_count)) + if (!refcount_inc_not_zero(&rb->mmap_count)) return -EINVAL; if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); + refcount_inc(&rb->aux_mmap_count); } else { if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - atomic_dec(&rb->mmap_count); + refcount_dec(&rb->mmap_count); return -EPERM; } @@ -7101,16 +7101,16 @@ static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, rb_flags); if (ret) { - atomic_dec(&rb->mmap_count); + refcount_dec(&rb->mmap_count); return ret; } - atomic_set(&rb->aux_mmap_count, 1); + refcount_set(&rb->aux_mmap_count, 1); rb->aux_mmap_locked = extra; } perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + refcount_inc(&event->mmap_count); return 0; } @@ -13254,7 +13254,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) + if (refcount_read(&event->mmap_count)) goto unlock; if (output_event) { @@ -13267,7 +13267,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto unlock; /* did we race against perf_mmap_close() */ - if (!atomic_read(&rb->mmap_count)) { + if (!refcount_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 249288d82b8dc..d9cc570830918 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,7 +35,7 @@ struct perf_buffer { spinlock_t event_lock; struct list_head event_list; - atomic_t mmap_count; + refcount_t mmap_count; unsigned long mmap_locked; struct user_struct *mmap_user; @@ -47,7 +47,7 @@ struct perf_buffer { unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; - atomic_t aux_mmap_count; + refcount_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index aa9a759e824fe..20a9050237362 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). 
*/ - if (!atomic_read(&rb->aux_mmap_count)) + if (!refcount_read(&rb->aux_mmap_count)) goto err; if (!refcount_inc_not_zero(&rb->aux_refcount)) From 7769cb177b23142b83f22abd06e492cc25157893 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:11 +0200 Subject: [PATCH 16/66] uprobes: Remove breakpoint in unapply_uprobe under mmap_write_lock Currently unapply_uprobe takes mmap_read_lock, but it might call remove_breakpoint which eventually changes user pages. Current code writes either breakpoint or original instruction, so it can go away with read lock as explained in here [1]. But with the upcoming change that writes multiple instructions on the probed address we need to ensure that any update to mm's pages is exclusive. [1] https://lore.kernel.org/all/20240710140045.GA1084@redhat.com/ Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-2-jolsa@kernel.org --- kernel/events/uprobes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd8..1cbfe3cfe5738 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -482,7 +482,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, * @opcode_vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @opcode_vaddr. * - * Called with mm->mmap_lock held for read or write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, @@ -1463,7 +1463,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - mmap_read_lock(mm); + mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1480,7 +1480,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, vma, vaddr); } - mmap_read_unlock(mm); + mmap_write_unlock(mm); return err; } From 0f07b7919d679050d354d3279faa74bdc7ce17a0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:12 +0200 Subject: [PATCH 17/66] uprobes: Rename arch_uretprobe_trampoline function We are about to add uprobe trampoline, so cleaning up the namespace. 
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-3-jolsa@kernel.org --- arch/x86/kernel/uprobes.c | 2 +- include/linux/uprobes.h | 2 +- kernel/events/uprobes.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6d383839e8393..77050e5a4680d 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -338,7 +338,7 @@ extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; -void *arch_uprobe_trampoline(unsigned long *psize) +void *arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 516217c390946..01112f27cd215 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -224,7 +224,7 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); extern void uprobe_handle_trampoline(struct pt_regs *regs); -extern void *arch_uprobe_trampoline(unsigned long *psize); +extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); #else /* !CONFIG_UPROBES */ struct uprobes_state { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1cbfe3cfe5738..dd4dd156c9567 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1726,7 +1726,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -void * __weak arch_uprobe_trampoline(unsigned long *psize) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; @@ -1758,7 +1758,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - insns = arch_uprobe_trampoline(&insns_size); + insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) From 82afdd05a16a424409682e06a53d6afcda038d30 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:13 +0200 Subject: [PATCH 18/66] uprobes: Make copy_from_page global Making copy_from_page global and adding uprobe prefix. Adding the uprobe prefix to copy_to_page as well for symmetry. 
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-4-jolsa@kernel.org --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 01112f27cd215..7447e15559b86 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -226,6 +226,7 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, extern void uprobe_handle_trampoline(struct pt_regs *regs); extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); +extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index dd4dd156c9567..f993a3422083d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); @@ -205,7 +205,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { @@ -1051,7 +1051,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); + uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; @@ -1397,7 +1397,7 @@ struct uprobe *uprobe_register(struct inode *inode, return ERR_PTR(-EINVAL); /* - * This ensures that copy_from_page(), copy_to_page() and + * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) @@ -2393,7 +2393,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ From 33d7b2beaf34a3c0f6406bc76f6e1b1755150ad9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:14 +0200 Subject: [PATCH 19/66] uprobes: Add uprobe_write function Adding uprobe_write function that does what uprobe_write_opcode did so far, but allows to pass verify callback function that checks the memory location before writing the opcode. It will be used in following changes to implement specific checking logic for instruction update. The uprobe_write_opcode now calls uprobe_write with verify_opcode as the verify callback. 
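The verify callback gives callers a hook to inspect the target location before the write goes ahead; following verify_opcode(), a callback returns a positive value to perform the write, 0 to skip it, or a negative errno on error. A hypothetical callback sketch (verify_is_breakpoint() is not a kernel symbol) that only permits overwriting an existing breakpoint:

	#include <linux/uprobes.h>

	static int verify_is_breakpoint(struct page *page, unsigned long vaddr,
					uprobe_opcode_t *insn)
	{
		uprobe_opcode_t cur;

		uprobe_copy_from_page(page, vaddr, &cur, UPROBE_SWBP_INSN_SIZE);
		return is_swbp_insn(&cur) ? 1 : 0;	/* 1: write, 0: leave as is */
	}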
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-5-jolsa@kernel.org --- include/linux/uprobes.h | 5 +++++ kernel/events/uprobes.c | 14 ++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 7447e15559b86..e13382054435d 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -187,6 +187,9 @@ struct uprobes_state { struct xol_area *xol_area; }; +typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, + uprobe_opcode_t *opcode); + extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -195,6 +198,8 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, + uprobe_opcode_t opcode, uprobe_write_verify_t verify); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f993a3422083d..838ac40e91e61 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -399,7 +399,7 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, return identical; } -static int __uprobe_write_opcode(struct vm_area_struct *vma, +static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, unsigned long opcode_vaddr, uprobe_opcode_t opcode) { @@ -487,6 +487,12 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, uprobe_opcode_t opcode) +{ + return uprobe_write(auprobe, vma, opcode_vaddr, opcode, verify_opcode); +} + +int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, uprobe_write_verify_t verify) { const unsigned long vaddr = opcode_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; @@ -509,7 +515,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really - * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() * cannot deal with PMDs yet. 
*/ if (is_register) @@ -521,7 +527,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, goto out; folio = page_folio(page); - ret = verify_opcode(page, opcode_vaddr, &opcode); + ret = verify(page, opcode_vaddr, &opcode); if (ret <= 0) { folio_put(folio); goto out; @@ -560,7 +566,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + ret = __uprobe_write(vma, &fw, folio, opcode_vaddr, opcode); folio_walk_end(&fw, vma); } From f8b7c528b4fb7018d12b6bb63bb52576cfc73697 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:15 +0200 Subject: [PATCH 20/66] uprobes: Add nbytes argument to uprobe_write Adding nbytes argument to uprobe_write and related functions as preparation for writing whole instructions in following changes. Also renaming opcode arguments to insn, which seems to fit better. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-6-jolsa@kernel.org --- include/linux/uprobes.h | 4 ++-- kernel/events/uprobes.c | 26 ++++++++++++++------------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index e13382054435d..147c4a0a1af9f 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -188,7 +188,7 @@ struct uprobes_state { }; typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, - uprobe_opcode_t *opcode); + uprobe_opcode_t *insn, int nbytes); extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -199,7 +199,7 @@ extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t opcode, uprobe_write_verify_t verify); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 838ac40e91e61..c133fd4b492d8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -208,7 +209,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); - if (is_swbp_insn(new_opcode)) { + if (is_swbp_insn(insn)) { if 
(is_swbp) /* register: already installed? */ return 0; } else { @@ -401,10 +402,10 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, - unsigned long opcode_vaddr, uprobe_opcode_t opcode) + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; - const bool is_register = !!is_swbp_insn(&opcode); + const unsigned long vaddr = insn_vaddr & PAGE_MASK; + const bool is_register = !!is_swbp_insn(insn); bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ @@ -429,7 +430,7 @@ static int __uprobe_write(struct vm_area_struct *vma, */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); - copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + copy_to_page(fw->page, insn_vaddr, insn, nbytes); /* * When unregistering, we may only zap a PTE if uffd is disabled and @@ -488,13 +489,14 @@ static int __uprobe_write(struct vm_area_struct *vma, int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, uprobe_opcode_t opcode) { - return uprobe_write(auprobe, vma, opcode_vaddr, opcode, verify_opcode); + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - const unsigned long opcode_vaddr, uprobe_opcode_t opcode, uprobe_write_verify_t verify) + const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + uprobe_write_verify_t verify) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; int ret, is_register, ref_ctr_updated = 0; @@ -504,7 +506,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, struct folio *folio; struct page *page; - is_register = is_swbp_insn(&opcode); + is_register = is_swbp_insn(insn); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) @@ -527,7 +529,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, goto out; folio = page_folio(page); - ret = verify(page, opcode_vaddr, &opcode); + ret = verify(page, insn_vaddr, insn, nbytes); if (ret <= 0) { folio_put(folio); goto out; @@ -566,7 +568,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write(vma, &fw, folio, opcode_vaddr, opcode); + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes); folio_walk_end(&fw, vma); } From ec46350fe1e2338f42ee84974c36b25afe8ba53a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:16 +0200 Subject: [PATCH 21/66] uprobes: Add is_register argument to uprobe_write and uprobe_write_opcode The uprobe_write has special path to restore the original page when we write original instruction back. This happens when uprobe_write detects that we want to write anything else but breakpoint instruction. Moving the detection away and passing it to uprobe_write as argument, so it's possible to write different instructions (other than just breakpoint and rest). 
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-7-jolsa@kernel.org --- arch/arm/probes/uprobes/core.c | 2 +- include/linux/uprobes.h | 5 +++-- kernel/events/uprobes.c | 21 +++++++++++---------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index 885e0c5e8c20d..3d96fb41d6245 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - __opcode_to_mem_arm(auprobe->bpinsn)); + __opcode_to_mem_arm(auprobe->bpinsn), true); } bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 147c4a0a1af9f..518b267564698 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -197,9 +197,10 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, + bool is_register); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c133fd4b492d8..955e5ed3e383c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -402,10 +402,10 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, - unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes) + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + bool is_register) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; - const bool is_register = !!is_swbp_insn(insn); bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ @@ -487,26 +487,27 @@ static int __uprobe_write(struct vm_area_struct *vma, * Return 0 (success) or a negative errno. 
*/ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - const unsigned long opcode_vaddr, uprobe_opcode_t opcode) + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, + bool is_register) { - return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode); + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, + verify_opcode, is_register); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, - uprobe_write_verify_t verify) + uprobe_write_verify_t verify, bool is_register) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - int ret, is_register, ref_ctr_updated = 0; + int ret, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; - is_register = is_swbp_insn(insn); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) @@ -568,7 +569,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes); + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); folio_walk_end(&fw, vma); } @@ -610,7 +611,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** @@ -626,7 +627,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - *(uprobe_opcode_t *)&auprobe->insn); + *(uprobe_opcode_t *)&auprobe->insn, false); } /* uprobe should have guaranteed positive refcount */ From 18a111256a0b4fedfe47101f084441a84d7e357a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:17 +0200 Subject: [PATCH 22/66] uprobes: Add do_ref_ctr argument to uprobe_write function Making update_ref_ctr call in uprobe_write conditional based on do_ref_ctr argument. This way we can use uprobe_write for instruction update without doing ref_ctr_offset update. 
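
For illustration, a user-space sketch of the resulting gating (the struct and function here are made up; only the condition mirrors the kernel code, which additionally tracks ref_ctr_updated across its retry loop):

#include <stdbool.h>
#include <stdio.h>

struct mock_uprobe {
	long ref_ctr_offset;	/* 0 when the probe has no USDT semaphore */
	int  ref_ctr;
};

static void write_insn(struct mock_uprobe *u, bool is_register, bool do_update_ref_ctr)
{
	if (do_update_ref_ctr && u->ref_ctr_offset)
		u->ref_ctr += is_register ? 1 : -1;
	/* ... the actual instruction write would happen here ... */
}

int main(void)
{
	struct mock_uprobe u = { .ref_ctr_offset = 0x1000, .ref_ctr = 0 };

	write_insn(&u, true, true);	/* breakpoint install: counter bumped */
	write_insn(&u, true, false);	/* instruction-only rewrite: untouched */
	printf("ref_ctr = %d\n", u.ref_ctr);	/* 1 */
	return 0;
}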
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu (Google) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250720112133.244369-8-jolsa@kernel.org --- include/linux/uprobes.h | 2 +- kernel/events/uprobes.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 518b267564698..5080619560d42 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -200,7 +200,7 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, bool is_register); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 955e5ed3e383c..da2b3d0deab62 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -491,12 +491,12 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, bool is_register) { return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, - verify_opcode, is_register); + verify_opcode, is_register, true /* do_update_ref_ctr */); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, - uprobe_write_verify_t verify, bool is_register) + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; @@ -537,7 +537,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, } /* We are going to replace instruction, update ref_ctr. */ - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); @@ -589,7 +589,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && ref_ctr_updated) + if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ From 91440ff4cafad4c86322a612e523f7f021a493e7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:18 +0200 Subject: [PATCH 23/66] uprobes/x86: Add mapping for optimized uprobe trampolines Adding support to add special mapping for user space trampoline with following functions: uprobe_trampoline_get - find or add uprobe_trampoline uprobe_trampoline_put - remove or destroy uprobe_trampoline The user space trampoline is exported as arch specific user space special mapping through tramp_mapping, which is initialized in following changes with new uprobe syscall. 
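
Because the trampoline is entered through an ordinary 5-byte call, it must sit within rel32 range of the probe. The check is small enough to demonstrate standalone; this user-space sketch does the same arithmetic as the is_reachable_by_call() helper added below:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * A call at vaddr reaches vtramp only if the displacement, measured from
 * the end of the 5-byte call (vaddr + 5), fits in a signed 32-bit field.
 */
static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
{
	long delta = (long)(vaddr + 5 - vtramp);

	return delta >= INT_MIN && delta <= INT_MAX;
}

int main(void)
{
	unsigned long probe = 0x400000;

	printf("%d\n", is_reachable_by_call(probe + (1UL << 30), probe)); /* 1: within 2GB */
	printf("%d\n", is_reachable_by_call(probe + (3UL << 30), probe)); /* 0: too far */
	return 0;
}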
The uprobe trampoline needs to be callable/reachable from the probed address, so while searching for available address we use is_reachable_by_call function to decide if the uprobe trampoline is callable from the probe address. All uprobe_trampoline objects are stored in uprobes_state object and are cleaned up when the process mm_struct goes down. Adding new arch hooks for that, because this change is x86_64 specific. Locking is provided by callers in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250720112133.244369-9-jolsa@kernel.org --- arch/x86/kernel/uprobes.c | 144 ++++++++++++++++++++++++++++++++++++++ include/linux/uprobes.h | 6 ++ kernel/events/uprobes.c | 10 +++ kernel/fork.c | 1 + 4 files changed, 161 insertions(+) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 77050e5a4680d..6c4dcbdd0c3c0 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -608,6 +608,150 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) *sr = utask->autask.saved_scratch_register; } } + +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static struct page *tramp_mapping_pages[2] __ro_after_init; + +static struct vm_special_mapping tramp_mapping = { + .name = "[uprobes-trampoline]", + .mremap = tramp_mremap, + .pages = tramp_mapping_pages, +}; + +struct uprobe_trampoline { + struct hlist_node node; + unsigned long vaddr; +}; + +static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr) +{ + long delta = (long)(vaddr + 5 - vtramp); + + return delta >= INT_MIN && delta <= INT_MAX; +} + +static unsigned long find_nearest_trampoline(unsigned long vaddr) +{ + struct vm_unmapped_area_info info = { + .length = PAGE_SIZE, + .align_mask = ~PAGE_MASK, + }; + unsigned long low_limit, high_limit; + unsigned long low_tramp, high_tramp; + unsigned long call_end = vaddr + 5; + + if (check_add_overflow(call_end, INT_MIN, &low_limit)) + low_limit = PAGE_SIZE; + + high_limit = call_end + INT_MAX; + + /* Search up from the caller address. */ + info.low_limit = call_end; + info.high_limit = min(high_limit, TASK_SIZE); + high_tramp = vm_unmapped_area(&info); + + /* Search down from the caller address. */ + info.low_limit = max(low_limit, PAGE_SIZE); + info.high_limit = call_end; + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + low_tramp = vm_unmapped_area(&info); + + if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp)) + return -ENOMEM; + if (IS_ERR_VALUE(high_tramp)) + return low_tramp; + if (IS_ERR_VALUE(low_tramp)) + return high_tramp; + + /* Return address that's closest to the caller address. 
*/ + if (call_end - low_tramp < high_tramp - call_end) + return low_tramp; + return high_tramp; +} + +static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) +{ + struct pt_regs *regs = task_pt_regs(current); + struct mm_struct *mm = current->mm; + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + + if (!user_64bit_mode(regs)) + return NULL; + + vaddr = find_nearest_trampoline(vaddr); + if (IS_ERR_VALUE(vaddr)) + return NULL; + + tramp = kzalloc(sizeof(*tramp), GFP_KERNEL); + if (unlikely(!tramp)) + return NULL; + + tramp->vaddr = vaddr; + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE, + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO, + &tramp_mapping); + if (IS_ERR(vma)) { + kfree(tramp); + return NULL; + } + return tramp; +} + +__maybe_unused +static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) +{ + struct uprobes_state *state = ¤t->mm->uprobes_state; + struct uprobe_trampoline *tramp = NULL; + + if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE) + return NULL; + + hlist_for_each_entry(tramp, &state->head_tramps, node) { + if (is_reachable_by_call(tramp->vaddr, vaddr)) { + *new = false; + return tramp; + } + } + + tramp = create_uprobe_trampoline(vaddr); + if (!tramp) + return NULL; + + *new = true; + hlist_add_head(&tramp->node, &state->head_tramps); + return tramp; +} + +static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp) +{ + /* + * We do not unmap and release uprobe trampoline page itself, + * because there's no easy way to make sure none of the threads + * is still inside the trampoline. + */ + hlist_del(&tramp->node); + kfree(tramp); +} + +void arch_uprobe_init_state(struct mm_struct *mm) +{ + INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps); +} + +void arch_uprobe_clear_state(struct mm_struct *mm) +{ + struct uprobes_state *state = &mm->uprobes_state; + struct uprobe_trampoline *tramp; + struct hlist_node *n; + + hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) + destroy_uprobe_trampoline(tramp); +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 5080619560d42..b40d33aae0168 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -17,6 +17,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -185,6 +186,9 @@ struct xol_area; struct uprobes_state { struct xol_area *xol_area; +#ifdef CONFIG_X86_64 + struct hlist_head head_tramps; +#endif }; typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, @@ -233,6 +237,8 @@ extern void uprobe_handle_trampoline(struct pt_regs *regs); extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); +extern void arch_uprobe_clear_state(struct mm_struct *mm); +extern void arch_uprobe_init_state(struct mm_struct *mm); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index da2b3d0deab62..2cd7a4c6f303b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1801,6 +1801,14 @@ static struct xol_area *get_xol_area(void) return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. 
*/ @@ -1812,6 +1820,8 @@ void uprobe_clear_state(struct mm_struct *mm) delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); + arch_uprobe_clear_state(mm); + if (!area) return; diff --git a/kernel/fork.c b/kernel/fork.c index af673856499dc..d827cc6c53623 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1015,6 +1015,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } From 56101b69c9190667f473b9f93f8b6d8209aaa816 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:19 +0200 Subject: [PATCH 24/66] uprobes/x86: Add uprobe syscall to speed up uprobe Adding new uprobe syscall that calls uprobe handlers for given 'breakpoint' address. The idea is that the 'breakpoint' address calls the user space trampoline which executes the uprobe syscall. The syscall handler reads the return address of the initial call to retrieve the original 'breakpoint' address. With this address we find the related uprobe object and call its consumers. Adding the arch_uprobe_trampoline_mapping function that provides uprobe trampoline mapping. This mapping is backed with one global page initialized at __init time and shared by the all the mapping instances. We do not allow to execute uprobe syscall if the caller is not from uprobe trampoline mapping. The uprobe syscall ensures the consumer (bpf program) sees registers values in the state before the trampoline was called. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250720112133.244369-10-jolsa@kernel.org --- arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/x86/kernel/uprobes.c | 139 +++++++++++++++++++++++++ include/linux/syscalls.h | 2 + include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 17 +++ kernel/sys_ni.c | 1 + 6 files changed, 161 insertions(+) diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 92cf0fe2291eb..ced2a1deecd7c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq 335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c4dcbdd0c3c0..d18e1ae599016 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -752,6 +752,145 @@ void arch_uprobe_clear_state(struct mm_struct *mm) hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) destroy_uprobe_trampoline(tramp); } + +static bool __in_uprobe_trampoline(unsigned long ip) +{ + struct vm_area_struct *vma = vma_lookup(current->mm, ip); + + return vma && vma_is_special_mapping(vma, &tramp_mapping); +} + +static bool in_uprobe_trampoline(unsigned long ip) +{ + struct mm_struct *mm = current->mm; + bool found, retry = true; + unsigned int seq; + + rcu_read_lock(); + if (mmap_lock_speculate_try_begin(mm, &seq)) { + found = __in_uprobe_trampoline(ip); + retry = mmap_lock_speculate_retry(mm, seq); + } + rcu_read_unlock(); + + if (retry) { + mmap_read_lock(mm); + found = __in_uprobe_trampoline(ip); + mmap_read_unlock(mm); + } + return found; +} + +/* + * See uprobe syscall trampoline; the call to the trampoline will push + * the 
return address on the stack, the trampoline itself then pushes + * cx, r11 and ax. + */ +struct uprobe_syscall_args { + unsigned long ax; + unsigned long r11; + unsigned long cx; + unsigned long retaddr; +}; + +SYSCALL_DEFINE0(uprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + struct uprobe_syscall_args args; + unsigned long ip, sp; + int err; + + /* Allow execution only from uprobe trampolines. */ + if (!in_uprobe_trampoline(regs->ip)) + goto sigill; + + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); + if (err) + goto sigill; + + ip = regs->ip; + + /* + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: + * - adjust ip to the probe address, call saved next instruction address + * - adjust sp to the probe's stack frame (check trampoline code) + */ + regs->ax = args.ax; + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ip = args.retaddr - 5; + regs->sp += sizeof(args); + regs->orig_ax = -1; + + sp = regs->sp; + + handle_syscall_uprobe(regs, regs->ip); + + /* + * Some of the uprobe consumers has changed sp, we can do nothing, + * just return via iret. + */ + if (regs->sp != sp) { + /* skip the trampoline call */ + if (args.retaddr - 5 == regs->ip) + regs->ip += 5; + return regs->ax; + } + + regs->sp -= sizeof(args); + + /* for the case uprobe_consumer has changed ax/r11/cx */ + args.ax = regs->ax; + args.r11 = regs->r11; + args.cx = regs->cx; + + /* keep return address unless we are instructed otherwise */ + if (args.retaddr - 5 != regs->ip) + args.retaddr = regs->ip; + + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + return 0; + +sigill: + force_sig(SIGILL); + return -1; +} + +asm ( + ".pushsection .rodata\n" + ".balign " __stringify(PAGE_SIZE) "\n" + "uprobe_trampoline_entry:\n" + "push %rcx\n" + "push %r11\n" + "push %rax\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "pop %rax\n" + "pop %r11\n" + "pop %rcx\n" + "ret\n" + ".balign " __stringify(PAGE_SIZE) "\n" + ".popsection\n" +); + +extern u8 uprobe_trampoline_entry[]; + +static int __init arch_uprobes_init(void) +{ + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); + return 0; +} + +late_initcall(arch_uprobes_init); + #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 77f45e5d44139..66c06fcdfe19e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); asmlinkage long sys_uretprobe(void); +asmlinkage long sys_uprobe(void); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b40d33aae0168..b6b077cc7d0f2 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -239,6 +239,7 @@ extern unsigned long uprobe_get_trampoline_vaddr(void); extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); extern void arch_uprobe_clear_state(struct mm_struct *mm); extern void arch_uprobe_init_state(struct mm_struct *mm); +extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c 
b/kernel/events/uprobes.c index 2cd7a4c6f303b..eb07e602b6c9a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2771,6 +2771,23 @@ static void handle_swbp(struct pt_regs *regs) rcu_read_unlock_trace(); } +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) +{ + struct uprobe *uprobe; + int is_swbp; + + guard(rcu_tasks_trace)(); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); + if (!uprobe) + return; + if (!get_utask()) + return; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + return; + handler_chain(uprobe, regs); +} + /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c6..bf5d05c635ffd 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16); COND_SYSCALL(rseq); COND_SYSCALL(uretprobe); +COND_SYSCALL(uprobe); From ba2bfc97b4629b10bd8d02b36e04f3932a04cac4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:20 +0200 Subject: [PATCH 25/66] uprobes/x86: Add support to optimize uprobes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Putting together all the previously added pieces to support optimized uprobes on top of 5-byte nop instruction. The current uprobe execution goes through following: - installs breakpoint instruction over original instruction - exception handler hit and calls related uprobe consumers - and either simulates original instruction or does out of line single step execution of it - returns to user space The optimized uprobe path does following: - checks the original instruction is 5-byte nop (plus other checks) - adds (or uses existing) user space trampoline with uprobe syscall - overwrites original instruction (5-byte nop) with call to user space trampoline - the user space trampoline executes uprobe syscall that calls related uprobe consumers - trampoline returns back to next instruction This approach won't speed up all uprobes as it's limited to using nop5 as original instruction, but we plan to use nop5 as USDT probe instruction (which currently uses single byte nop) and speed up the USDT probes. The arch_uprobe_optimize triggers the uprobe optimization and is called after first uprobe hit. I originally had it called on uprobe installation but then it clashed with elf loader, because the user space trampoline was added in a place where loader might need to put elf segments, so I decided to do it after first uprobe hit when loading is done. The uprobe is un-optimized in arch specific set_orig_insn call. The instruction overwrite is x86 arch specific and needs to go through 3 updates: (on top of nop5 instruction) - write int3 into 1st byte - write last 4 bytes of the call instruction - update the call instruction opcode And cleanup goes though similar reverse stages: - overwrite call opcode with breakpoint (int3) - write last 4 bytes of the nop5 instruction - write the nop5 first instruction byte We do not unmap and release uprobe trampoline when it's no longer needed, because there's no easy way to make sure none of the threads is still inside the trampoline. But we do not waste memory, because there's just single page for all the uprobe trampoline mappings. We do waste frame on page mapping for every 4GB by keeping the uprobe trampoline page mapped, but that seems ok. 
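
For reference, a user-space sketch of the call instruction that ends up over the nop5, assuming the standard x86-64 E8 rel32 encoding (which is what __text_gen_insn() emits for CALL_INSN_OPCODE): the displacement is measured from the end of the instruction, i.e. rel32 = tramp - (vaddr + 5).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CALL_INSN_OPCODE 0xe8
#define CALL_INSN_SIZE   5

static void gen_call(uint8_t buf[CALL_INSN_SIZE], uint64_t vaddr, uint64_t tramp)
{
	int32_t rel32 = (int32_t)(tramp - (vaddr + CALL_INSN_SIZE));

	buf[0] = CALL_INSN_OPCODE;		/* near call, relative */
	memcpy(&buf[1], &rel32, sizeof(rel32));	/* little-endian displacement */
}

int main(void)
{
	uint8_t call[CALL_INSN_SIZE];
	int i;

	gen_call(call, 0x401000, 0x500000);
	for (i = 0; i < CALL_INSN_SIZE; i++)
		printf("%02x ", call[i]);
	printf("\n");	/* e8 fb ef 0f 00 */
	return 0;
}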
We take the benefit from the fact that set_swbp and set_orig_insn are called under mmap_write_lock(mm), so we can use the current instruction as the state the uprobe is in - nop5/breakpoint/call trampoline - and decide the needed action (optimize/un-optimize) based on that. Attaching the speed up from benchs/run_bench_uprobes.sh script: current: usermode-count : 152.604 ± 0.044M/s syscall-count : 13.359 ± 0.042M/s --> uprobe-nop : 3.229 ± 0.002M/s uprobe-push : 3.086 ± 0.004M/s uprobe-ret : 1.114 ± 0.004M/s uprobe-nop5 : 1.121 ± 0.005M/s uretprobe-nop : 2.145 ± 0.002M/s uretprobe-push : 2.070 ± 0.001M/s uretprobe-ret : 0.931 ± 0.001M/s uretprobe-nop5 : 0.957 ± 0.001M/s after the change: usermode-count : 152.448 ± 0.244M/s syscall-count : 14.321 ± 0.059M/s uprobe-nop : 3.148 ± 0.007M/s uprobe-push : 2.976 ± 0.004M/s uprobe-ret : 1.068 ± 0.003M/s --> uprobe-nop5 : 7.038 ± 0.007M/s uretprobe-nop : 2.109 ± 0.004M/s uretprobe-push : 2.035 ± 0.001M/s uretprobe-ret : 0.908 ± 0.001M/s uretprobe-nop5 : 3.377 ± 0.009M/s I see bit more speed up on Intel (above) compared to AMD. The big nop5 speed up is partly due to emulating nop5 and partly due to optimization. The key speed up we do this for is the USDT switch from nop to nop5: uprobe-nop : 3.148 ± 0.007M/s uprobe-nop5 : 7.038 ± 0.007M/s Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250720112133.244369-11-jolsa@kernel.org --- arch/x86/include/asm/uprobes.h | 7 + arch/x86/kernel/uprobes.c | 283 ++++++++++++++++++++++++++++++++- include/linux/uprobes.h | 6 +- kernel/events/uprobes.c | 16 +- 4 files changed, 305 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 678fb546f0a75..1ee2e5115955c 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +enum { + ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0, + ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1, +}; + struct uprobe_xol_ops; struct arch_uprobe { @@ -45,6 +50,8 @@ struct arch_uprobe { u8 ilen; } push; }; + + unsigned long flags; }; struct arch_uprobe_task { diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index d18e1ae599016..209ce74ab93f4 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Post-execution fixups. */ @@ -702,7 +703,6 @@ static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) return tramp; } -__maybe_unused static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) { struct uprobes_state *state = ¤t->mm->uprobes_state; @@ -891,6 +891,280 @@ static int __init arch_uprobes_init(void) late_initcall(arch_uprobes_init); +enum { + EXPECT_SWBP, + EXPECT_CALL, +}; + +struct write_opcode_ctx { + unsigned long base; + int expect; +}; + +static int is_call_insn(uprobe_opcode_t *insn) +{ + return *insn == CALL_INSN_OPCODE; +} + +/* + * Verification callback used by int3_update uprobe_write calls to make sure + * the underlying instruction is as expected - either int3 or call. 
+ */ +static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, + int nbytes, void *data) +{ + struct write_opcode_ctx *ctx = data; + uprobe_opcode_t old_opcode[5]; + + uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5); + + switch (ctx->expect) { + case EXPECT_SWBP: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + case EXPECT_CALL: + if (is_call_insn(&old_opcode[0])) + return 1; + break; + } + + return -1; +} + +/* + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. + * (borrowed comment from smp_text_poke_batch_finish) + * + * The way it is done: + * - Add an INT3 trap to the address that will be patched + * - SMP sync all CPUs + * - Update all but the first byte of the patched range + * - SMP sync all CPUs + * - Replace the first byte (INT3) by the first byte of the replacing opcode + * - SMP sync all CPUs + */ +static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, char *insn, bool optimize) +{ + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + int err; + + /* + * Write int3 trap. + * + * The swbp_optimize path comes with breakpoint already installed, + * so we can skip this step for optimize == true. + */ + if (!optimize) { + ctx.expect = EXPECT_CALL; + err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + } + + smp_text_poke_sync_each_cpu(); + + /* Write all but the first byte of the patched range. */ + ctx.expect = EXPECT_SWBP; + err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + + /* + * Write first byte. + * + * The swbp_unoptimize needs to finish uprobe removal together + * with ref_ctr update, using uprobe_write with proper flags. 
+ */ + err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn, + optimize /* is_register */, !optimize /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + return 0; +} + +static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, unsigned long tramp) +{ + u8 call[5]; + + __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr, + (const void *) tramp, CALL_INSN_SIZE); + return int3_update(auprobe, vma, vaddr, call, true /* optimize */); +} + +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */); +} + +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) +{ + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; + struct vm_area_struct *vma; + struct page *page; + + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(page)) + return PTR_ERR(page); + uprobe_copy_from_page(page, vaddr, dst, len); + put_page(page); + return 0; +} + +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *call = (struct __arch_relative_insn *) insn; + + if (!is_call_insn(insn)) + return false; + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); +} + +static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized) +{ + uprobe_opcode_t insn[5]; + int err; + + err = copy_from_vaddr(mm, vaddr, &insn, 5); + if (err) + return err; + *optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr); + return 0; +} + +static bool should_optimize(struct arch_uprobe *auprobe) +{ + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); +} + +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (should_optimize(auprobe)) { + bool optimized = false; + int err; + + /* + * We could race with another thread that already optimized the probe, + * so let's not overwrite it with int3 again in this case. 
+ */ + err = is_optimized(vma->vm_mm, vaddr, &optimized); + if (err) + return err; + if (optimized) + return 0; + } + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, + true /* is_register */); +} + +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { + struct mm_struct *mm = vma->vm_mm; + bool optimized = false; + int err; + + err = is_optimized(mm, vaddr, &optimized); + if (err) + return err; + if (optimized) { + err = swbp_unoptimize(auprobe, vma, vaddr); + WARN_ON_ONCE(err); + return err; + } + } + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, + false /* is_register */); +} + +static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr) +{ + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + bool new = false; + int err = 0; + + vma = find_vma(mm, vaddr); + if (!vma) + return -EINVAL; + tramp = get_uprobe_trampoline(vaddr, &new); + if (!tramp) + return -EINVAL; + err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr); + if (WARN_ON_ONCE(err) && new) + destroy_uprobe_trampoline(tramp); + return err; +} + +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + struct mm_struct *mm = current->mm; + uprobe_opcode_t insn[5]; + + /* + * Do not optimize if shadow stack is enabled, the return address hijack + * code in arch_uretprobe_hijack_return_addr updates wrong frame when + * the entry uprobe is optimized and the shadow stack crashes the app. + */ + if (shstk_is_enabled()) + return; + + if (!should_optimize(auprobe)) + return; + + mmap_write_lock(mm); + + /* + * Check if some other thread already optimized the uprobe for us, + * if it's the case just go away silently. + */ + if (copy_from_vaddr(mm, vaddr, &insn, 5)) + goto unlock; + if (!is_swbp_insn((uprobe_opcode_t*) &insn)) + goto unlock; + + /* + * If we fail to optimize the uprobe we set the fail bit so the + * above should_optimize will fail from now on. + */ + if (__arch_uprobe_optimize(auprobe, mm, vaddr)) + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); + +unlock: + mmap_write_unlock(mm); +} + +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + if (memcmp(&auprobe->insn, x86_nops[5], 5)) + return false; + /* We can't do cross page atomic writes yet. 
*/ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -904,6 +1178,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -1270,6 +1548,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret) return ret; + if (can_optimize(auprobe, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b6b077cc7d0f2..08ef78439d0df 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -192,7 +192,7 @@ struct uprobes_state { }; typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, - uprobe_opcode_t *insn, int nbytes); + uprobe_opcode_t *insn, int nbytes, void *data); extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -204,7 +204,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, bool is_register); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); @@ -240,6 +241,7 @@ extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void * extern void arch_uprobe_clear_state(struct mm_struct *mm); extern void arch_uprobe_init_state(struct mm_struct *mm); extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eb07e602b6c9a..4a194d7c838b5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, - int nbytes) + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -491,12 +491,13 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, bool is_register) { return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, - verify_opcode, is_register, true /* do_update_ref_ctr */); + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, - uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr) 
+ uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; @@ -530,7 +531,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, goto out; folio = page_folio(page); - ret = verify(page, insn_vaddr, insn, nbytes); + ret = verify(page, insn_vaddr, insn, nbytes, data); if (ret <= 0) { folio_put(folio); goto out; @@ -2696,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2760,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; From 985e820b72e0a89746f51c072ed10caf78890244 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2025 12:28:36 +0200 Subject: [PATCH 26/66] uprobes/x86: Add struct uretprobe_syscall_args Like uprobe_syscall_args; keep things consistent. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123656.705837806@infradead.org --- arch/x86/kernel/uprobes.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 209ce74ab93f4..580989dfddc75 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -311,6 +311,12 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool #ifdef CONFIG_X86_64 +struct uretprobe_syscall_args { + unsigned long r11; + unsigned long cx; + unsigned long ax; +}; + asm ( ".pushsection .rodata\n" ".global uretprobe_trampoline_entry\n" @@ -324,8 +330,8 @@ asm ( "uretprobe_syscall_check:\n" "popq %r11\n" "popq %rcx\n" - - /* The uretprobe syscall replaces stored %rax value with final + /* + * The uretprobe syscall replaces stored %rax value with final * return address, so we don't restore %rax in here and just * call ret. */ @@ -366,7 +372,8 @@ static unsigned long trampoline_check_ip(unsigned long tramp) SYSCALL_DEFINE0(uretprobe) { struct pt_regs *regs = task_pt_regs(current); - unsigned long err, ip, sp, r11_cx_ax[3], tramp; + struct uretprobe_syscall_args args; + unsigned long err, ip, sp, tramp; /* If there's no trampoline, we are called from wrong place. 
*/ tramp = uprobe_get_trampoline_vaddr(); @@ -377,15 +384,15 @@ SYSCALL_DEFINE0(uretprobe) if (unlikely(regs->ip != trampoline_check_ip(tramp))) goto sigill; - err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); if (err) goto sigill; /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ - regs->r11 = r11_cx_ax[0]; - regs->cx = r11_cx_ax[1]; - regs->ax = r11_cx_ax[2]; - regs->sp += sizeof(r11_cx_ax); + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ax = args.ax; + regs->sp += sizeof(args); regs->orig_ax = -1; ip = regs->ip; @@ -401,21 +408,21 @@ SYSCALL_DEFINE0(uretprobe) */ if (regs->sp != sp || shstk_is_enabled()) return regs->ax; - regs->sp -= sizeof(r11_cx_ax); + regs->sp -= sizeof(args); /* for the case uprobe_consumer has changed r11/cx */ - r11_cx_ax[0] = regs->r11; - r11_cx_ax[1] = regs->cx; + args.r11 = regs->r11; + args.cx = regs->cx; /* * ax register is passed through as return value, so we can use * its space on stack for ip value and jump to it through the * trampoline's ret instruction */ - r11_cx_ax[2] = regs->ip; + args.ax = regs->ip; regs->ip = ip; - err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); if (err) goto sigill; From fd54052b60cf6e73cf918fd8653cd7a5c84b0cc3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2025 12:37:22 +0200 Subject: [PATCH 27/66] uprobes/x86: Optimize is_optimize() Make is_optimized() return a tri-state and avoid return through argument. This simplifies things a little. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123656.823296198@infradead.org --- arch/x86/kernel/uprobes.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 580989dfddc75..3b46a894c2049 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1047,7 +1047,7 @@ static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) return __in_uprobe_trampoline(vaddr + 5 + call->raddr); } -static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized) +static int is_optimized(struct mm_struct *mm, unsigned long vaddr) { uprobe_opcode_t insn[5]; int err; @@ -1055,8 +1055,7 @@ static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimiz err = copy_from_vaddr(mm, vaddr, &insn, 5); if (err) return err; - *optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr); - return 0; + return __is_optimized((uprobe_opcode_t *)&insn, vaddr); } static bool should_optimize(struct arch_uprobe *auprobe) @@ -1069,17 +1068,14 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { if (should_optimize(auprobe)) { - bool optimized = false; - int err; - /* * We could race with another thread that already optimized the probe, * so let's not overwrite it with int3 again in this case. 
*/ - err = is_optimized(vma->vm_mm, vaddr, &optimized); - if (err) - return err; - if (optimized) + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) return 0; } return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, @@ -1090,17 +1086,13 @@ int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { - struct mm_struct *mm = vma->vm_mm; - bool optimized = false; - int err; - - err = is_optimized(mm, vaddr, &optimized); - if (err) - return err; - if (optimized) { - err = swbp_unoptimize(auprobe, vma, vaddr); - WARN_ON_ONCE(err); - return err; + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) { + ret = swbp_unoptimize(auprobe, vma, vaddr); + WARN_ON_ONCE(ret); + return ret; } } return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, From 7c2bfc183b05103287cc32ad68184f7d4312c06d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2025 12:55:14 +0200 Subject: [PATCH 28/66] uprobes/x86: Accept more NOP forms Instead of only accepting the x86_64 nop5 chosen by the kernel, accept any x86_64 NOP or NOPL instruction that is 5 bytes. Notably, the x86_64 nop5 pattern is valid in 32bit apps and could get compiler generated when build for i686 (which introduced NOPL). Since the trampoline is x86_64 only, make sure to limit to x86_64 code. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123656.935559566@infradead.org --- arch/x86/kernel/uprobes.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 3b46a894c2049..d513c9761f341 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1157,10 +1157,37 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) mmap_write_unlock(mm); } -static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +static bool insn_is_nop(struct insn *insn) { - if (memcmp(&auprobe->insn, x86_nops[5], 5)) + return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; +} + +static bool insn_is_nopl(struct insn *insn) +{ + if (insn->opcode.nbytes != 2) + return false; + + if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) + return false; + + if (!insn->modrm.nbytes) + return false; + + if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) + return false; + + /* 0f 1f /0 - NOPL */ + return true; +} + +static bool can_optimize(struct insn *insn, unsigned long vaddr) +{ + if (!insn->x86_64 || insn->length != 5) return false; + + if (!insn_is_nop(insn) && !insn_is_nopl(insn)) + return false; + /* We can't do cross page atomic writes yet. 
*/ return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; } @@ -1177,7 +1204,7 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } -static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +static bool can_optimize(struct insn *insn, unsigned long vaddr) { return false; } @@ -1539,15 +1566,15 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) */ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - struct insn insn; u8 fix_ip_or_call = UPROBE_FIX_IP; + struct insn insn; int ret; ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); if (ret) return ret; - if (can_optimize(auprobe, addr)) + if (can_optimize(&insn, addr)) set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); ret = branch_setup_xol_ops(auprobe, &insn); From f349ec80865dfeaf8a33feae6b4a500db039c69c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2025 14:28:34 +0200 Subject: [PATCH 29/66] uprobes/x86: Fix uprobe syscall vs shadow stack The uprobe syscall stores and strips the trampoline stack frame from the user context, to make it appear similar to an exception at the original instruction. It then restores the trampoline stack when it can exit using sysexit. Make sure to match the regular stack manipulation with shadow stack operations such that regular and shadow stack don't get out of sync and causes trouble. This enables using the optimization when shadow stack is in use. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123657.055790090@infradead.org --- arch/x86/include/asm/shstk.h | 4 ++++ arch/x86/kernel/shstk.c | 40 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/uprobes.c | 17 ++++++++------- 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index ba6f2fe438488..6723dbf6ab829 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksignal *ksig); int restore_signal_shadow_stack(void); int shstk_update_last_frame(unsigned long val); bool shstk_is_enabled(void); +int shstk_pop(u64 *val); +int shstk_push(u64 val); #else static inline long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { return -EINVAL; } @@ -35,6 +37,8 @@ static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; } static inline int restore_signal_shadow_stack(void) { return 0; } static inline int shstk_update_last_frame(unsigned long val) { return 0; } static inline bool shstk_is_enabled(void) { return false; } +static inline int shstk_pop(u64 *val) { return -ENOTSUPP; } +static inline int shstk_push(u64 val) { return -ENOTSUPP; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #endif /* __ASSEMBLER__ */ diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 2ddf23387c7ef..2dd91a3053d0f 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr(void) return ssp; } +int shstk_pop(u64 *val) +{ + int ret = 0; + u64 ssp; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + if (val && get_user(*val, (__user u64 *)ssp)) + ret = -EFAULT; + else + wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE); + fpregs_unlock(); + + return ret; +} + +int shstk_push(u64 val) +{ + u64 ssp; 
+ int ret; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + ssp -= SS_FRAME_SIZE; + ret = write_user_shstk_64((__user void *)ssp, val); + if (!ret) + wrmsrq(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return ret; +} + #define SHSTK_DATA_BIT BIT(63) static int put_shstk_data(u64 __user *addr, u64 data) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index d513c9761f341..ab6547bf2fa5d 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -804,7 +804,7 @@ SYSCALL_DEFINE0(uprobe) { struct pt_regs *regs = task_pt_regs(current); struct uprobe_syscall_args args; - unsigned long ip, sp; + unsigned long ip, sp, sret; int err; /* Allow execution only from uprobe trampolines. */ @@ -831,6 +831,10 @@ SYSCALL_DEFINE0(uprobe) sp = regs->sp; + err = shstk_pop((u64 *)&sret); + if (err == -EFAULT || (!err && sret != args.retaddr)) + goto sigill; + handle_syscall_uprobe(regs, regs->ip); /* @@ -855,6 +859,9 @@ SYSCALL_DEFINE0(uprobe) if (args.retaddr - 5 != regs->ip) args.retaddr = regs->ip; + if (shstk_push(args.retaddr) == -EFAULT) + goto sigill; + regs->ip = ip; err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); @@ -1124,14 +1131,6 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) struct mm_struct *mm = current->mm; uprobe_opcode_t insn[5]; - /* - * Do not optimize if shadow stack is enabled, the return address hijack - * code in arch_uretprobe_hijack_return_addr updates wrong frame when - * the entry uprobe is optimized and the shadow stack crashes the app. - */ - if (shstk_is_enabled()) - return; - if (!should_optimize(auprobe)) return; From 60ed85b7e46983725954103beaca5ca41816cd58 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Aug 2025 13:48:48 +0200 Subject: [PATCH 30/66] uprobes/x86: Make asm style consistent The asm syntax in uretprobe_trampoline and uprobe_trampoline differs in the use of operand size suffixes. Make them consistent and remove all size suffixes. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123657.163417243@infradead.org --- arch/x86/kernel/uprobes.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index ab6547bf2fa5d..643027e35c461 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -321,21 +321,21 @@ asm ( ".pushsection .rodata\n" ".global uretprobe_trampoline_entry\n" "uretprobe_trampoline_entry:\n" - "pushq %rax\n" - "pushq %rcx\n" - "pushq %r11\n" - "movq $" __stringify(__NR_uretprobe) ", %rax\n" + "push %rax\n" + "push %rcx\n" + "push %r11\n" + "mov $" __stringify(__NR_uretprobe) ", %rax\n" "syscall\n" ".global uretprobe_syscall_check\n" "uretprobe_syscall_check:\n" - "popq %r11\n" - "popq %rcx\n" + "pop %r11\n" + "pop %rcx\n" /* * The uretprobe syscall replaces stored %rax value with final * return address, so we don't restore %rax in here and just * call ret. 
*/ - "retq\n" + "ret\n" ".global uretprobe_trampoline_end\n" "uretprobe_trampoline_end:\n" ".popsection\n" @@ -885,7 +885,7 @@ asm ( "push %rcx\n" "push %r11\n" "push %rax\n" - "movq $" __stringify(__NR_uprobe) ", %rax\n" + "mov $" __stringify(__NR_uprobe) ", %rax\n" "syscall\n" "pop %rax\n" "pop %r11\n" From 354492a0e1bc4a408e26ebe14166bd1064182439 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2025 19:49:56 +0200 Subject: [PATCH 31/66] uprobes/x86: Add SLS mitigation to the trampolines It is trivial; no reason not to. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821123657.277506098@infradead.org --- arch/x86/kernel/uprobes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 643027e35c461..0a8c0a4a54233 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -336,6 +336,7 @@ asm ( * call ret. */ "ret\n" + "int3\n" ".global uretprobe_trampoline_end\n" "uretprobe_trampoline_end:\n" ".popsection\n" @@ -891,6 +892,7 @@ asm ( "pop %r11\n" "pop %rcx\n" "ret\n" + "int3\n" ".balign " __stringify(PAGE_SIZE) "\n" ".popsection\n" ); From 17c3b001576452fda8fd19d664a93a988b309780 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:21 +0200 Subject: [PATCH 32/66] selftests/bpf: Import usdt.h from libbpf/usdt project Importing usdt.h from libbpf/usdt project. Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-12-jolsa@kernel.org --- tools/testing/selftests/bpf/usdt.h | 545 +++++++++++++++++++++++++++++ 1 file changed, 545 insertions(+) create mode 100644 tools/testing/selftests/bpf/usdt.h diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h new file mode 100644 index 0000000000000..549d1f7748101 --- /dev/null +++ b/tools/testing/selftests/bpf/usdt.h @@ -0,0 +1,545 @@ +// SPDX-License-Identifier: BSD-2-Clause +/* + * This single-header library defines a collection of variadic macros for + * defining and triggering USDTs (User Statically-Defined Tracepoints): + * + * - For USDTs without associated semaphore: + * USDT(group, name, args...) + * + * - For USDTs with implicit (transparent to the user) semaphore: + * USDT_WITH_SEMA(group, name, args...) + * USDT_IS_ACTIVE(group, name) + * + * - For USDTs with explicit (user-defined and provided) semaphore: + * USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...) + * USDT_SEMA_IS_ACTIVE(sema) + * + * all of which emit a NOP instruction into the instruction stream, and so + * have *zero* overhead for the surrounding code. USDTs are identified by + * a combination of `group` and `name` identifiers, which is used by external + * tracing tooling (tracers) for identifying exact USDTs of interest. + * + * USDTs can have an associated (2-byte) activity counter (USDT semaphore), + * automatically maintained by Linux kernel whenever any correctly written + * BPF-based tracer is attached to the USDT. This USDT semaphore can be used + * to check whether there is a need to do any extra data collection and + * processing for a given USDT (if necessary), and otherwise avoid extra work + * for a common case of USDT not being traced ("active"). + * + * See documentation for USDT_WITH_SEMA()/USDT_IS_ACTIVE() or + * USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on + * working with USDTs with implicitly or explicitly associated + * USDT semaphores, respectively. 
+ *
+ * There is also some additional data recorded into an auxiliary note
+ * section. The data in the note section describes the operands, in terms of
+ * size and location, used by tracing tooling to know where to find USDT
+ * arguments. Each location is encoded as an assembler operand string.
+ * Tracing tools (bpftrace and BPF-based tracers, systemtap, etc) insert
+ * breakpoints on top of the nop, and decode the location operand-strings,
+ * like an assembler, to find the values being passed.
+ *
+ * The operand strings are selected by the compiler for each operand.
+ * They are constrained by inline-assembler codes. The default is:
+ *
+ * #define USDT_ARG_CONSTRAINT nor
+ *
+ * This is a good default if the operands tend to be integral and
+ * moderate in number (smaller than the number of registers). In other
+ * cases, the compiler may report "'asm' requires impossible reload" or
+ * similar. In this case, consider simplifying the macro call (fewer
+ * and simpler operands), reduce optimization, or override the default
+ * constraints string via:
+ *
+ * #define USDT_ARG_CONSTRAINT g
+ * #include <usdt.h>
+ *
+ * For some historical description of USDT v3 format (the one used by this
+ * library and generally recognized and assumed by BPF-based tracing tools)
+ * see [0]. The more formal specification can be found at [1]. Additional
+ * argument constraints information can be found at [2].
+ *
+ * Original SystemTap's sys/sdt.h implementation ([3]) was used as a base for
+ * this USDT library implementation. Current implementation differs *a lot* in
+ * terms of exposed user API and general usability, which was the main goal
+ * and focus of the reimplementation work. Nevertheless, underlying recorded
+ * USDT definitions are fully binary compatible and any USDT-based tooling
+ * should work equally well with USDTs defined by either SystemTap's or this
+ * library's USDT implementation.
+ *
+ * [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html
+ * [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
+ * [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html
+ * [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h
+ */
+#ifndef __USDT_H
+#define __USDT_H
+
+/*
+ * Changelog:
+ *
+ * 0.1.0
+ * -----
+ * - Initial release
+ */
+#define USDT_MAJOR_VERSION 0
+#define USDT_MINOR_VERSION 1
+#define USDT_PATCH_VERSION 0
+
+/* C++20 and C23 added __VA_OPT__ as a standard replacement for non-standard `##__VA_ARGS__` extension */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
+#define __usdt_va_opt 1
+#define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__
+#else
+#define __usdt_va_args(...) , ##__VA_ARGS__
+#endif
+
+/*
+ * Trigger USDT with `group`:`name` identifier and pass through `args` as its
+ * arguments. Zero arguments are acceptable as well. No USDT semaphore is
+ * associated with this USDT.
+ *
+ * Such "semaphoreless" USDTs are commonly used when there is no extra data
+ * collection or processing needed to collect and prepare USDT arguments and
+ * they are just available in the surrounding code. USDT() macro will just
+ * record their locations in CPU registers or in memory for tracing tooling to
+ * be able to access them, if necessary.
+ */
+#ifdef __usdt_va_opt
+#define USDT(group, name, ...) \
+	__usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__)
+#else
+#define USDT(group, name, ...) \
+	__usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__)
+#endif
+
+/*
+ * Trigger USDT with `group`:`name` identifier and pass through `args` as its
+ * arguments. Zero arguments are acceptable as well. The USDT also gets an
+ * implicitly-defined associated USDT semaphore, which will be "activated" by
+ * tracing tooling and can be used to check whether USDT is being actively
+ * observed.
+ *
+ * USDTs with a semaphore are commonly used when there is a need to perform
+ * additional data collection and processing to prepare USDT arguments, which
+ * otherwise might not be necessary for the rest of application logic. In such
+ * a case, the USDT semaphore can be used to avoid unnecessary extra work. If
+ * USDT is not traced (which is presumed to be a common situation), the
+ * associated USDT semaphore is "inactive", and so there is no need to waste
+ * resources to prepare USDT arguments. Use USDT_IS_ACTIVE(group, name) to
+ * check whether USDT is "active".
+ *
+ * N.B. There is an inherent (albeit short) gap between checking whether USDT
+ * is active and triggering the corresponding USDT, in which an external
+ * tracer can be attached to a USDT and activate the USDT semaphore after the
+ * activity check. If such a race occurs, tracers might miss one USDT
+ * execution. Tracers are expected to accommodate such a possibility and this
+ * is expected not to be a problem for applications and tracers.
+ *
+ * N.B. The implicit USDT semaphore defined by USDT_WITH_SEMA() is contained
+ * within a single executable or shared library and is not shared outside
+ * them. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name
+ * identifier across an executable and a shared library, it will work and
+ * won't conflict, per se, but will define independent USDT semaphores, one
+ * for each shared library/executable in which USDT_WITH_SEMA(group, name) is
+ * used. That is, if you attach to this USDT in one shared library (or
+ * executable), then only the USDT semaphore within that shared library (or
+ * executable) will be updated by the kernel, while other libraries (or the
+ * executable) will not see an activated USDT semaphore. In short, it's best
+ * to use unique USDT group:name identifiers across different shared
+ * libraries (and, equivalently, between the executable and shared library).
+ * This is an advanced consideration and is rarely (if ever) seen in practice,
+ * but just to avoid surprises this is called out here. (Static libraries
+ * become a part of the final executable, once linked by the linker, so the
+ * above considerations don't apply to them.)
+ */
+#ifdef __usdt_va_opt
+#define USDT_WITH_SEMA(group, name, ...) \
+	__usdt_probe(group, name, \
+		     __usdt_sema_implicit, __usdt_sema_name(group, name) \
+		     __VA_OPT__(,) __VA_ARGS__)
+#else
+#define USDT_WITH_SEMA(group, name, ...) \
+	__usdt_probe(group, name, \
+		     __usdt_sema_implicit, __usdt_sema_name(group, name), \
+		     ##__VA_ARGS__)
+#endif
+
+struct usdt_sema { volatile unsigned short active; };
+
+/*
+ * Check if USDT with `group`:`name` identifier is "active" (i.e., whether it
+ * is attached to by external tracing tooling and is actively observed).
+ *
+ * This macro can be used to decide whether any additional and potentially
+ * expensive data collection or processing should be done to pass extra
+ * information into the given USDT. It is assumed that USDT is triggered with
+ * the USDT_WITH_SEMA() macro, which will implicitly define the associated
+ * USDT semaphore.
+ * (If one needs more control over the USDT semaphore, see
+ * USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros below.)
+ *
+ * N.B. Such checks are necessarily racy and speculative. Between checking
+ * whether USDT is active and triggering the USDT itself, a tracer can be
+ * detached with no notification. This race should be extremely rare and the
+ * worst case should result in one-time wasted extra data collection and
+ * processing.
+ */
+#define USDT_IS_ACTIVE(group, name) ({ \
+	extern struct usdt_sema __usdt_sema_name(group, name) \
+	       __usdt_asm_name(__usdt_sema_name(group, name)); \
+	__usdt_sema_implicit(__usdt_sema_name(group, name)); \
+	__usdt_sema_name(group, name).active > 0; \
+})
+
+/*
+ * APIs for working with user-defined explicit USDT semaphores.
+ *
+ * This is a less commonly used advanced API for use cases in which the user
+ * needs explicit control over a (potentially shared across multiple USDTs)
+ * USDT semaphore instance. This can be used when there is a group of
+ * logically related USDTs that all need extra data collection and processing
+ * whenever any of a family of related USDTs are "activated" (i.e., traced).
+ * In such a case, all such related USDTs will be associated with the same
+ * shared USDT semaphore defined with USDT_DEFINE_SEMA() and the USDTs
+ * themselves will be triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking
+ * an explicit extra USDT semaphore identifier as an extra parameter.
+ */
+
+/**
+ * Underlying C global variable name for user-defined USDT semaphore with
+ * `sema` identifier. Could be useful for debugging, but normally shouldn't be
+ * used explicitly.
+ */
+#define USDT_SEMA(sema) __usdt_sema_##sema
+
+/*
+ * Define storage for user-defined USDT semaphore `sema`.
+ *
+ * Should be used only once, in a non-header source file, to let the compiler
+ * allocate space for the semaphore variable. Just like with any other global
+ * variable.
+ *
+ * This macro can be used anywhere where global variable declaration is
+ * allowed. Just like with global variable definitions, there should be only
+ * one definition of a user-defined USDT semaphore with a given `sema`
+ * identifier, otherwise the compiler or linker will complain about duplicate
+ * variable definition.
+ *
+ * For C++, it is allowed to use USDT_DEFINE_SEMA() both in global namespace
+ * and inside namespaces (including nested namespaces). Just make sure that
+ * USDT_DECLARE_SEMA() is placed within the namespace where this semaphore is
+ * referenced, or any of its parent namespaces, so the C++ language-level
+ * identifier is visible to the code that needs to reference the semaphore.
+ * At the lowest layer, USDT semaphores have global naming and visibility
+ * (they have a corresponding `__usdt_sema_<sema>` symbol, which can be linked
+ * against from C or C++ code, if necessary). To keep it simple, putting
+ * USDT_DECLARE_SEMA() declarations into global namespaces is the simplest
+ * no-brainer solution. All these aspects are irrelevant for plain C, because
+ * C doesn't have namespaces and everything is always in the global namespace.
+ *
+ * N.B. Due to USDT metadata being recorded in a non-allocatable ELF note
+ * section, it has limitations when it comes to relocations, which, in
+ * practice, means that it's not possible to correctly share USDT semaphores
+ * between the main executable and shared libraries, or even between multiple
+ * shared libraries.
+ * A USDT semaphore has to be contained to an individual shared library or
+ * executable to avoid unpleasant surprises with half-working USDT
+ * semaphores. We enforce this by marking semaphore ELF symbols as having
+ * a hidden visibility. This is quite an advanced use case and consideration
+ * and for most users this should have no consequences whatsoever.
+ */
+#define USDT_DEFINE_SEMA(sema) \
+	struct usdt_sema __usdt_sema_sec USDT_SEMA(sema) \
+	       __usdt_asm_name(USDT_SEMA(sema)) \
+	       __attribute__((visibility("hidden"))) = { 0 }
+
+/*
+ * Declare an extern reference to user-defined USDT semaphore `sema`.
+ *
+ * Refers to a variable defined in another compilation unit by
+ * USDT_DEFINE_SEMA() and allows using the same USDT semaphore across
+ * multiple compilation units (i.e., .c and .cpp files).
+ *
+ * See USDT_DEFINE_SEMA() notes above for C++ language usage peculiarities.
+ */
+#define USDT_DECLARE_SEMA(sema) \
+	extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema))
+
+/*
+ * Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it
+ * is attached to by external tracing tooling and is actively observed).
+ *
+ * This macro can be used to decide whether any additional and potentially
+ * expensive data collection or processing should be done to pass extra
+ * information into USDT(s) associated with USDT semaphore `sema`.
+ *
+ * N.B. Such checks are necessarily racy. Between checking the state of the
+ * USDT semaphore and triggering associated USDT(s), the active tracer might
+ * attach or detach. This race should be extremely rare and the worst case
+ * should result in a one-time missed USDT event or wasted extra data
+ * collection and processing. USDT-using tracers should be written with this
+ * in mind; it is not a concern of the application defining USDTs with an
+ * associated semaphore.
+ */
+#define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0)
+
+/*
+ * Invoke USDT specified by `group` and `name` identifiers and associate an
+ * explicitly user-defined semaphore `sema` with it. Pass through `args` as
+ * USDT arguments. `args` are optional and zero arguments are acceptable.
+ *
+ * The semaphore is defined with the help of the USDT_DEFINE_SEMA() macro and
+ * can be checked for activity with USDT_SEMA_IS_ACTIVE().
+ */
+#ifdef __usdt_va_opt
+#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \
+	__usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__)
+#else
+#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \
+	__usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__)
+#endif
+
+/*
+ * Adjustable implementation aspects
+ */
+#ifndef USDT_ARG_CONSTRAINT
+#if defined __powerpc__
+#define USDT_ARG_CONSTRAINT nZr
+#elif defined __arm__
+#define USDT_ARG_CONSTRAINT g
+#elif defined __loongarch__
+#define USDT_ARG_CONSTRAINT nmr
+#else
+#define USDT_ARG_CONSTRAINT nor
+#endif
+#endif /* USDT_ARG_CONSTRAINT */
+
+#ifndef USDT_NOP
+#if defined(__ia64__) || defined(__s390__) || defined(__s390x__)
+#define USDT_NOP nop 0
+#else
+#define USDT_NOP nop
+#endif
+#endif /* USDT_NOP */
+
+/*
+ * Implementation details
+ */
+/* USDT name for implicitly-defined USDT semaphore, derived from group:name */
+#define __usdt_sema_name(group, name) __usdt_sema_##group##__##name
+/* ELF section into which USDT semaphores are put */
+#define __usdt_sema_sec __attribute__((section(".probes")))
+
+#define __usdt_concat(a, b) a ## b
+#define __usdt_apply(fn, n) __usdt_concat(fn, n)
+
+#ifndef __usdt_nth
+#define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N
+#endif
+
+#ifndef __usdt_narg
+#ifdef __usdt_va_opt
+#define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#else
+#define __usdt_narg(...) __usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#endif
+#endif /* __usdt_narg */
+
+#define __usdt_hash #
+#define __usdt_str_(x) #x
+#define __usdt_str(x) __usdt_str_(x)
+
+#ifndef __usdt_asm_name
+#define __usdt_asm_name(name) __asm__(__usdt_str(name))
+#endif
+
+#define __usdt_asm0() "\n"
+#define __usdt_asm1(x) __usdt_str(x) "\n"
+#define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__)
+#define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__)
+#define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__)
+#define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__)
+#define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__)
+#define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__)
+#define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__)
+#define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__)
+#define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__)
+#define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__)
+#define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__)
+#define __usdt_asm(...)
__usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#ifdef __LP64__ +#define __usdt_asm_addr .8byte +#else +#define __usdt_asm_addr .4byte +#endif + +#define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x) +#define __usdt_asm_strz(x) __usdt_asm_strz_(x) +#define __usdt_asm_str_(x) __usdt_asm1(.ascii #x) +#define __usdt_asm_str(x) __usdt_asm_str_(x) + +/* "semaphoreless" USDT case */ +#ifndef __usdt_sema_none +#define __usdt_sema_none(sema) +#endif + +/* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */ +#ifndef __usdt_sema_implicit +#define __usdt_sema_implicit(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm1(.ifndef sema) \ + __usdt_asm3( .pushsection .probes, "aw", "progbits") \ + __usdt_asm1( .weak sema) \ + __usdt_asm1( .hidden sema) \ + __usdt_asm1( .align 2) \ + __usdt_asm1(sema:) \ + __usdt_asm1( .zero 2) \ + __usdt_asm2( .type sema, @object) \ + __usdt_asm2( .size sema, 2) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + ); +#endif + +/* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */ +#ifndef __usdt_sema_explicit +#define __usdt_sema_explicit(sema) \ + __asm__ __volatile__ ("" :: "m" (sema)); +#endif + +/* main USDT definition (nop and .note.stapsdt metadata) */ +#define __usdt_probe(group, name, sema_def, sema, ...) do { \ + sema_def(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm( 990: USDT_NOP) \ + __usdt_asm3( .pushsection .note.stapsdt, "", "note") \ + __usdt_asm1( .balign 4) \ + __usdt_asm3( .4byte 992f-991f,994f-993f,3) \ + __usdt_asm1(991: .asciz "stapsdt") \ + __usdt_asm1(992: .balign 4) \ + __usdt_asm1(993: __usdt_asm_addr 990b) \ + __usdt_asm1( __usdt_asm_addr _.stapsdt.base) \ + __usdt_asm1( __usdt_asm_addr sema) \ + __usdt_asm_strz(group) \ + __usdt_asm_strz(name) \ + __usdt_asm_args(__VA_ARGS__) \ + __usdt_asm1( .ascii "\0") \ + __usdt_asm1(994: .balign 4) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.ifndef _.stapsdt.base) \ + __usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat)\ + __usdt_asm1( .weak _.stapsdt.base) \ + __usdt_asm1( .hidden _.stapsdt.base) \ + __usdt_asm1(_.stapsdt.base:) \ + __usdt_asm1( .space 1) \ + __usdt_asm2( .size _.stapsdt.base, 1) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + :: __usdt_asm_ops(__VA_ARGS__) \ + ); \ +} while (0) + +/* + * NB: gdb PR24541 highlighted an unspecified corner of the sdt.h + * operand note format. + * + * The named register may be a longer or shorter (!) alias for the + * storage where the value in question is found. For example, on + * i386, 64-bit value may be put in register pairs, and a register + * name stored would identify just one of them. Previously, gcc was + * asked to emit the %w[id] (16-bit alias of some registers holding + * operands), even when a wider 32-bit value was used. + * + * Bottom line: the byte-width given before the @ sign governs. If + * there is a mismatch between that width and that of the named + * register, then a sys/sdt.h note consumer may need to employ + * architecture-specific heuristics to figure out where the compiler + * has actually put the complete value. 
+ */ +#if defined(__powerpc__) || defined(__powerpc64__) +#define __usdt_argref(id) %I[id]%[id] +#elif defined(__i386__) +#define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */ +#else +#define __usdt_argref(id) %[id] +#endif + +#define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \ + __usdt_asm1(.ascii "@") \ + __usdt_asm_str(__usdt_argref(__usdt_aval##n)) + +#define __usdt_asm_args0 /* no arguments */ +#define __usdt_asm_args1 __usdt_asm_arg(1) +#define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2) +#define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3) +#define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4) +#define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5) +#define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6) +#define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7) +#define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8) +#define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9) +#define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10) +#define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11) +#define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12) +#define __usdt_asm_args(...) __usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__)) + +#define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5) +#define __usdt_arg_size(x) (__usdt_is_arr(x) ? sizeof(void *) : sizeof(x)) + +/* + * We can't use __builtin_choose_expr() in C++, so fall back to table-based + * signedness determination for known types, utilizing templates magic. + */ +#ifdef __cplusplus + +#define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed) + +#include + +template struct __usdt_t { static const bool is_signed = false; }; +template struct __usdt_t : public __usdt_t {}; +template struct __usdt_t : public __usdt_t {}; + +#define __usdt_def_signed(T) \ +template<> struct __usdt_t { static const bool is_signed = true; }; \ +template<> struct __usdt_t { static const bool is_signed = true; }; \ +template<> struct __usdt_t { static const bool is_signed = true; }; \ +template<> struct __usdt_t { static const bool is_signed = true; } +#define __usdt_maybe_signed(T) \ +template<> struct __usdt_t { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t { static const bool is_signed = (T)-1 < (T)1; } + +__usdt_def_signed(signed char); +__usdt_def_signed(short); +__usdt_def_signed(int); +__usdt_def_signed(long); +__usdt_def_signed(long long); +__usdt_maybe_signed(char); +__usdt_maybe_signed(wchar_t); + +#else /* !__cplusplus */ + +#define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && __builtin_classify_type(x) <= 4) +#define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U)) +#define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1) + +#endif /* __cplusplus */ + +#define __usdt_asm_op(n, x) \ + [__usdt_asz##n] "n" ((__usdt_is_signed(x) ? 
(int)-1 : 1) * (int)__usdt_arg_size(x)), \ + [__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x) + +#define __usdt_asm_ops0() [__usdt_dummy] "g" (0) +#define __usdt_asm_ops1(x) __usdt_asm_op(1, x) +#define __usdt_asm_ops2(a,x) __usdt_asm_ops1(a), __usdt_asm_op(2, x) +#define __usdt_asm_ops3(a,b,x) __usdt_asm_ops2(a,b), __usdt_asm_op(3, x) +#define __usdt_asm_ops4(a,b,c,x) __usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x) +#define __usdt_asm_ops5(a,b,c,d,x) __usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x) +#define __usdt_asm_ops6(a,b,c,d,e,x) __usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x) +#define __usdt_asm_ops7(a,b,c,d,e,f,x) __usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x) +#define __usdt_asm_ops8(a,b,c,d,e,f,g,x) __usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x) +#define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x) __usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x) +#define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x) __usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x) +#define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x) +#define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x) +#define __usdt_asm_ops(...) __usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#endif /* __USDT_H */ From 4e7005223e6dab882646d96d0e2aa84a5dd07b56 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:22 +0200 Subject: [PATCH 33/66] selftests/bpf: Reorg the uprobe_syscall test function Adding __test_uprobe_syscall with non x86_64 stub to execute all the tests, so we don't need to keep adding non x86_64 stub functions for new tests. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-13-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 34 +++++++------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index b17dc39a23dbf..a8f00aee77993 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -350,29 +350,8 @@ static void test_uretprobe_shadow_stack(void) ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } -#else -static void test_uretprobe_regs_equal(void) -{ - test__skip(); -} - -static void test_uretprobe_regs_change(void) -{ - test__skip(); -} - -static void test_uretprobe_syscall_call(void) -{ - test__skip(); -} -static void test_uretprobe_shadow_stack(void) -{ - test__skip(); -} -#endif - -void test_uprobe_syscall(void) +static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) test_uretprobe_regs_equal(); @@ -383,3 +362,14 @@ void test_uprobe_syscall(void) if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); } +#else +static void __test_uprobe_syscall(void) +{ + test__skip(); +} +#endif + +void test_uprobe_syscall(void) +{ + __test_uprobe_syscall(); +} From 7932c4cf577187dec42ddfba0aba26434cecab0c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:23 +0200 Subject: [PATCH 34/66] selftests/bpf: Rename uprobe_syscall_executed prog to test_uretprobe_multi Renaming uprobe_syscall_executed prog to test_uretprobe_multi to fit properly in the following changes that add more programs. Plus adding pid filter and increasing executed variable. 
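For reference, the pid filter relies on the standard BPF helper semantics:
the upper 32 bits of bpf_get_current_pid_tgid() hold the tgid (the
userspace notion of a process id), so the program bails out early for
foreign processes. A sketch of the pattern used in the diff below (not
new code):

	if (bpf_get_current_pid_tgid() >> 32 != pid)
		return 0;	/* some other process hit the probe */

This keeps unrelated processes that hit the same probe site from
inflating the executed counter.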
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-14-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 12 ++++++++---- .../selftests/bpf/progs/uprobe_syscall_executed.c | 8 ++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index a8f00aee77993..6d58a44da2b24 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -252,6 +252,7 @@ static void test_uretprobe_syscall_call(void) ); struct uprobe_syscall_executed *skel; int pid, status, err, go[2], c = 0; + struct bpf_link *link; if (!ASSERT_OK(pipe(go), "pipe")) return; @@ -277,11 +278,14 @@ static void test_uretprobe_syscall_call(void) _exit(0); } - skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid, - "/proc/self/exe", - "uretprobe_syscall_call", &opts); - if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi")) + skel->bss->pid = pid; + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + pid, "/proc/self/exe", + "uretprobe_syscall_call", &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) goto cleanup; + skel->links.test_uretprobe_multi = link; /* kick the child */ write(go[1], &c, 1); diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 0d7f1a7db2e2e..8f48976a33aa5 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -8,10 +8,14 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; int executed = 0; +int pid; SEC("uretprobe.multi") -int test(struct pt_regs *regs) +int test_uretprobe_multi(struct pt_regs *ctx) { - executed = 1; + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; return 0; } From d5c86c3370100620fa9c2e8dc9350c354b30ddb4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:24 +0200 Subject: [PATCH 35/66] selftests/bpf: Add uprobe/usdt syscall tests Adding tests for optimized uprobe/usdt probes. Checking that we get expected trampoline and attached bpf programs get executed properly. 
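For orientation, an optimized uprobe rewrites the 5-byte nop at the probe
site into a call into the per-process trampoline mapping, so the checks
below expect roughly this transformation (x86-64 bytes; the 32-bit
displacement is chosen at attach time and shown here as a placeholder):

	before attach:	0f 1f 44 00 00		nop5
	after attach:	e8 xx xx xx xx		call [uprobes-trampoline]

check_attach() decodes the displacement of the emitted call (opcode 0xe8)
and verifies it points at a private r-xp mapping named
[uprobes-trampoline] in /proc/self/maps.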
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-15-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 284 +++++++++++++++++- .../bpf/progs/uprobe_syscall_executed.c | 52 ++++ 2 files changed, 335 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index 6d58a44da2b24..b91135abcf8ab 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,11 @@ #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 +#include "usdt.h" + +#pragma GCC diagnostic ignored "-Wattributes" + __naked unsigned long uretprobe_regs_trigger(void) { asm volatile ( @@ -305,6 +311,265 @@ static void test_uretprobe_syscall_call(void) close(go[0]); } +#define TRAMP "[uprobes-trampoline]" + +__attribute__((aligned(16))) +__nocf_check __weak __naked void uprobe_test(void) +{ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} + +__attribute__((aligned(16))) +__nocf_check __weak void usdt_test(void) +{ + USDT(optimized_uprobe, usdt); +} + +static int find_uprobes_trampoline(void *tramp_addr) +{ + void *start, *end; + char line[128]; + int ret = -1; + FILE *maps; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + fprintf(stderr, "cannot open maps\n"); + return -1; + } + + while (fgets(line, sizeof(line), maps)) { + int m = -1; + + /* We care only about private r-x mappings. */ + if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2) + continue; + if (m < 0) + continue; + if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) { + ret = 0; + break; + } + } + + fclose(maps); + return ret; +} + +static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; + +static void *find_nop5(void *fn) +{ + int i; + + for (i = 0; i < 10; i++) { + if (!memcmp(nop5, fn + i, 5)) + return fn + i; + } + return NULL; +} + +typedef void (__attribute__((nocf_check)) *trigger_t)(void); + +static bool shstk_is_enabled; + +static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, + void *addr, int executed) +{ + struct __arch_relative_insn { + __u8 op; + __s32 raddr; + } __packed *call; + void *tramp = NULL; + __u8 *bp; + + /* Uprobe gets optimized after first trigger, so let's press twice. */ + trigger(); + trigger(); + + /* Make sure bpf program got executed.. */ + ASSERT_EQ(skel->bss->executed, executed, "executed"); + + if (shstk_is_enabled) { + /* .. and check optimization is disabled under shadow stack. */ + bp = (__u8 *) addr; + ASSERT_EQ(*bp, 0xcc, "int3"); + } else { + /* .. and check the trampoline is as expected. 
*/ + call = (struct __arch_relative_insn *) addr; + tramp = (void *) (call + 1) + call->raddr; + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + } + + return tramp; +} + +static void check_detach(void *addr, void *tramp) +{ + /* [uprobes_trampoline] stays after detach */ + ASSERT_OK(!shstk_is_enabled && find_uprobes_trampoline(tramp), "uprobes_trampoline"); + ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); +} + +static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link, + trigger_t trigger, void *addr, int executed) +{ + void *tramp; + + tramp = check_attach(skel, trigger, addr, executed); + bpf_link__destroy(link); + check_detach(addr, tramp); +} + +static void test_uprobe_legacy(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + /* uprobe */ + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe */ + skel->bss->executed = 0; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_multi(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + /* uprobe.multi */ + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe.multi */ + skel->bss->executed = 0; + opts.retprobe = true; + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_session(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .session = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session, + 0, 
"/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 4); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_usdt(void) +{ + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + void *addr; + + errno = 0; + addr = find_nop5(usdt_test); + if (!ASSERT_OK_PTR(addr, "find_nop5")) + return; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_usdt(skel->progs.test_usdt, + -1 /* all PIDs */, "/proc/self/exe", + "optimized_uprobe", "usdt", NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt")) + goto cleanup; + + check(skel, link, usdt_test, addr, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + /* * Borrowed from tools/testing/selftests/x86/test_shadow_stack.c. * @@ -347,11 +612,20 @@ static void test_uretprobe_shadow_stack(void) return; } - /* Run all of the uretprobe tests. */ + /* Run all the tests with shadow stack in place. */ + shstk_is_enabled = true; + test_uretprobe_regs_equal(); test_uretprobe_regs_change(); test_uretprobe_syscall_call(); + test_uprobe_legacy(); + test_uprobe_multi(); + test_uprobe_session(); + test_uprobe_usdt(); + + shstk_is_enabled = false; + ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } @@ -365,6 +639,14 @@ static void __test_uprobe_syscall(void) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); + if (test__start_subtest("uprobe_legacy")) + test_uprobe_legacy(); + if (test__start_subtest("uprobe_multi")) + test_uprobe_multi(); + if (test__start_subtest("uprobe_session")) + test_uprobe_session(); + if (test__start_subtest("uprobe_usdt")) + test_uprobe_usdt(); } #else static void __test_uprobe_syscall(void) diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 8f48976a33aa5..915d38591bf6a 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" #include +#include +#include #include struct pt_regs regs; @@ -10,6 +12,36 @@ char _license[] SEC("license") = "GPL"; int executed = 0; int pid; +SEC("uprobe") +int BPF_UPROBE(test_uprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(test_uretprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.multi") +int test_uprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + SEC("uretprobe.multi") int test_uretprobe_multi(struct pt_regs *ctx) { @@ -19,3 +51,23 @@ int test_uretprobe_multi(struct pt_regs *ctx) executed++; return 0; } + +SEC("uprobe.session") +int test_uprobe_session(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("usdt") +int test_usdt(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} From c8be59667cf17f281adc9a9387d7a0de60268fcd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:25 +0200 Subject: [PATCH 36/66] selftests/bpf: Add 
hit/attach/detach race optimized uprobe test Adding test that makes sure parallel execution of the uprobe and attach/detach of optimized uprobe on it works properly. By default the test runs for 500ms, which is adjustable by using BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC env variable. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-16-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index b91135abcf8ab..3d27c8bc019e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -15,6 +15,7 @@ #include #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#include "bpf/libbpf_internal.h" #define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 #include "usdt.h" @@ -629,6 +630,111 @@ static void test_uretprobe_shadow_stack(void) ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } +static volatile bool race_stop; + +static USDT_DEFINE_SEMA(race); + +static void *worker_trigger(void *arg) +{ + unsigned long rounds = 0; + + while (!race_stop) { + uprobe_test(); + rounds++; + } + + printf("tid %d trigger rounds: %lu\n", gettid(), rounds); + return NULL; +} + +static void *worker_attach(void *arg) +{ + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct uprobe_syscall_executed *skel; + unsigned long rounds = 0, offset; + const char *sema[2] = { + __stringify(USDT_SEMA(race)), + NULL, + }; + unsigned long *ref; + int err; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return NULL; + + err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) + return NULL; + + opts.ref_ctr_offset = *ref; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return NULL; + + skel->bss->pid = getpid(); + + while (!race_stop) { + skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts")) + break; + + bpf_link__destroy(skel->links.test_uprobe); + skel->links.test_uprobe = NULL; + rounds++; + } + + printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed); + uprobe_syscall_executed__destroy(skel); + free(ref); + return NULL; +} + +static useconds_t race_msec(void) +{ + char *env; + + env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC"); + if (env) + return atoi(env); + + /* default duration is 500ms */ + return 500; +} + +static void test_uprobe_race(void) +{ + int err, i, nr_threads; + pthread_t *threads; + + nr_threads = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus")) + return; + nr_threads = max(2, nr_threads); + + threads = alloca(sizeof(*threads) * nr_threads); + if (!ASSERT_OK_PTR(threads, "malloc")) + return; + + for (i = 0; i < nr_threads; i++) { + err = pthread_create(&threads[i], NULL, i % 2 ? 
worker_trigger : worker_attach, + NULL); + if (!ASSERT_OK(err, "pthread_create")) + goto cleanup; + } + + usleep(race_msec() * 1000); + +cleanup: + race_stop = true; + for (nr_threads = i, i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); + + ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore"); +} + static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) @@ -647,6 +753,8 @@ static void __test_uprobe_syscall(void) test_uprobe_session(); if (test__start_subtest("uprobe_usdt")) test_uprobe_usdt(); + if (test__start_subtest("uprobe_race")) + test_uprobe_race(); } #else static void __test_uprobe_syscall(void) From c11661bd9adf6831a75bb79299de793039dd8b9b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:26 +0200 Subject: [PATCH 37/66] selftests/bpf: Add uprobe syscall sigill signal test Make sure that calling uprobe syscall from outside uprobe trampoline results in sigill signal. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-17-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index 3d27c8bc019e6..02e98cba5cc69 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -735,6 +735,40 @@ static void test_uprobe_race(void) ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore"); } +#ifndef __NR_uprobe +#define __NR_uprobe 336 +#endif + +static void test_uprobe_sigill(void) +{ + int status, err, pid; + + pid = fork(); + if (!ASSERT_GE(pid, 0, "fork")) + return; + /* child */ + if (pid == 0) { + asm volatile ( + "pushq %rax\n" + "pushq %rcx\n" + "pushq %r11\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "popq %r11\n" + "popq %rcx\n" + "retq\n" + ); + exit(0); + } + + err = waitpid(pid, &status, 0); + ASSERT_EQ(err, pid, "waitpid"); + + /* verify the child got killed with SIGILL */ + ASSERT_EQ(WIFSIGNALED(status), 1, "WIFSIGNALED"); + ASSERT_EQ(WTERMSIG(status), SIGILL, "WTERMSIG"); +} + static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) @@ -755,6 +789,8 @@ static void __test_uprobe_syscall(void) test_uprobe_usdt(); if (test__start_subtest("uprobe_race")) test_uprobe_race(); + if (test__start_subtest("uprobe_sigill")) + test_uprobe_sigill(); } #else static void __test_uprobe_syscall(void) From 875e1705ad9962f2642d098d6bfaabfa6f9c7ace Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:27 +0200 Subject: [PATCH 38/66] selftests/bpf: Add optimized usdt variant for basic usdt test Adding optimized usdt variant for basic usdt test to check that usdt arguments are properly passed in optimized code path. 
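The property being tested: the USDT arguments are described by operand
strings in the .note.stapsdt section, and those descriptions must remain
valid once the nop5 at the probe site is replaced with a call into the
uprobe trampoline. A minimal sketch of the kind of invocation exercised
(names are illustrative; the real test reuses the existing test_usdt
triggers):

	#include "usdt.h"

	static void trigger(int x)
	{
		/* x and the constant end up in the note's arg spec,
		 * e.g. something like "-4@%esi -4@$42" */
		USDT(test, probe, x, 42);
	}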
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-18-jolsa@kernel.org --- tools/testing/selftests/bpf/prog_tests/usdt.c | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 9057e983cc540..833eb87483a1c 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -40,12 +40,19 @@ static void __always_inline trigger_func(int x) { } } -static void subtest_basic_usdt(void) +static void subtest_basic_usdt(bool optimized) { LIBBPF_OPTS(bpf_usdt_opts, opts); struct test_usdt *skel; struct test_usdt__bss *bss; - int err, i; + int err, i, called; + +#define TRIGGER(x) ({ \ + trigger_func(x); \ + if (optimized) \ + trigger_func(x); \ + optimized ? 2 : 1; \ + }) skel = test_usdt__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) @@ -66,11 +73,11 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) goto cleanup; - trigger_func(1); + called = TRIGGER(1); - ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); @@ -119,11 +126,11 @@ static void subtest_basic_usdt(void) * bpf_program__attach_usdt() handles this properly and attaches to * all possible places of USDT invocation. 
 	 */
-	trigger_func(2);
+	called += TRIGGER(2);
 
-	ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called");
-	ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called");
-	ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called");
+	ASSERT_EQ(bss->usdt0_called, called, "usdt0_called");
+	ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
+	ASSERT_EQ(bss->usdt12_called, called, "usdt12_called");
 
 	/* only check values that depend on trigger_func()'s input value */
 	ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1");
@@ -142,9 +149,9 @@ static void subtest_basic_usdt(void)
 	if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach"))
 		goto cleanup;
 
-	trigger_func(3);
+	called += TRIGGER(3);
 
-	ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called");
+	ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
 	/* this time usdt3 has custom cookie */
 	ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie");
 	ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt");
@@ -158,6 +165,7 @@ static void subtest_basic_usdt(void)
 
 cleanup:
 	test_usdt__destroy(skel);
+#undef TRIGGER
 }
 
 unsigned short test_usdt_100_semaphore SEC(".probes");
@@ -425,7 +433,11 @@ static void subtest_urandom_usdt(bool auto_attach)
 void test_usdt(void)
 {
 	if (test__start_subtest("basic"))
-		subtest_basic_usdt();
+		subtest_basic_usdt(false);
+#ifdef __x86_64__
+	if (test__start_subtest("basic_optimized"))
+		subtest_basic_usdt(true);
+#endif
 	if (test__start_subtest("multispec"))
 		subtest_multispec_usdt();
 	if (test__start_subtest("urand_auto_attach"))

From 275eae6789864904a7319fbb4e993734a0fb4310 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:28 +0200
Subject: [PATCH 39/66] selftests/bpf: Add uprobe_regs_equal test

Changing uretprobe_regs_trigger so it can be used to test both uprobe
and uretprobe, and renaming it to uprobe_regs_equal. We check that both
uprobe and uretprobe probes (bpf programs) see the expected registers,
with a few exceptions.
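The main exception is rax: an entry uprobe fires before the function body
has run, so the bpf program must see the original (pre-call) rax, while a
uretprobe fires at the return and sees the return value. Against the
trigger below, the expectation is roughly:

	uprobe_regs_trigger:
		nop5			<- uprobe fires here, rax still holds the caller's value
		movq $0xdeadbeef, %rax
		ret			<- uretprobe fires here, rax == 0xdeadbeef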
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-19-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 56 ++++++++++++++----- .../selftests/bpf/progs/uprobe_syscall.c | 4 +- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index 02e98cba5cc69..36ce9e261b5c2 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -22,15 +22,17 @@ #pragma GCC diagnostic ignored "-Wattributes" -__naked unsigned long uretprobe_regs_trigger(void) +__attribute__((aligned(16))) +__nocf_check __weak __naked unsigned long uprobe_regs_trigger(void) { asm volatile ( + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */ "movq $0xdeadbeef, %rax\n" "ret\n" ); } -__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) +__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after) { asm volatile ( "movq %r15, 0(%rdi)\n" @@ -51,15 +53,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) "movq $0, 120(%rdi)\n" /* orig_rax */ "movq $0, 128(%rdi)\n" /* rip */ "movq $0, 136(%rdi)\n" /* cs */ + "pushq %rax\n" "pushf\n" "pop %rax\n" "movq %rax, 144(%rdi)\n" /* eflags */ + "pop %rax\n" "movq %rsp, 152(%rdi)\n" /* rsp */ "movq $0, 160(%rdi)\n" /* ss */ /* save 2nd argument */ "pushq %rsi\n" - "call uretprobe_regs_trigger\n" + "call uprobe_regs_trigger\n" /* save return value and load 2nd argument pointer to rax */ "pushq %rax\n" @@ -99,25 +103,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) ); } -static void test_uretprobe_regs_equal(void) +static void test_uprobe_regs_equal(bool retprobe) { + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = retprobe, + ); struct uprobe_syscall *skel = NULL; struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; unsigned long *pa = (unsigned long *) &after; unsigned long *pp; + unsigned long offset; unsigned int i, cnt; - int err; + + offset = get_uprobe_offset(&uprobe_regs_trigger); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return; skel = uprobe_syscall__open_and_load(); if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load")) goto cleanup; - err = uprobe_syscall__attach(skel); - if (!ASSERT_OK(err, "uprobe_syscall__attach")) + skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts")) goto cleanup; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + if (!retprobe) + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); pp = (unsigned long *) &skel->bss->regs; cnt = sizeof(before)/sizeof(*pb); @@ -126,7 +142,7 @@ static void test_uretprobe_regs_equal(void) unsigned int offset = i * sizeof(unsigned long); /* - * Check register before and after uretprobe_regs_trigger call + * Check register before and after uprobe_regs_trigger call * that triggers the uretprobe. */ switch (offset) { @@ -140,7 +156,7 @@ static void test_uretprobe_regs_equal(void) /* * Check register seen from bpf program and register after - * uretprobe_regs_trigger call + * uprobe_regs_trigger call (with rax exception, check below). 
 	 */
 	switch (offset) {
 	/*
@@ -153,6 +169,15 @@ static void test_uprobe_regs_equal(bool retprobe)
 	case offsetof(struct pt_regs, rsp):
 	case offsetof(struct pt_regs, ss):
 		break;
+	/*
+	 * uprobe does not see return value in rax, it needs to see the
+	 * original (before) rax value
+	 */
+	case offsetof(struct pt_regs, rax):
+		if (!retprobe) {
+			ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check");
+			break;
+		}
 	default:
 		if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check"))
 			fprintf(stdout, "failed register offset %u\n", offset);
@@ -190,13 +215,13 @@ static void test_uretprobe_regs_change(void)
 	unsigned long cnt = sizeof(before)/sizeof(*pb);
 	unsigned int i, err, offset;
 
-	offset = get_uprobe_offset(uretprobe_regs_trigger);
+	offset = get_uprobe_offset(uprobe_regs_trigger);
 
 	err = write_bpf_testmod_uprobe(offset);
 	if (!ASSERT_OK(err, "register_uprobe"))
 		return;
 
-	uretprobe_regs(&before, &after);
+	uprobe_regs(&before, &after);
 
 	err = write_bpf_testmod_uprobe(0);
 	if (!ASSERT_OK(err, "unregister_uprobe"))
@@ -616,7 +641,8 @@ static void test_uretprobe_shadow_stack(void)
 	/* Run all the tests with shadow stack in place. */
 	shstk_is_enabled = true;
 
-	test_uretprobe_regs_equal();
+	test_uprobe_regs_equal(false);
+	test_uprobe_regs_equal(true);
 	test_uretprobe_regs_change();
 	test_uretprobe_syscall_call();
 
@@ -772,7 +798,7 @@ static void __test_uprobe_syscall(void)
 static void __test_uprobe_syscall(void)
 {
 	if (test__start_subtest("uretprobe_regs_equal"))
-		test_uretprobe_regs_equal();
+		test_uprobe_regs_equal(true);
 	if (test__start_subtest("uretprobe_regs_change"))
 		test_uretprobe_regs_change();
 	if (test__start_subtest("uretprobe_syscall_call"))
@@ -791,6 +817,8 @@ static void __test_uprobe_syscall(void)
 		test_uprobe_race();
 	if (test__start_subtest("uprobe_sigill"))
 		test_uprobe_sigill();
+	if (test__start_subtest("uprobe_regs_equal"))
+		test_uprobe_regs_equal(false);
 }
 #else
 static void __test_uprobe_syscall(void)
diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall.c b/tools/testing/selftests/bpf/progs/uprobe_syscall.c
index 8a4fa6c7ef590..e08c31669e5a7 100644
--- a/tools/testing/selftests/bpf/progs/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/progs/uprobe_syscall.c
@@ -7,8 +7,8 @@ struct pt_regs regs;
 
 char _license[] SEC("license") = "GPL";
 
-SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger")
-int uretprobe(struct pt_regs *ctx)
+SEC("uprobe")
+int probe(struct pt_regs *ctx)
 {
 	__builtin_memcpy(&regs, ctx, sizeof(regs));
 	return 0;

From 3abf4298c6139cf10a41472d87b2f608666302b0 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:29 +0200
Subject: [PATCH 40/66] selftests/bpf: Change test_uretprobe_regs_change for
 uprobe and uretprobe

Changing the test_uretprobe_regs_change test to test both uprobe and
uretprobe by adding an entry consumer handler to the testmod and making
it change one of the registers. Making sure that the values changed by
both uprobe and uretprobe handlers propagate to user space.
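Taken together, the two testmod consumers give the userspace side fixed
values to assert on once the probed function returns (sketched from the
handlers in the diff below):

	after.cx  == 0x87654321feebdaed		/* written by the entry (uprobe) handler */
	after.ax  == 0x12345678deadbeef		/* written by the return (uretprobe) handler */
	after.r11 == (u64) -1			/* written by the return (uretprobe) handler */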
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250720112133.244369-20-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 12 ++++++++---- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 11 +++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index 36ce9e261b5c2..c1f945cacebc2 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -207,7 +207,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset) return ret != n ? (int) ret : 0; } -static void test_uretprobe_regs_change(void) +static void test_regs_change(void) { struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; @@ -221,6 +221,9 @@ static void test_uretprobe_regs_change(void) if (!ASSERT_OK(err, "register_uprobe")) return; + /* make sure uprobe gets optimized */ + uprobe_regs_trigger(); + uprobe_regs(&before, &after); err = write_bpf_testmod_uprobe(0); @@ -643,7 +646,6 @@ static void test_uretprobe_shadow_stack(void) test_uprobe_regs_equal(false); test_uprobe_regs_equal(true); - test_uretprobe_regs_change(); test_uretprobe_syscall_call(); test_uprobe_legacy(); @@ -651,6 +653,8 @@ static void test_uretprobe_shadow_stack(void) test_uprobe_session(); test_uprobe_usdt(); + test_regs_change(); + shstk_is_enabled = false; ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); @@ -799,8 +803,6 @@ static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) test_uprobe_regs_equal(true); - if (test__start_subtest("uretprobe_regs_change")) - test_uretprobe_regs_change(); if (test__start_subtest("uretprobe_syscall_call")) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) @@ -819,6 +821,8 @@ static void __test_uprobe_syscall(void) test_uprobe_sigill(); if (test__start_subtest("uprobe_regs_equal")) test_uprobe_regs_equal(false); + if (test__start_subtest("regs_change")) + test_regs_change(); } #else static void __test_uprobe_syscall(void) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index e9e918cdf31ff..511911053bdc5 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -500,15 +500,21 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { */ #ifdef __x86_64__ +static int +uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data) +{ + regs->cx = 0x87654321feebdaed; + return 0; +} + static int uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs, __u64 *data) { regs->ax = 0x12345678deadbeef; - regs->cx = 0x87654321feebdaed; regs->r11 = (u64) -1; - return true; + return 0; } struct testmod_uprobe { @@ -520,6 +526,7 @@ struct testmod_uprobe { static DEFINE_MUTEX(testmod_uprobe_mutex); static struct testmod_uprobe uprobe = { + .consumer.handler = uprobe_handler, .consumer.ret_handler = uprobe_ret_handler, }; From 52718438af2ac8323aeea41b6f59da0962cb73b6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 21 Aug 2025 16:15:57 +0200 Subject: [PATCH 41/66] selftests/bpf: Fix uprobe syscall shadow stack test Now that we have uprobe syscall working properly with shadow stack, we can remove testing limitations for shadow stack 
tests and make sure the uprobe gets properly optimized.

Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250821141557.13233-1-jolsa@kernel.org --- .../selftests/bpf/prog_tests/uprobe_syscall.c | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index c1f945cacebc2..5da0b49eeacae 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -403,8 +403,6 @@ static void *find_nop5(void *fn) typedef void (__attribute__((nocf_check)) *trigger_t)(void); -static bool shstk_is_enabled; - static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, void *addr, int executed) { @@ -413,7 +411,6 @@ static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigge __s32 raddr; } __packed *call; void *tramp = NULL; - __u8 *bp; /* Uprobe gets optimized after first trigger, so let's press twice. */ trigger(); @@ -422,17 +419,11 @@ static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigge /* Make sure bpf program got executed.. */ ASSERT_EQ(skel->bss->executed, executed, "executed"); - if (shstk_is_enabled) { - /* .. and check optimization is disabled under shadow stack. */ - bp = (__u8 *) addr; - ASSERT_EQ(*bp, 0xcc, "int3"); - } else { - /* .. and check the trampoline is as expected. */ - call = (struct __arch_relative_insn *) addr; - tramp = (void *) (call + 1) + call->raddr; - ASSERT_EQ(call->op, 0xe8, "call"); - ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); - } + /* .. and check the trampoline is as expected. */ + call = (struct __arch_relative_insn *) addr; + tramp = (void *) (call + 1) + call->raddr; + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); return tramp; } @@ -440,7 +431,7 @@ static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigge static void check_detach(void *addr, void *tramp) { /* [uprobes_trampoline] stays after detach */ - ASSERT_OK(!shstk_is_enabled && find_uprobes_trampoline(tramp), "uprobes_trampoline"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); } @@ -642,7 +633,6 @@ static void test_uretprobe_shadow_stack(void) } /* Run all the tests with shadow stack in place. */ - shstk_is_enabled = true; test_uprobe_regs_equal(false); test_uprobe_regs_equal(true); @@ -655,8 +645,6 @@ static void test_uretprobe_shadow_stack(void) test_regs_change(); - shstk_is_enabled = false; - ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); }

From 89d1d8434d246c96309a6068dfcf9e36dc61227b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:30 +0200 Subject: [PATCH 42/66] seccomp: passthrough uprobe syscall without filtering

Adding uprobe as another exception to the seccomp filter alongside the uretprobe syscall. Same as the uretprobe, the uprobe syscall is installed by the kernel as a replacement for the breakpoint exception; it is limited to the x86_64 arch and isn't expected to ever be supported on i386.
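For illustration, a minimal user space sketch of what this enables (probed() is a hypothetical function that already has an optimized uprobe attached; not part of this patch):

  #include <sys/prctl.h>
  #include <sys/syscall.h>
  #include <linux/seccomp.h>
  #include <unistd.h>

  int main(void)
  {
  	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
  	/*
  	 * Hitting the optimized uprobe makes the kernel enter the uprobe
  	 * syscall on our behalf; without the mode1_syscalls exception
  	 * below, strict mode would SIGKILL the task here.
  	 */
  	probed();
  	/* strict mode forbids exit_group(), so leave via plain exit() */
  	syscall(SYS_exit, 0);
  }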
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250720112133.244369-21-jolsa@kernel.org --- kernel/seccomp.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 41aa761c7738c..7daf2da09e8e1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -741,6 +741,26 @@ seccomp_prepare_user_filter(const char __user *user_filter) } #ifdef SECCOMP_ARCH_NATIVE +static bool seccomp_uprobe_exception(struct seccomp_data *sd) +{ +#if defined __NR_uretprobe || defined __NR_uprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + { +#ifdef __NR_uretprobe + if (sd->nr == __NR_uretprobe) + return true; +#endif +#ifdef __NR_uprobe + if (sd->nr == __NR_uprobe) + return true; +#endif + } +#endif + return false; +} + /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF programs @@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, return false; /* Our single exception to filtering. */ -#ifdef __NR_uretprobe -#ifdef SECCOMP_ARCH_COMPAT - if (sd->arch == SECCOMP_ARCH_NATIVE) -#endif - if (sd->nr == __NR_uretprobe) - return true; -#endif + if (seccomp_uprobe_exception(sd)) + return true; for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; @@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, #ifdef __NR_uretprobe __NR_uretprobe, +#endif +#ifdef __NR_uprobe + __NR_uprobe, #endif -1, /* negative terminated */ };

From 9ffc7a635c35ad61349a36e9f52d46df9ba67dc3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 20 Jul 2025 13:21:31 +0200 Subject: [PATCH 43/66] selftests/seccomp: validate uprobe syscall passes through seccomp

Adding uprobe checks into the current uretprobe tests. All the related tests are now executed with an attached uprobe or uretprobe, or without any probe. Renaming the test fixture to UPROBE, because it now covers both probe types.

Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250720112133.244369-22-jolsa@kernel.org --- tools/testing/selftests/seccomp/seccomp_bpf.c | 107 ++++++++++++++---- 1 file changed, 86 insertions(+), 21 deletions(-)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 61acbd45ffaaf..2cf6fc825d865 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -73,6 +73,14 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + +#ifndef __naked +#define __naked __attribute__((__naked__)) +#endif + #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 @@ -4896,7 +4904,36 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } -noinline int probed(void) +#ifdef __x86_64__ + +/* + * We need a naked probed_uprobe function. Using __nocf_check + * to skip a possible endbr64 instruction and ignoring + * -Wattributes, otherwise the compilation might fail. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" + +__naked __nocf_check noinline int probed_uprobe(void) +{ + /* + * Optimized uprobe is possible only on top of nop5 instruction. + */ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} +#pragma GCC diagnostic pop + +#else +noinline int probed_uprobe(void) +{ + return 1; +} +#endif + +noinline int probed_uretprobe(void) { return 1; } @@ -4949,35 +4986,46 @@ static ssize_t get_uprobe_offset(const void *addr) return found ? (uintptr_t)addr - start + base : -1; } -FIXTURE(URETPROBE) { +FIXTURE(UPROBE) { int fd; }; -FIXTURE_VARIANT(URETPROBE) { +FIXTURE_VARIANT(UPROBE) { /* - * All of the URETPROBE behaviors can be tested with either - * uretprobe attached or not + * All of the U(RET)PROBE behaviors can be tested with either + * u(ret)probe attached or not */ bool attach; + /* + * Test both uprobe and uretprobe. + */ + bool uretprobe; }; -FIXTURE_VARIANT_ADD(URETPROBE, attached) { +FIXTURE_VARIANT_ADD(UPROBE, not_attached) { + .attach = false, + .uretprobe = false, +}; + +FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) { .attach = true, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { - .attach = false, +FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) { + .attach = true, + .uretprobe = true, }; -FIXTURE_SETUP(URETPROBE) +FIXTURE_SETUP(UPROBE) { const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; ssize_t offset; int type, bit; -#ifndef __NR_uretprobe - SKIP(return, "__NR_uretprobe syscall not defined"); +#if !defined(__NR_uprobe) || !defined(__NR_uretprobe) + SKIP(return, "__NR_uprobe or __NR_uretprobe syscalls not defined"); #endif if (!variant->attach) @@ -4987,12 +5035,17 @@ FIXTURE_SETUP(URETPROBE) type = determine_uprobe_perf_type(); ASSERT_GE(type, 0); - bit = determine_uprobe_retprobe_bit(); - ASSERT_GE(bit, 0); - offset = get_uprobe_offset(probed); + + if (variant->uretprobe) { + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + } + + offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe); ASSERT_GE(offset, 0); - attr.config |= 1 << bit; + if (variant->uretprobe) + attr.config |= 1 << bit; attr.size = attr_sz; attr.type = type; attr.config1 = ptr_to_u64("/proc/self/exe"); @@ -5003,7 +5056,7 @@ FIXTURE_SETUP(URETPROBE) PERF_FLAG_FD_CLOEXEC); } -FIXTURE_TEARDOWN(URETPROBE) +FIXTURE_TEARDOWN(UPROBE) { /* we could call close(self->fd), but we'd need extra filter for * that and since we are calling _exit right away.. @@ -5017,11 +5070,17 @@ static int run_probed_with_filter(struct sock_fprog *prog) return -1; } - probed(); + /* + * Uprobe is optimized after first hit, so let's hit twice. + */ + probed_uprobe(); + probed_uprobe(); + + probed_uretprobe(); return 0; } -TEST_F(URETPROBE, uretprobe_default_allow) +TEST_F(UPROBE, uprobe_default_allow) { struct sock_filter filter[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), @@ -5034,7 +5093,7 @@ TEST_F(URETPROBE, uretprobe_default_allow) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block) +TEST_F(UPROBE, uprobe_default_block) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), @@ -5051,11 +5110,14 @@ TEST_F(URETPROBE, uretprobe_default_block) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) +TEST_F(UPROBE, uprobe_block_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), #endif @@ -5070,11 +5132,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) +TEST_F(UPROBE, uprobe_default_block_with_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), #endif

From e173287b5d2119971fc329473a020171836d14c9 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 5 Aug 2025 10:50:00 +0800 Subject: [PATCH 44/66] uprobes: Remove redundant __GFP_NOWARN

Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes.

Signed-off-by: Qianfeng Rong Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250805025000.346647-1-rongqianfeng@vivo.com --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4a194d7c838b5..996a81080d563 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1220,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + GFP_NOWAIT | __GFP_NOMEMALLOC); if (prev) prev->next = NULL; }

From d9cf9c6884d21e01483c4e17479d27636ea4bb50 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:26 +0800 Subject: [PATCH 45/66] perf/x86/intel: Use early_initcall() to hook bts_init()

Since commit d971342d38bf ("perf/x86/intel: Decouple BTS initialization from PEBS initialization"), x86_pmu.bts is initialized in bts_init(), which is hooked by arch_initcall(), whereas init_hw_perf_events() is hooked by early_initcall(). Once the core PMU is initialized, the NMI watchdog initialization is called immediately, before bts_init() is called. As a result the BTS buffer is not really initialized, since bts_init() has not run and x86_pmu.bts is still false at that time. Worse, the BTS buffer would never be initialized unless all core PMU events are freed and reserve_ds_buffers() is called again.
Thus, aligning with init_hw_perf_events(), use early_initcall() to hook bts_init() to ensure x86_pmu.bts is initialized before the NMI watchdog initialization.

Fixes: d971342d38bf ("perf/x86/intel: Decouple BTS initialization from PEBS initialization") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250820023032.17128-2-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/bts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 61da6b8a3d519..cbac54cb3a9ec 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -643,4 +643,4 @@ static __init int bts_init(void) return perf_pmu_register(&bts_pmu, "intel_bts", -1); } -arch_initcall(bts_init); +early_initcall(bts_init);

From 43796f30507802d93ead2dc44fc9637f34671a89 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:27 +0800 Subject: [PATCH 46/66] perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error

When running perf_fuzzer on PTL, sometimes the below "unchecked MSR access error" is seen when accessing the IA32_PMC_x_CFG_B MSRs.

[ 55.611268] unchecked MSR access error: WRMSR to 0x1986 (tried to write 0x0000000200000001) at rIP: 0xffffffffac564b28 (native_write_msr+0x8/0x30)
[ 55.611280] Call Trace:
[ 55.611282]
[ 55.611284] ? intel_pmu_config_acr+0x87/0x160
[ 55.611289] intel_pmu_enable_acr+0x6d/0x80
[ 55.611291] intel_pmu_enable_event+0xce/0x460
[ 55.611293] x86_pmu_start+0x78/0xb0
[ 55.611297] x86_pmu_enable+0x218/0x3a0
[ 55.611300] ? x86_pmu_enable+0x121/0x3a0
[ 55.611302] perf_pmu_enable+0x40/0x50
[ 55.611307] ctx_resched+0x19d/0x220
[ 55.611309] __perf_install_in_context+0x284/0x2f0
[ 55.611311] ? __pfx_remote_function+0x10/0x10
[ 55.611314] remote_function+0x52/0x70
[ 55.611317] ? __pfx_remote_function+0x10/0x10
[ 55.611319] generic_exec_single+0x84/0x150
[ 55.611323] smp_call_function_single+0xc5/0x1a0
[ 55.611326] ? __pfx_remote_function+0x10/0x10
[ 55.611329] perf_install_in_context+0xd1/0x1e0
[ 55.611331] ? __pfx___perf_install_in_context+0x10/0x10
[ 55.611333] __do_sys_perf_event_open+0xa76/0x1040
[ 55.611336] __x64_sys_perf_event_open+0x26/0x30
[ 55.611337] x64_sys_call+0x1d8e/0x20c0
[ 55.611339] do_syscall_64+0x4f/0x120
[ 55.611343] entry_SYSCALL_64_after_hwframe+0x76/0x7e

On PTL, GP counters 0 and 1 don't support the auto counter reload feature, so a #GP is triggered when trying to write 1 to bit 0 of the CFG_B MSR, which requests auto counter reload on GP counter 0.

The root cause of this issue is that the check of the auto counter reload (ACR) counter mask from user space is incorrect in the intel_pmu_acr_late_setup() helper. It allows an invalid ACR counter mask from user space to be set into hw.config1, written into the CFG_B MSRs and trigger the MSR access warning. E.g., a user may create a perf event with an ACR counter mask (config2=0xcb) while there is only 1 event created, so "cpuc->n_events" is 1. The correct check condition should be "i + idx >= cpuc->n_events" instead of "i + idx > cpuc->n_events" (it looks like a typo). Otherwise, the counter mask is traversed twice and an invalid "cpuc->assign[1]" bit (bit 0) is set into hw.config1 and causes the MSR access error.

Besides, also check whether the events corresponding to the ACR counter mask are ACR events; if not, filter those bits out of the counter mask. If an event is not an ACR event, it could be scheduled to a HW counter which doesn't support ACR. It's invalid to add its counter index to the ACR counter mask.

Furthermore, remove the WARN_ON_ONCE(), since it's easily triggered (a user could set any invalid ACR counter mask) and the warning message could mislead users.

Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250820023032.17128-3-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c2fb729c270ec..15da60cf69f20 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2997,7 +2997,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) if (event->group_leader != leader->group_leader) break; for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { - if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + if (i + idx >= cpuc->n_events || + !is_acr_event_group(cpuc->event_list[i + idx])) return; __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); }

From 0c5caea762de31a85cbcce65d978cec83449f699 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:29 +0800 Subject: [PATCH 47/66] perf/x86: Add PERF_CAP_PEBS_TIMING_INFO flag

IA32_PERF_CAPABILITIES.PEBS_TIMING_INFO[bit 17] is introduced to indicate whether timed PEBS is supported. Timed PEBS adds a new "retired latency" field in the basic info group to show the timing info. Please find detailed information about timed PEBS in section 8.4.1 "Timed Processor Event Based Sampling" of "Intel Architecture Instruction Set Extensions and Future Features".

Add the PERF_CAP_PEBS_TIMING_INFO flag; the KVM module leverages it to expose the timed PEBS feature to guests. Moreover, opportunistically refine the indents and make the macros share consistent indents.
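As a quick way to check the bit on a given machine, a user space sketch using the msr driver (assumes CONFIG_X86_MSR, root privileges, and that 0x345 is IA32_PERF_CAPABILITIES):

  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
  	uint64_t caps = 0;
  	int fd = open("/dev/cpu/0/msr", O_RDONLY);

  	if (fd < 0 || pread(fd, &caps, sizeof(caps), 0x345) != sizeof(caps))
  		return 1;
  	/* bit 17 is PEBS_TIMING_INFO per this patch */
  	printf("timed PEBS %ssupported\n", caps & (1ULL << 17) ? "" : "not ");
  	return 0;
  }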
Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Tested-by: Yi Lai Link: https://lore.kernel.org/r/20250820023032.17128-5-dapeng1.mi@linux.intel.com --- arch/x86/include/asm/msr-index.h | 14 ++++++++------ tools/arch/x86/include/asm/msr-index.h | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa141..f627196eb7966 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -315,12 +315,14 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0)
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 5cfb5d74dd5f5..daebfd926f08e 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -315,12 +315,14 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0)

From 9b3e119784bc3671fde5043001a5c9a607c7d920 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:30 +0800 Subject: [PATCH 48/66] perf/x86/intel: Change macro GLOBAL_CTRL_EN_PERF_METRICS to BIT_ULL(48)

The macro GLOBAL_CTRL_EN_PERF_METRICS is defined as 48 instead of BIT_ULL(48), which is inconsistent with other similar macros. This makes the macro quite easy to misuse, since users assume it's a bit mask just like the other similar macros. Thus change GLOBAL_CTRL_EN_PERF_METRICS to BIT_ULL(48) and eliminate this potential misuse.
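To see the hazard being removed, compare what the two definitions do when the macro is used as a mask, as the perf code below does:

  u64 ctrl = 0;

  ctrl |= 48;		/* old definition: sets bits 4 and 5 (48 == 0x30) */
  ctrl |= BIT_ULL(48);	/* new definition: sets bit 48, as intended */

With the old definition the only correct usage was the error-prone '1ULL << GLOBAL_CTRL_EN_PERF_METRICS'.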
Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Tested-by: Yi Lai Link: https://lore.kernel.org/r/20250820023032.17128-6-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 8 ++++---- arch/x86/include/asm/perf_event.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 15da60cf69f20..f88a99d8d125c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5319,9 +5319,9 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); if (pmu->intel_cap.perf_metrics) - pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + pmu->intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS; else - pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; intel_pmu_check_event_constraints(pmu->event_constraints, pmu->cntr_mask64, @@ -5456,7 +5456,7 @@ static void intel_pmu_cpu_starting(int cpu) rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); if (!perf_cap.perf_metrics) { x86_pmu.intel_cap.perf_metrics = 0; - x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; } } @@ -7790,7 +7790,7 @@ __init int intel_pmu_init(void) } if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) - x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS; if (x86_pmu.intel_cap.pebs_timing_info) x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 70d1d94aca7e6..f8247ac276c41 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -430,7 +430,7 @@ static inline bool is_topdown_idx(int idx) #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 -#define GLOBAL_CTRL_EN_PERF_METRICS 48 +#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48) /* * We model guest LBR event tracing as another fixed-mode PMC like BTS. *

From 2676dbf9f4fb7f6739d1207c0f1deaf63124642a Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:31 +0800 Subject: [PATCH 49/66] perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK

ICL_FIXED_0_ADAPTIVE is missing from INTEL_FIXED_BITS_MASK; add it. With the help of the new INTEL_FIXED_BITS_MASK, intel_pmu_enable_fixed() can be optimized: the old fixed counter control bits can be unconditionally cleared with INTEL_FIXED_BITS_MASK, and the new control bits then set based on the new configuration.
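For reference, the widened mask now covers the adaptive bit as well, so the per-counter state can be cleared in one step, a sketch mirroring the hunk below:

  /* INTEL_FIXED_BITS_MASK == 0x10000000f: bits 0-3 plus adaptive bit 32 */
  cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
  cpuc->fixed_ctrl_val |= bits;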
Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Tested-by: Yi Lai Link: https://lore.kernel.org/r/20250820023032.17128-7-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 10 +++------- arch/x86/include/asm/perf_event.h | 6 +++++- arch/x86/kvm/pmu.h | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f88a99d8d125c..28f5468a6ea36 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 mask, bits = 0; int idx = hwc->idx; + u64 bits = 0; if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event) idx -= INTEL_PMC_IDX_FIXED; bits = intel_fixed_bits_by_idx(idx, bits); - mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); - - if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - } - cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); cpuc->fixed_ctrl_val |= bits; }
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index f8247ac276c41..49a4d442f3fc2 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -35,7 +35,6 @@ #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) -#define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 #define INTEL_FIXED_0_KERNEL (1ULL << 0) #define INTEL_FIXED_0_USER (1ULL << 1) @@ -48,6 +47,11 @@ #define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) +#define INTEL_FIXED_BITS_MASK \ + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ + INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ + ICL_FIXED_0_ADAPTIVE) + #define intel_fixed_bits_by_idx(_idx, _bits) \ ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ad89d0bd60058..103604c4b33b5 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -13,7 +13,7 @@ #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) -/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ +/* retrieve a fixed counter's bits out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) \ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)

From f49e1be19542487921e82b29004908966cb99d7c Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:32 +0800 Subject: [PATCH 50/66] perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap()

Along with the introduction of Perfmon v6, PMU counters can be discontiguous; for example, for fixed counters on CWF only fixed counters 0-3 and 5-7 are supported, and there is no fixed counter 4. To accommodate this change, archPerfmonExt CPUID (0x23) leaves are introduced to enumerate the true view of the counters bitmap.

Current perf code already supports the archPerfmonExt CPUID and uses the counters bitmap to enumerate the counters the HW really supports, but x86_pmu_show_pmu_cap() still dumps only the absolute counter number instead of the true-view bitmap; that's outdated and may mislead readers. So dump the counters' true-view bitmap in x86_pmu_show_pmu_cap() and opportunistically change the dump sequence and wording.

Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250820023032.17128-8-dapeng1.mi@linux.intel.com --- arch/x86/events/core.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7610f26dfbd90..745caa6c15a32 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2069,13 +2069,15 @@ static void _x86_pmu_read(struct perf_event *event) void x86_pmu_show_pmu_cap(struct pmu *pmu) { - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); - pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu)); + pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64)); + pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu)); + pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64)); + pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016llx\n", x86_pmu.max_period); + pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void)

From e649bcda25b5ae1a30a182cc450f928a0b282c93 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Aug 2025 14:03:39 -0400 Subject: [PATCH 51/66] perf: Remove get_perf_callchain() init_nr argument

The 'init_nr' argument has double duty: it's used to initialize both the number of contexts and the number of stack entries. That's confusing and the callers always pass zero anyway. Hard code the zero.
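The double duty is visible in the old initialization (see the hunk below); a nonzero init_nr would both pre-fill the entry count and skew the context accounting:

  ctx.nr = entry->nr = init_nr;	/* now both simply start at 0 */

and the call sites just drop the always-zero argument:

  trace = get_perf_callchain(regs, kernel, user, max_depth, false, false);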
Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250820180428.259565081@kernel.org --- include/linux/perf_event.h | 2 +- kernel/bpf/stackmap.c | 4 ++-- kernel/events/callchain.c | 12 ++++++------ kernel/events/core.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bfbf9ea53f259..fd1d91017b99b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1719,7 +1719,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa9..ec3a57a5fba1f 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, false, false); if (unlikely(!trace)) @@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, crosstask, false); if (unlikely(!trace) || trace->nr < skip) { diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6c83ad674d010..b0f5bd228cd8c 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,7 +217,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr } struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; @@ -228,11 +228,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (!entry) return NULL; - ctx.entry = entry; - ctx.max_stack = max_stack; - ctx.nr = entry->nr = init_nr; - ctx.contexts = 0; - ctx.contexts_maxed = false; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = 0; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) diff --git a/kernel/events/core.c b/kernel/events/core.c index ea357044d7806..bade8e0fced79 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8210,7 +8210,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, 0, kernel, user, + callchain = get_perf_callchain(regs, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } From 153f9e74dec230f2e070e16fa061bc7adfd2c450 Mon Sep 17 
00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Aug 2025 14:03:40 -0400 Subject: [PATCH 52/66] perf: Have get_perf_callchain() return NULL if crosstask and user are set

get_perf_callchain() doesn't support cross-task unwinding for user space stacks; have it return NULL if both the crosstask and user arguments are set.

Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250820180428.426423415@kernel.org --- kernel/events/callchain.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index b0f5bd228cd8c..cd0e3fc7ed05a 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -224,6 +224,10 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, struct perf_callchain_entry_ctx ctx; int rctx, start_entry_idx; + /* crosstask is not supported for user stacks */ + if (crosstask && user && !kernel) + return NULL; + entry = get_callchain_entry(&rctx); if (!entry) return NULL; @@ -240,7 +244,7 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, perf_callchain_kernel(&ctx, regs); } - if (user) { + if (user && !crosstask) { if (!user_mode(regs)) { if (current->mm) regs = task_pt_regs(current); @@ -249,9 +253,6 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, } if (regs) { - if (crosstask) - goto exit_put; - if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); @@ -261,7 +262,6 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, } } -exit_put: put_callchain_entry(rctx); return entry;

From 90942f9fac05702065ff82ed0bade0d08168d4ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Aug 2025 14:03:41 -0400 Subject: [PATCH 53/66] perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of current->mm == NULL

To determine if a task is a kernel thread or not, it is more reliable to use (current->flags & (PF_KTHREAD|PF_USER_WORKER)) than to rely on current->mm being NULL. That is because some kernel tasks (io_uring helpers) may have an mm field.

Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250820180428.592367294@kernel.org --- kernel/events/callchain.c | 6 +++--- kernel/events/core.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index cd0e3fc7ed05a..5982d18f169bd 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -246,10 +246,10 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, if (user && !crosstask) { if (!user_mode(regs)) { - if (current->mm) - regs = task_pt_regs(current); - else + if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) regs = NULL; + else + regs = task_pt_regs(current); }
diff --git a/kernel/events/core.c b/kernel/events/core.c index bade8e0fced79..f880cec0c9809 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7446,7 +7446,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (!(current->flags & PF_KTHREAD)) { + } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; @@ -8086,7 +8086,7 @@ static u64 perf_virt_to_phys(u64 virt) * Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0. */ - if (current->mm != NULL) { + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { struct page *p; pagefault_disable();

From d77e3319e31098a6cb97b7ce4e71ba676e327fd7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Aug 2025 14:03:42 -0400 Subject: [PATCH 54/66] perf: Simplify get_perf_callchain() user logic

Simplify the get_perf_callchain() user logic a bit. task_pt_regs() should never be NULL.

Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20250820180428.760066227@kernel.org --- kernel/events/callchain.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 5982d18f169bd..808c0d7a31faf 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -247,21 +247,19 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, if (user && !crosstask) { if (!user_mode(regs)) { if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) - regs = NULL; - else - regs = task_pt_regs(current); + goto exit_put; + regs = task_pt_regs(current); } - if (regs) { - if (add_mark) - perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - start_entry_idx = entry->nr; - perf_callchain_user(&ctx, regs); - fixup_uretprobe_trampoline_entries(entry, start_entry_idx); - } + start_entry_idx = entry->nr; + perf_callchain_user(&ctx, regs); + fixup_uretprobe_trampoline_entries(entry, start_entry_idx); } +exit_put: put_callchain_entry(rctx); return entry;

From 16ed389227651330879e17bd83d43bd234006722 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Aug 2025 14:03:43 -0400 Subject: [PATCH 55/66] perf: Skip user unwind if the task is a kernel thread

If the task is not a user thread, there's no user stack to unwind.

Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250820180428.930791978@kernel.org --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c index f880cec0c9809..28de3baff7922 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8198,7 +8198,8 @@ struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; - bool user = !event->attr.exclude_callchain_user; + bool user = !event->attr.exclude_callchain_user && + !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack;

From 44325b9ba0c5ecaab43039d23ddeeb64a8b8b86b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 4 Aug 2025 10:26:08 +0200 Subject: [PATCH 56/66] uprobes: Add unique flag to uprobe consumer

Adding a unique flag to the uprobe consumer to ensure it's the only consumer attached on the uprobe. This is helpful for use cases where a consumer wants to change user space registers, which might confuse other consumers; in that case it's desirable for it to be the only consumer on the specific uprobe.
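A minimal sketch of an in-kernel user (my_handler is hypothetical; per the consumer_add() change below, registration now fails with -EBUSY when the uprobe already has a consumer, and any later registration on the same uprobe fails the same way):

  static struct uprobe_consumer uc = {
  	.handler   = my_handler,	/* hypothetical handler */
  	.is_unique = true,
  };

  /* inode, offset, ref_ctr_offset, consumer */
  uprobe = uprobe_register(inode, offset, 0, &uc);
  if (IS_ERR(uprobe))
  	return PTR_ERR(uprobe);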
Signed-off-by: Jiri Olsa --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 30 ++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 08ef78439d0df..0df849dee720d 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -60,6 +60,7 @@ struct uprobe_consumer { struct list_head cons_node; __u64 id; /* set when uprobe_consumer is registered */ + bool is_unique; /* the only consumer on uprobe */ }; #ifdef CONFIG_UPROBES
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 996a81080d563..b9b088f7333aa 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1024,14 +1024,35 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, return uprobe; } -static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) +static bool consumer_can_add(struct list_head *head, struct uprobe_consumer *uc) +{ + /* Uprobe has no consumer, we can add any. */ + if (list_empty(head)) + return true; + /* Uprobe has consumer/s, we can't add unique one. */ + if (uc->is_unique) + return false; + /* + * Uprobe has consumer/s, we can add another consumer only if the + * current consumer is not unique. + */ + return !list_first_entry(head, struct uprobe_consumer, cons_node)->is_unique; +} + +static int consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { static atomic64_t id; + int ret = -EBUSY; down_write(&uprobe->consumer_rwsem); + if (!consumer_can_add(&uprobe->consumers, uc)) + goto unlock; list_add_rcu(&uc->cons_node, &uprobe->consumers); uc->id = (__u64) atomic64_inc_return(&id); + ret = 0; +unlock: up_write(&uprobe->consumer_rwsem); + return ret; } /* @@ -1420,7 +1441,12 @@ struct uprobe *uprobe_register(struct inode *inode, return uprobe; down_write(&uprobe->register_rwsem); - consumer_add(uprobe, uc); + ret = consumer_add(uprobe, uc); + if (ret) { + put_uprobe(uprobe); + up_write(&uprobe->register_rwsem); + return ERR_PTR(ret); + } ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem);

From bce71cd980ad9321b6a0f72a176e309c3becd741 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 4 Aug 2025 10:26:51 +0200 Subject: [PATCH 57/66] uprobes: Skip emulate/sstep on unique uprobe when ip is changed

If the uprobe consumer changes the instruction pointer, we still execute (single-step or emulate) the original instruction and increment the ip register by the size of the instruction. In case the instruction is emulated, the new ip register value is incremented by the instruction's size and the process is likely to crash with an illegal instruction. In case the instruction is single-stepped, the ip register change is lost and the process continues with the original ip register value.

If the user decided to take execution elsewhere, it makes little sense to execute the original instruction, so let's skip it. Allow this only for an uprobe with a unique consumer attached.
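For illustration, a unique consumer's handler that redirects execution; after this change the breakpointed instruction is neither emulated nor single-stepped once the handler has moved the ip (my_handler and target are hypothetical):

  static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs,
  		      __u64 *data)
  {
  	/* divert execution; the original instruction won't be executed */
  	instruction_pointer_set(regs, (unsigned long)target);
  	return 0;
  }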
Signed-off-by: Jiri Olsa --- kernel/events/uprobes.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b9b088f7333aa..da8291941c6bf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2568,7 +2568,7 @@ static bool ignore_ret_handler(int rc) return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; } -static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) +static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs, bool *is_unique) { struct uprobe_consumer *uc; bool has_consumers = false, remove = true; @@ -2582,6 +2582,9 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) __u64 cookie = 0; int rc = 0; + if (is_unique) + *is_unique |= uc->is_unique; + if (uc->handler) { rc = uc->handler(uc, regs, &cookie); WARN(rc < 0 || rc > 2, @@ -2735,6 +2738,7 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; + bool is_unique = false; int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); @@ -2789,7 +2793,10 @@ static void handle_swbp(struct pt_regs *regs) if (arch_uprobe_ignore(&uprobe->arch, regs)) goto out; - handler_chain(uprobe, regs); + handler_chain(uprobe, regs, &is_unique); + + if (is_unique && instruction_pointer(regs) != bp_vaddr) + goto out; /* Try to optimize after first hit. */ arch_uprobe_optimize(&uprobe->arch, bp_vaddr); @@ -2819,7 +2826,7 @@ void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) return; if (arch_uprobe_ignore(&uprobe->arch, regs)) return; - handler_chain(uprobe, regs); + handler_chain(uprobe, regs, NULL); } /*

From 1ac7a140d814d31d088da2743b9b9bf0d60f1558 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 1 Sep 2025 18:15:24 +0200 Subject: [PATCH 58/66] perf: Add support to attach standard unique uprobe

Adding support to attach a unique uprobe through the perf uprobe PMU. Adding a new 'unique' format attribute that lets users request that the uprobe consumer be created as unique.
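From user space the request looks roughly like the sketch below (uprobe_pmu_type and func_offset are hypothetical values read from /sys/bus/event_source/devices/uprobe/type and resolved from the binary; the config bit matches the new 'unique' format attribute, and config1/config2 carry the path and offset as usual for the uprobe PMU):

  struct perf_event_attr attr = {};

  attr.size    = sizeof(attr);
  attr.type    = uprobe_pmu_type;
  attr.config  = 1ULL << 1;	/* PERF_PROBE_CONFIG_IS_UNIQUE */
  attr.config1 = (__u64)(unsigned long)"/proc/self/exe";
  attr.config2 = func_offset;

  fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC);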
Signed-off-by: Jiri Olsa --- include/linux/trace_events.h | 2 +- kernel/events/core.c | 8 ++++++-- kernel/trace/trace_event_perf.c | 4 ++-- kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_uprobe.c | 9 +++++---- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 04307a19cde30..1d35727fda27a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -877,7 +877,7 @@ extern int bpf_get_kprobe_info(const struct perf_event *event, #endif #ifdef CONFIG_UPROBE_EVENTS extern int perf_uprobe_init(struct perf_event *event, - unsigned long ref_ctr_offset, bool is_retprobe); + unsigned long ref_ctr_offset, bool is_retprobe, bool is_unique); extern void perf_uprobe_destroy(struct perf_event *event); extern int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, const char **filename, diff --git a/kernel/events/core.c b/kernel/events/core.c index 28de3baff7922..10a9341c638f3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11046,11 +11046,13 @@ EXPORT_SYMBOL_GPL(perf_tp_event); */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ + PERF_PROBE_CONFIG_IS_UNIQUE = 1U << 1, /* unique uprobe */ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); +PMU_FORMAT_ATTR(unique, "config:1"); #endif #ifdef CONFIG_KPROBE_EVENTS @@ -11114,6 +11116,7 @@ PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); static struct attribute *uprobe_attrs[] = { &format_attr_retprobe.attr, + &format_attr_unique.attr, &format_attr_ref_ctr_offset.attr, NULL, }; @@ -11144,7 +11147,7 @@ static int perf_uprobe_event_init(struct perf_event *event) { int err; unsigned long ref_ctr_offset; - bool is_retprobe; + bool is_retprobe, is_unique; if (event->attr.type != perf_uprobe.type) return -ENOENT; @@ -11159,8 +11162,9 @@ static int perf_uprobe_event_init(struct perf_event *event) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + is_unique = event->attr.config & PERF_PROBE_CONFIG_IS_UNIQUE; ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; - err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); + err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe, is_unique); if (err) return err; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a6bb7577e8c59..b4383ab21d88b 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -296,7 +296,7 @@ void perf_kprobe_destroy(struct perf_event *p_event) #ifdef CONFIG_UPROBE_EVENTS int perf_uprobe_init(struct perf_event *p_event, - unsigned long ref_ctr_offset, bool is_retprobe) + unsigned long ref_ctr_offset, bool is_retprobe, bool is_unique) { int ret; char *path = NULL; @@ -317,7 +317,7 @@ int perf_uprobe_init(struct perf_event *p_event, } tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset, - ref_ctr_offset, is_retprobe); + ref_ctr_offset, is_retprobe, is_unique); if (IS_ERR(tp_event)) { ret = PTR_ERR(tp_event); goto out; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 842383fbc03b9..92870b98b296e 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -469,7 +469,7 @@ extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); extern struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, - unsigned long 
ref_ctr_offset, bool is_return); + unsigned long ref_ctr_offset, bool is_return, bool is_unique); extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b0bcc0d8f41b..919efa91011d0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -333,7 +333,7 @@ trace_uprobe_primary_from_call(struct trace_event_call *call) * Allocate new trace_uprobe and initialize it (including uprobes). */ static struct trace_uprobe * -alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) +alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret, bool is_unique) { struct trace_uprobe *tu; int ret; @@ -356,6 +356,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; + tu->consumer.is_unique = is_unique; init_trace_uprobe_filter(tu->tp.event->filter); return tu; @@ -688,7 +689,7 @@ static int __trace_uprobe_create(int argc, const char **argv) argc -= 2; argv += 2; - tu = alloc_trace_uprobe(group, event, argc, is_return); + tu = alloc_trace_uprobe(group, event, argc, is_return, false /* not supported */); if (IS_ERR(tu)) { ret = PTR_ERR(tu); /* This must return -ENOMEM otherwise there is a bug */ @@ -1636,7 +1637,7 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) #ifdef CONFIG_PERF_EVENTS struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, - unsigned long ref_ctr_offset, bool is_return) + unsigned long ref_ctr_offset, bool is_return, bool is_unique) { enum probe_print_type ptype; struct trace_uprobe *tu; @@ -1658,7 +1659,7 @@ create_local_trace_uprobe(char *name, unsigned long offs, * duplicated name "DUMMY_EVENT" here. */ tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0, - is_return); + is_return, is_unique); if (IS_ERR(tu)) { pr_info("Failed to allocate trace_uprobe.(%d)\n", From a7b5bb48384370e391655cae417c52e54325870f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 2 Sep 2025 12:40:47 +0200 Subject: [PATCH 59/66] bpf: Add support to attach uprobe_multi unique uprobe Adding support to attach unique uprobe through uprobe multi link interface. Adding new BPF_F_UPROBE_MULTI_UNIQUE flag that denotes the unique uprobe creation. Signed-off-by: Jiri Olsa --- include/uapi/linux/bpf.h | 3 ++- kernel/trace/bpf_trace.c | 7 ++++++- tools/include/uapi/linux/bpf.h | 3 ++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 233de8677382e..3de9eb469fe25 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1300,7 +1300,8 @@ enum { * BPF_TRACE_UPROBE_MULTI attach type to create return probe. 
*/ enum { - BPF_F_UPROBE_MULTI_RETURN = (1U << 0) + BPF_F_UPROBE_MULTI_RETURN = (1U << 0), + BPF_F_UPROBE_MULTI_UNIQUE = (1U << 1), }; /* link_create.netfilter.flags used in LINK_CREATE command for diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 606007c387c52..5172c5da33fa4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3354,7 +3354,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; flags = attr->link_create.uprobe_multi.flags; - if (flags & ~BPF_F_UPROBE_MULTI_RETURN) + if (flags & ~(BPF_F_UPROBE_MULTI_RETURN|BPF_F_UPROBE_MULTI_UNIQUE)) + return -EINVAL; + + if (is_uprobe_session(prog) && (flags & BPF_F_UPROBE_MULTI_UNIQUE)) return -EINVAL; /* @@ -3428,6 +3431,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr uprobes[i].link = link; + if (flags & BPF_F_UPROBE_MULTI_UNIQUE) + uprobes[i].consumer.is_unique = true; if (!(flags & BPF_F_UPROBE_MULTI_RETURN)) uprobes[i].consumer.handler = uprobe_multi_link_handler; if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 233de8677382e..3de9eb469fe25 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1300,7 +1300,8 @@ enum { * BPF_TRACE_UPROBE_MULTI attach type to create return probe. */ enum { - BPF_F_UPROBE_MULTI_RETURN = (1U << 0) + BPF_F_UPROBE_MULTI_RETURN = (1U << 0), + BPF_F_UPROBE_MULTI_UNIQUE = (1U << 1), }; /* link_create.netfilter.flags used in LINK_CREATE command for From 778c32edbd008019bc4cb5cb6e00c4dfa87877ff Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 31 Jul 2025 22:49:52 +0200 Subject: [PATCH 60/66] bpf: Allow uprobe program to change context registers Currently uprobe (BPF_PROG_TYPE_KPROBE) program can't write to the context registers data. While this makes sense for kprobe attachments, for uprobe attachment it might make sense to be able to change user space registers to alter application execution. Since uprobe and kprobe programs share the same type (BPF_PROG_TYPE_KPROBE), we can't deny write access to context during the program load. We need to check on it during program attachment to see if it's going to be kprobe or uprobe. Storing the program's write attempt to context and checking on it during the attachment. Signed-off-by: Jiri Olsa --- include/linux/bpf.h | 1 + kernel/events/core.c | 4 ++++ kernel/trace/bpf_trace.c | 3 +-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8f6e87f0f3a89..5e40ca25baa58 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1624,6 +1624,7 @@ struct bpf_prog_aux { bool priv_stack_requested; bool changes_pkt_data; bool might_sleep; + bool kprobe_write_ctx; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; diff --git a/kernel/events/core.c b/kernel/events/core.c index 10a9341c638f3..74bb5008a1c45 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11242,6 +11242,10 @@ static int __perf_event_set_bpf_prog(struct perf_event *event, if (prog->kprobe_override && !is_kprobe) return -EINVAL; + /* Writing to context allowed only for uprobes. 
*/ + if (prog->aux->kprobe_write_ctx && !is_uprobe) + return -EINVAL; + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5172c5da33fa4..b66c91e39961e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1521,8 +1521,6 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type { if (off < 0 || off >= sizeof(struct pt_regs)) return false; - if (type != BPF_READ) - return false; if (off % size != 0) return false; /* @@ -1532,6 +1530,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type if (off + size > sizeof(struct pt_regs)) return false; + prog->aux->kprobe_write_ctx |= type == BPF_WRITE; return true; } From aa642491f7383a1a44972cb2aae9805b86041a38 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 2 Sep 2025 12:41:10 +0200 Subject: [PATCH 61/66] libbpf: Add support to attach unique uprobe_multi uprobe Adding support to attach unique uprobe multi by adding the 'unique' bool flag to struct bpf_uprobe_multi_opts. Also adding "uprobe.unique[.s]" auto load sections to create uprobe_multi unique uprobe. Signed-off-by: Jiri Olsa --- tools/lib/bpf/libbpf.c | 10 +++++++++- tools/lib/bpf/libbpf.h | 4 +++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index fe4fc5438678c..94a2dd94814dc 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9535,6 +9535,8 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("uprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), SEC_DEF("uretprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), SEC_DEF("uprobe.session+", KPROBE, BPF_TRACE_UPROBE_SESSION, SEC_NONE, attach_uprobe_multi), + SEC_DEF("uprobe.unique+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), + SEC_DEF("uprobe.unique.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), SEC_DEF("uprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), SEC_DEF("uretprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), SEC_DEF("uprobe.session.s+", KPROBE, BPF_TRACE_UPROBE_SESSION, SEC_SLEEPABLE, attach_uprobe_multi), @@ -11890,6 +11892,7 @@ static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, stru case 3: opts.session = str_has_pfx(probe_type, "uprobe.session"); opts.retprobe = str_has_pfx(probe_type, "uretprobe.multi"); + opts.unique = str_has_pfx(probe_type, "uprobe.unique"); *link = bpf_program__attach_uprobe_multi(prog, -1, binary_path, func_name, &opts); ret = libbpf_get_error(*link); @@ -12126,10 +12129,10 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, LIBBPF_OPTS(bpf_link_create_opts, lopts); unsigned long *resolved_offsets = NULL; enum bpf_attach_type attach_type; + bool retprobe, session, unique; int err = 0, link_fd, prog_fd; struct bpf_link *link = NULL; char full_path[PATH_MAX]; - bool retprobe, session; const __u64 *cookies; const char **syms; size_t cnt; @@ -12151,6 +12154,7 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, cnt = OPTS_GET(opts, cnt, 0); retprobe = OPTS_GET(opts, retprobe, false); session = OPTS_GET(opts, session, false); + unique = OPTS_GET(opts, unique, false); /* * User can specify 2 mutually exclusive set of inputs: @@ -12182,6 +12186,9 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, if 
(retprobe && session) return libbpf_err_ptr(-EINVAL); + if (unique && session) + return libbpf_err_ptr(-EINVAL); + if (func_pattern) { if (!strchr(path, '/')) { err = resolve_full_path(path, full_path, sizeof(full_path)); @@ -12213,6 +12220,7 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, lopts.uprobe_multi.cookies = cookies; lopts.uprobe_multi.cnt = cnt; lopts.uprobe_multi.flags = retprobe ? BPF_F_UPROBE_MULTI_RETURN : 0; + lopts.uprobe_multi.flags |= unique ? BPF_F_UPROBE_MULTI_UNIQUE : 0; if (pid == 0) pid = getpid(); diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 2e91148d9b44d..429f03c48ad3d 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -621,10 +621,12 @@ struct bpf_uprobe_multi_opts { bool retprobe; /* create session kprobes */ bool session; + /* create unique uprobe */ + bool unique; size_t :0; }; -#define bpf_uprobe_multi_opts__last_field session +#define bpf_uprobe_multi_opts__last_field unique /** * @brief **bpf_program__attach_uprobe_multi()** attaches a BPF program From db168c9d3b6407a49e265597b8703b2ba5f239b6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 1 Sep 2025 18:37:36 +0200 Subject: [PATCH 62/66] libbpf: Add support to attach generic unique uprobe Signed-off-by: Jiri Olsa --- tools/lib/bpf/libbpf.c | 29 ++++++++++++++++++++++++----- tools/lib/bpf/libbpf.h | 4 +++- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 94a2dd94814dc..ef11f78697ec4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11055,11 +11055,19 @@ static int determine_uprobe_retprobe_bit(void) return parse_uint_from_file(file, "config:%d\n"); } +static int determine_uprobe_unique_bit(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/format/unique"; + + return parse_uint_from_file(file, "config:%d\n"); +} + #define PERF_UPROBE_REF_CTR_OFFSET_BITS 32 #define PERF_UPROBE_REF_CTR_OFFSET_SHIFT 32 static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, - uint64_t offset, int pid, size_t ref_ctr_off) + uint64_t offset, int pid, size_t ref_ctr_off, + bool unique) { const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; @@ -11090,6 +11098,16 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, } attr.config |= 1 << bit; } + if (uprobe && unique) { + int bit = determine_uprobe_unique_bit(); + + if (bit < 0) { + pr_warn("failed to determine uprobe unique bit: %s\n", + errstr(bit)); + return bit; + } + attr.config |= 1 << bit; + } attr.size = attr_sz; attr.type = type; attr.config |= (__u64)ref_ctr_off << PERF_UPROBE_REF_CTR_OFFSET_SHIFT; @@ -11296,7 +11314,7 @@ int probe_kern_syscall_wrapper(int token_fd) if (determine_kprobe_perf_type() >= 0) { int pfd; - pfd = perf_event_open_probe(false, false, syscall_name, 0, getpid(), 0); + pfd = perf_event_open_probe(false, false, syscall_name, 0, getpid(), 0, false); if (pfd >= 0) close(pfd); @@ -11358,7 +11376,7 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, if (!legacy) { pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name, offset, - -1 /* pid */, 0 /* ref_ctr_off */); + -1 /* pid */, 0 /* ref_ctr_off */, false /* unique */); } else { char probe_name[MAX_EVENT_NAME_LEN]; @@ -12264,7 +12282,7 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, struct bpf_link *link; size_t ref_ctr_off; int pfd, err; - bool retprobe, legacy; + bool retprobe, legacy, unique; const 
char *func_name; if (!OPTS_VALID(opts, bpf_uprobe_opts)) @@ -12274,6 +12292,7 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, retprobe = OPTS_GET(opts, retprobe, false); ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0); pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + unique = OPTS_GET(opts, unique, false); if (!binary_path) return libbpf_err_ptr(-EINVAL); @@ -12334,7 +12353,7 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, if (!legacy) { pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, - func_offset, pid, ref_ctr_off); + func_offset, pid, ref_ctr_off, unique); } else { char probe_name[MAX_EVENT_NAME_LEN]; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 429f03c48ad3d..c7ab9de8cc906 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -726,9 +726,11 @@ struct bpf_uprobe_opts { const char *func_name; /* uprobe attach mode */ enum probe_attach_mode attach_mode; + /* create unique uprobe */ + bool unique; size_t :0; }; -#define bpf_uprobe_opts__last_field attach_mode +#define bpf_uprobe_opts__last_field unique /** * @brief **bpf_program__attach_uprobe()** attaches a BPF program From bf502d82e020da7eb6ca11d26acd6d7e4c1d0c62 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Aug 2025 22:32:26 +0200 Subject: [PATCH 63/66] selftests/bpf: Add uprobe multi context registers changes test Adding test to check we can change common register values through uprobe program. It's x86_64 specific test. Signed-off-by: Jiri Olsa --- .../selftests/bpf/prog_tests/bpf_cookie.c | 1 + .../bpf/prog_tests/uprobe_multi_test.c | 107 ++++++++++++++++++ .../selftests/bpf/progs/uprobe_multi.c | 24 ++++ 3 files changed, 132 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 4a0670c056bad..2425fd26d1ea1 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 2ee17ef1dae28..8ee9ad4f90442 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -2,6 +2,7 @@ #include #include +#include #include #include "uprobe_multi.skel.h" #include "uprobe_multi_bench.skel.h" @@ -1340,6 +1341,111 @@ static void test_bench_attach_usdt(void) printf("%s: detached in %7.3lfs\n", __func__, detach_delta); } +#ifdef __x86_64__ +__naked __maybe_unused unsigned long uprobe_regs_change_trigger(void) +{ + asm volatile ( + "ret\n" + ); +} + +static __naked void uprobe_regs_change(struct pt_regs *before, struct pt_regs *after) +{ + asm volatile ( + "movq %r11, 48(%rdi)\n" + "movq %r10, 56(%rdi)\n" + "movq %r9, 64(%rdi)\n" + "movq %r8, 72(%rdi)\n" + "movq %rax, 80(%rdi)\n" + "movq %rcx, 88(%rdi)\n" + "movq %rdx, 96(%rdi)\n" + "movq %rsi, 104(%rdi)\n" + "movq %rdi, 112(%rdi)\n" + + /* save 2nd argument */ + "pushq %rsi\n" + "call uprobe_regs_change_trigger\n" + + /* save return value and load 2nd argument pointer to rax */ + "pushq %rax\n" + "movq 8(%rsp), %rax\n" + + "movq %r11, 48(%rax)\n" + "movq %r10, 56(%rax)\n" + "movq %r9, 64(%rax)\n" + "movq %r8, 72(%rax)\n" + "movq %rcx, 88(%rax)\n" + "movq %rdx, 96(%rax)\n" + "movq %rsi, 104(%rax)\n" + "movq %rdi, 112(%rax)\n" + + /* restore 
return value and 2nd argument */
+	"pop %rax\n"
+	"pop %rsi\n"
+
+	"movq %rax, 80(%rsi)\n"
+	"ret\n"
+	);
+}
+
+static void unique_regs_common(void)
+{
+	struct pt_regs before = {}, after = {}, expected = {
+		.rax = 0xc0ffe,
+		.rcx = 0xbad,
+		.rdx = 0xdead,
+		.r8 = 0x8,
+		.r9 = 0x9,
+		.r10 = 0x10,
+		.r11 = 0x11,
+		.rdi = 0x12,
+		.rsi = 0x13,
+	};
+	LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
+		.unique = true,
+	);
+	struct uprobe_multi *skel;
+
+	skel = uprobe_multi__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	skel->bss->pid = getpid();
+	skel->bss->regs = expected;
+
+	skel->links.uprobe_change_regs = bpf_program__attach_uprobe_multi(
+						skel->progs.uprobe_change_regs,
+						-1, "/proc/self/exe",
+						"uprobe_regs_change_trigger",
+						&opts);
+	if (!ASSERT_OK_PTR(skel->links.uprobe_change_regs, "bpf_program__attach_uprobe_multi"))
+		goto cleanup;
+
+	uprobe_regs_change(&before, &after);
+
+	ASSERT_EQ(after.rax, expected.rax, "ax");
+	ASSERT_EQ(after.rcx, expected.rcx, "cx");
+	ASSERT_EQ(after.rdx, expected.rdx, "dx");
+	ASSERT_EQ(after.r8, expected.r8, "r8");
+	ASSERT_EQ(after.r9, expected.r9, "r9");
+	ASSERT_EQ(after.r10, expected.r10, "r10");
+	ASSERT_EQ(after.r11, expected.r11, "r11");
+	ASSERT_EQ(after.rdi, expected.rdi, "rdi");
+	ASSERT_EQ(after.rsi, expected.rsi, "rsi");
+
+cleanup:
+	uprobe_multi__destroy(skel);
+}
+
+static void test_unique(void)
+{
+	if (test__start_subtest("unique_regs_common"))
+		unique_regs_common();
+}
+#else
+static void test_unique(void) { }
+#endif
+
 void test_uprobe_multi_test(void)
 {
 	if (test__start_subtest("skel_api"))
@@ -1372,5 +1478,6 @@ void test_uprobe_multi_test(void)
 		test_session_cookie_skel_api();
 	if (test__start_subtest("session_cookie_recursive"))
 		test_session_recursive_skel_api();
+	test_unique();
 	RUN_TESTS(uprobe_multi_verifier);
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi.c b/tools/testing/selftests/bpf/progs/uprobe_multi.c
index 44190efcdba21..f26f8b8409852 100644
--- a/tools/testing/selftests/bpf/progs/uprobe_multi.c
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi.c
@@ -141,3 +141,27 @@ int usdt_extra(struct pt_regs *ctx)
 	/* we need this one just to mix PID-filtered and global USDT probes */
 	return 0;
 }
+
+#if defined(__TARGET_ARCH_x86)
+struct pt_regs regs;
+
+SEC("uprobe.unique")
+int BPF_UPROBE(uprobe_change_regs)
+{
+	pid_t cur_pid = bpf_get_current_pid_tgid() >> 32;
+
+	if (cur_pid != pid)
+		return 0;
+
+	ctx->ax = regs.ax;
+	ctx->cx = regs.cx;
+	ctx->dx = regs.dx;
+	ctx->r8 = regs.r8;
+	ctx->r9 = regs.r9;
+	ctx->r10 = regs.r10;
+	ctx->r11 = regs.r11;
+	ctx->di = regs.di;
+	ctx->si = regs.si;
+	return 0;
+}
+#endif

From f26daa307023782c83b64a596ec1b2d4f028c225 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Fri, 8 Aug 2025 15:39:29 +0200
Subject: [PATCH 64/66] selftests/bpf: Add uprobe multi context ip register change test

Adding a test to check that we can change the application execution by
changing the instruction pointer from the uprobe program. It's an
x86_64-specific test.
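
To illustrate the mechanism the test relies on, a sketch only, mirroring
the uprobe_change_ip program added below ('target' stands in for the
global variable that user space fills with the replacement function's
address, like the test's 'ip' variable):

  unsigned long target;

  SEC("uprobe.unique")
  int BPF_UPROBE(redirect)
  {
  	/* User space stores the replacement function's address in
  	 * 'target' before triggering the probed function. */
  	ctx->ip = target;
  	/* When the handler returns, the task resumes at 'target'
  	 * instead of the probed instruction. */
  	return 0;
  }
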
Signed-off-by: Jiri Olsa --- .../bpf/prog_tests/uprobe_multi_test.c | 42 +++++++++++++++++++ .../selftests/bpf/progs/uprobe_multi.c | 14 +++++++ 2 files changed, 56 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 8ee9ad4f90442..1ac246f983639 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -1437,10 +1437,52 @@ static void unique_regs_common(void) uprobe_multi__destroy(skel); } +static unsigned long uprobe_regs_change_ip_1(void) +{ + return 0xc0ffee; +} + +static unsigned long uprobe_regs_change_ip_2(void) +{ + return 0xdeadbeef; +} + +static void unique_regs_ip(void) +{ + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .unique = true, + ); + struct uprobe_multi *skel; + int ret; + + skel = uprobe_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->pid = getpid(); + skel->bss->ip = (unsigned long) uprobe_regs_change_ip_2; + + skel->links.uprobe_change_ip = bpf_program__attach_uprobe_multi( + skel->progs.uprobe_change_ip, + -1, "/proc/self/exe", + "uprobe_regs_change_ip_1", + &opts); + if (!ASSERT_OK_PTR(skel->links.uprobe_change_ip, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + ret = uprobe_regs_change_ip_1(); + ASSERT_EQ(ret, 0xdeadbeef, "ret"); + +cleanup: + uprobe_multi__destroy(skel); +} + static void test_unique(void) { if (test__start_subtest("unique_regs_common")) unique_regs_common(); + if (test__start_subtest("unique_regs_ip")) + unique_regs_ip(); } #else static void test_unique(void) { } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi.c b/tools/testing/selftests/bpf/progs/uprobe_multi.c index f26f8b8409852..563fd37ed77d1 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi.c @@ -164,4 +164,18 @@ int BPF_UPROBE(uprobe_change_regs) ctx->si = regs.si; return 0; } + +unsigned long ip; + +SEC("uprobe.unique") +int BPF_UPROBE(uprobe_change_ip) +{ + pid_t cur_pid = bpf_get_current_pid_tgid() >> 32; + + if (cur_pid != pid) + return 0; + + ctx->ip = ip; + return 0; +} #endif From 269a90ea83ec95639b9e5a146960b7aafdabd18c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Aug 2025 18:46:30 +0200 Subject: [PATCH 65/66] selftests/bpf: Add uprobe multi unique attach test Signed-off-by: Jiri Olsa --- .../bpf/prog_tests/uprobe_multi_test.c | 99 +++++++++++++++++++ .../selftests/bpf/progs/uprobe_multi_unique.c | 34 +++++++ 2 files changed, 133 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/uprobe_multi_unique.c diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 1ac246f983639..3f95af4c82012 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -14,6 +14,7 @@ #include "uprobe_multi_session_cookie.skel.h" #include "uprobe_multi_session_recursive.skel.h" #include "uprobe_multi_verifier.skel.h" +#include "uprobe_multi_unique.skel.h" #include "bpf/libbpf_internal.h" #include "testing_helpers.h" #include "../sdt.h" @@ -1477,12 +1478,110 @@ static void unique_regs_ip(void) uprobe_multi__destroy(skel); } +static void unique_attach(void) +{ + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .unique = true, + ); + struct bpf_link *link_1, *link_2 = NULL; + struct bpf_program *prog_1, *prog_2; + struct 
uprobe_multi_unique *skel;
+
+	skel = uprobe_multi_unique__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "uprobe_multi_unique__open_and_load"))
+		return;
+
+	skel->bss->my_pid = getpid();
+
+	prog_1 = skel->progs.test1;
+	prog_2 = skel->progs.test2;
+
+	/* not-unique and unique */
+	link_1 = bpf_program__attach_uprobe_multi(prog_1, -1, "/proc/self/exe",
+						  "uprobe_multi_func_1",
+						  NULL);
+	if (!ASSERT_OK_PTR(link_1, "bpf_program__attach_uprobe_multi"))
+		goto cleanup;
+
+	link_2 = bpf_program__attach_uprobe_multi(prog_2, -1, "/proc/self/exe",
+						  "uprobe_multi_func_1",
+						  &opts);
+	if (!ASSERT_ERR_PTR(link_2, "bpf_program__attach_uprobe_multi")) {
+		bpf_link__destroy(link_2);
+		goto cleanup;
+	}
+
+	bpf_link__destroy(link_1);
+
+	/* unique and unique */
+	link_1 = bpf_program__attach_uprobe_multi(prog_1, -1, "/proc/self/exe",
+						  "uprobe_multi_func_1",
+						  &opts);
+	if (!ASSERT_OK_PTR(link_1, "bpf_program__attach_uprobe_multi"))
+		goto cleanup;
+
+	link_2 = bpf_program__attach_uprobe_multi(prog_2, -1, "/proc/self/exe",
+						  "uprobe_multi_func_1",
+						  &opts);
+	if (!ASSERT_ERR_PTR(link_2, "bpf_program__attach_uprobe_multi")) {
+		bpf_link__destroy(link_2);
+		goto cleanup;
+	}
+
+	bpf_link__destroy(link_1);
+
+	/* unique and not-unique */
+	link_1 = bpf_program__attach_uprobe_multi(prog_1, -1, "/proc/self/exe",
+						  "uprobe_multi_func_1",
+						  &opts);
+	if (!ASSERT_OK_PTR(link_1, "bpf_program__attach_uprobe_multi"))
+		goto cleanup;
+
+	link_2 = bpf_program__attach_uprobe_multi(prog_2, -1, "/proc/self/exe",
+						  "uprobe_multi_func_1",
+						  NULL);
+	if (!ASSERT_ERR_PTR(link_2, "bpf_program__attach_uprobe_multi")) {
+		bpf_link__destroy(link_2);
+		goto cleanup;
+	}
+
+	bpf_link__destroy(link_1);
+
+	/* not-unique and not-unique */
+	link_1 = bpf_program__attach_uprobe_multi(prog_1, -1, "/proc/self/exe",
+						  "uprobe_multi_func_1",
+						  NULL);
+	if (!ASSERT_OK_PTR(link_1, "bpf_program__attach_uprobe_multi"))
+		goto cleanup;
+
+	link_2 = bpf_program__attach_uprobe_multi(prog_2, -1, "/proc/self/exe",
+						  "uprobe_multi_func_1",
+						  NULL);
+	if (!ASSERT_OK_PTR(link_2, "bpf_program__attach_uprobe_multi")) {
+		bpf_link__destroy(link_1);
+		goto cleanup;
+	}
+
+	uprobe_multi_func_1();
+
+	ASSERT_EQ(skel->bss->test1_result, 1, "test1_result");
+	ASSERT_EQ(skel->bss->test2_result, 1, "test2_result");
+
+	bpf_link__destroy(link_1);
+	bpf_link__destroy(link_2);
+
+cleanup:
+	uprobe_multi_unique__destroy(skel);
+}
+
 static void test_unique(void)
 {
 	if (test__start_subtest("unique_regs_common"))
 		unique_regs_common();
 	if (test__start_subtest("unique_regs_ip"))
 		unique_regs_ip();
+	if (test__start_subtest("unique_attach"))
+		unique_attach();
 }
 #else
 static void test_unique(void) { }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_unique.c b/tools/testing/selftests/bpf/progs/uprobe_multi_unique.c
new file mode 100644
index 0000000000000..e31e17bd85ea2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_unique.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+pid_t my_pid = 0;
+
+int test1_result = 0;
+int test2_result = 0;
+
+SEC("uprobe.multi")
+int BPF_UPROBE(test1)
+{
+	pid_t pid = bpf_get_current_pid_tgid() >> 32;
+
+	if (pid != my_pid)
+		return 0;
+
+	test1_result = 1;
+	return 0;
+}
+
+SEC("uprobe.multi")
+int BPF_UPROBE(test2)
+{
+	pid_t pid = bpf_get_current_pid_tgid() >> 32;
+
+	if (pid != my_pid)
+		return 0;
+
+	test2_result = 1;
+	return 0;
+}

From ef664af706085dc986c9dc0681026a7f6974e169 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Mon, 1 Sep 2025 22:47:54 
+0200 Subject: [PATCH 66/66] selftests/bpf: Add uprobe unique attach test Signed-off-by: Jiri Olsa --- .../testing/selftests/bpf/prog_tests/uprobe.c | 111 +++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe.c b/tools/testing/selftests/bpf/prog_tests/uprobe.c index cf3e0e7a64fae..342fcdb207aa7 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe.c @@ -33,7 +33,7 @@ static int urand_trigger(FILE **urand_pipe) return exit_code; } -void test_uprobe(void) +static void test_uprobe_urandlib(void) { LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); struct test_uprobe *skel; @@ -93,3 +93,112 @@ void test_uprobe(void) pclose(urand_pipe); test_uprobe__destroy(skel); } + +static noinline void uprobe_unique_trigger(void) +{ + asm volatile (""); +} + +static void test_uprobe_unique(void) +{ + LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts, + .func_name = "uprobe_unique_trigger", + ); + struct bpf_link *link_1, *link_2 = NULL; + struct bpf_program *prog_1, *prog_2; + struct test_uprobe *skel; + + skel = test_uprobe__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_uprobe__open_and_load")) + return; + + skel->bss->my_pid = getpid(); + + prog_1 = skel->progs.test1; + prog_2 = skel->progs.test2; + + /* not-unique and unique */ + uprobe_opts.unique = false; + link_1 = bpf_program__attach_uprobe_opts(prog_1, -1, "/proc/self/exe", + 0 /* offset */, &uprobe_opts); + if (!ASSERT_OK_PTR(link_1, "bpf_program__attach_uprobe_opts_1")) + goto cleanup; + + uprobe_opts.unique = true; + link_2 = bpf_program__attach_uprobe_opts(prog_2, -1, "/proc/self/exe", + 0 /* offset */, &uprobe_opts); + if (!ASSERT_ERR_PTR(link_2, "bpf_program__attach_uprobe_opts_2")) { + bpf_link__destroy(link_2); + goto cleanup; + } + + bpf_link__destroy(link_1); + + /* unique and unique */ + uprobe_opts.unique = true; + link_1 = bpf_program__attach_uprobe_opts(prog_1, -1, "/proc/self/exe", + 0 /* offset */, &uprobe_opts); + if (!ASSERT_OK_PTR(link_1, "bpf_program__attach_uprobe_opts_1")) + goto cleanup; + + uprobe_opts.unique = true; + link_2 = bpf_program__attach_uprobe_opts(prog_2, -1, "/proc/self/exe", + 0 /* offset */, &uprobe_opts); + if (!ASSERT_ERR_PTR(link_2, "bpf_program__attach_uprobe_opts_2")) { + bpf_link__destroy(link_2); + goto cleanup; + } + + bpf_link__destroy(link_1); + + /* unique and not-unique */ + uprobe_opts.unique = true; + link_1 = bpf_program__attach_uprobe_opts(prog_1, -1, "/proc/self/exe", + 0 /* offset */, &uprobe_opts); + if (!ASSERT_OK_PTR(link_1, "bpf_program__attach_uprobe_opts_1")) + goto cleanup; + + uprobe_opts.unique = false; + link_2 = bpf_program__attach_uprobe_opts(prog_2, -1, "/proc/self/exe", + 0 /* offset */, &uprobe_opts); + if (!ASSERT_ERR_PTR(link_2, "bpf_program__attach_uprobe_opts_2")) { + bpf_link__destroy(link_2); + goto cleanup; + } + + bpf_link__destroy(link_1); + + /* not-unique and not-unique */ + uprobe_opts.unique = false; + link_1 = bpf_program__attach_uprobe_opts(prog_1, -1, "/proc/self/exe", + 0 /* offset */, &uprobe_opts); + if (!ASSERT_OK_PTR(link_1, "bpf_program__attach_uprobe_opts_1")) + goto cleanup; + + uprobe_opts.unique = false; + link_2 = bpf_program__attach_uprobe_opts(prog_2, -1, "/proc/self/exe", + 0 /* offset */, &uprobe_opts); + if (!ASSERT_OK_PTR(link_2, "bpf_program__attach_uprobe_opts_2")) { + bpf_link__destroy(link_1); + goto cleanup; + } + + uprobe_unique_trigger(); + + ASSERT_EQ(skel->bss->test1_result, 1, "test1_result"); + 
ASSERT_EQ(skel->bss->test2_result, 1, "test2_result"); + + bpf_link__destroy(link_1); + bpf_link__destroy(link_2); + +cleanup: + test_uprobe__destroy(skel); +} + +void test_uprobe(void) +{ + if (test__start_subtest("urandlib")) + test_uprobe_urandlib(); + if (test__start_subtest("unique")) + test_uprobe_unique(); +}
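
For reference, a minimal user-space sketch of consuming this series end
to end with the new 'unique' attach mode. The skeleton name 'myprog',
the program name 'fixup', and the binary/function names are
illustrative placeholders, not taken from the patches above; only the
'unique' opt and the attach API come from the series itself:

  #include <bpf/libbpf.h>
  #include "myprog.skel.h"	/* hypothetical skeleton with an SEC("uprobe.unique") prog */

  int main(void)
  {
  	LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
  		.unique = true,	/* request exclusive consumer with register write access */
  	);
  	struct myprog *skel = myprog__open_and_load();
  	struct bpf_link *link;

  	if (!skel)
  		return 1;

  	/* Fails if any other consumer, unique or not, is already
  	 * attached at this address. */
  	link = bpf_program__attach_uprobe_multi(skel->progs.fixup, -1,
  						"/usr/bin/app", "target_func",
  						&opts);
  	if (!link) {
  		myprog__destroy(skel);
  		return 1;
  	}

  	/* ... run the workload; 'fixup' may now write ctx registers ... */

  	bpf_link__destroy(link);
  	myprog__destroy(skel);
  	return 0;
  }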