From 3bfee4d47e11cc6bdd611133163b5aa64e1b15c6 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 2 Aug 2024 14:55:55 +0100 Subject: [PATCH 01/28] clockevents/drivers/i8253: Fix stop sequence for timer 0 commit 531b2ca0a940ac9db03f246c8b77c4201de72b00 upstream. According to the data sheet, writing the MODE register should stop the counter (and thus the interrupts). This appears to work on real hardware, at least modern Intel and AMD systems. It should also work on Hyper-V. However, on some buggy virtual machines the mode change doesn't have any effect until the counter is subsequently loaded (or perhaps when the IRQ next fires). So, set MODE 0 and then load the counter, to ensure that those buggy VMs do the right thing and the interrupts stop. And then write MODE 0 *again* to stop the counter on compliant implementations too. Apparently, Hyper-V keeps firing the IRQ *repeatedly* even in mode zero when it should only happen once, but the second MODE write stops that too. Userspace test program (mostly written by tglx): ===== #include #include #include #include #include static __always_inline void __out##bwl(type value, uint16_t port) \ { \ asm volatile("out" #bwl " %" #bw "0, %w1" \ : : "a"(value), "Nd"(port)); \ } \ \ static __always_inline type __in##bwl(uint16_t port) \ { \ type value; \ asm volatile("in" #bwl " %w1, %" #bw "0" \ : "=a"(value) : "Nd"(port)); \ return value; \ } BUILDIO(b, b, uint8_t) #define inb __inb #define outb __outb #define PIT_MODE 0x43 #define PIT_CH0 0x40 #define PIT_CH2 0x42 static int is8254; static void dump_pit(void) { if (is8254) { // Latch and output counter and status outb(0xC2, PIT_MODE); printf("%02x %02x %02x\n", inb(PIT_CH0), inb(PIT_CH0), inb(PIT_CH0)); } else { // Latch and output counter outb(0x0, PIT_MODE); printf("%02x %02x\n", inb(PIT_CH0), inb(PIT_CH0)); } } int main(int argc, char* argv[]) { int nr_counts = 2; if (argc > 1) nr_counts = atoi(argv[1]); if (argc > 2) is8254 = 1; if (ioperm(0x40, 4, 1) != 0) return 1; dump_pit(); printf("Set oneshot\n"); outb(0x38, PIT_MODE); outb(0x00, PIT_CH0); outb(0x0F, PIT_CH0); dump_pit(); usleep(1000); dump_pit(); printf("Set periodic\n"); outb(0x34, PIT_MODE); outb(0x00, PIT_CH0); outb(0x0F, PIT_CH0); dump_pit(); usleep(1000); dump_pit(); dump_pit(); usleep(100000); dump_pit(); usleep(100000); dump_pit(); printf("Set stop (%d counter writes)\n", nr_counts); outb(0x30, PIT_MODE); while (nr_counts--) outb(0xFF, PIT_CH0); dump_pit(); usleep(100000); dump_pit(); usleep(100000); dump_pit(); printf("Set MODE 0\n"); outb(0x30, PIT_MODE); dump_pit(); usleep(100000); dump_pit(); usleep(100000); dump_pit(); return 0; } ===== Suggested-by: Sean Christopherson Co-developed-by: Li RongQing Signed-off-by: Li RongQing Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Link: https://lore.kernel.org/all/20240802135555.564941-2-dwmw2@infradead.org Signed-off-by: Greg Kroah-Hartman (cherry picked from commit de47f33dde89af53928ca860a5c4ed64fb1e4fe3) --- arch/x86/kernel/cpu/mshyperv.c | 11 ----------- drivers/clocksource/i8253.c | 36 +++++++++++++++++++++++----------- include/linux/i8253.h | 1 - 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 5ae77d966cafe..e709070eed708 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -575,16 +574,6 @@ static void __init ms_hyperv_init_platform(void) 
if (efi_enabled(EFI_BOOT)) x86_platform.get_nmi_reason = hv_get_nmi_reason; - /* - * Hyper-V VMs have a PIT emulation quirk such that zeroing the - * counter register during PIT shutdown restarts the PIT. So it - * continues to interrupt @18.2 HZ. Setting i8253_clear_counter - * to false tells pit_shutdown() not to zero the counter so that - * the PIT really is shutdown. Generation 2 VMs don't have a PIT, - * and setting this value has no effect. - */ - i8253_clear_counter_on_shutdown = false; - #if IS_ENABLED(CONFIG_HYPERV) if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || ms_hyperv.paravisor_present) diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c index cb215e6f2e834..39f7c2d736d16 100644 --- a/drivers/clocksource/i8253.c +++ b/drivers/clocksource/i8253.c @@ -20,13 +20,6 @@ DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -/* - * Handle PIT quirk in pit_shutdown() where zeroing the counter register - * restarts the PIT, negating the shutdown. On platforms with the quirk, - * platform specific code can set this to false. - */ -bool i8253_clear_counter_on_shutdown __ro_after_init = true; - #ifdef CONFIG_CLKSRC_I8253 /* * Since the PIT overflows every tick, its not very useful @@ -112,12 +105,33 @@ void clockevent_i8253_disable(void) { raw_spin_lock(&i8253_lock); + /* + * Writing the MODE register should stop the counter, according to + * the datasheet. This appears to work on real hardware (well, on + * modern Intel and AMD boxes; I didn't dig the Pegasos out of the + * shed). + * + * However, some virtual implementations differ, and the MODE change + * doesn't have any effect until either the counter is written (KVM + * in-kernel PIT) or the next interrupt (QEMU). And in those cases, + * it may not stop the *count*, only the interrupts. Although in + * the virt case, that probably doesn't matter, as the value of the + * counter will only be calculated on demand if the guest reads it; + * it's the interrupts which cause steal time. + * + * Hyper-V apparently has a bug where even in mode 0, the IRQ keeps + * firing repeatedly if the counter is running. But it *does* do the + * right thing when the MODE register is written. + * + * So: write the MODE and then load the counter, which ensures that + * the IRQ is stopped on those buggy virt implementations. And then + * write the MODE again, which is the right way to stop it. + */ outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); - if (i8253_clear_counter_on_shutdown) { - outb_p(0, PIT_CH0); - outb_p(0, PIT_CH0); - } + outb_p(0x30, PIT_MODE); raw_spin_unlock(&i8253_lock); } diff --git a/include/linux/i8253.h b/include/linux/i8253.h index bf169cfef7f12..56c280eb2d4fd 100644 --- a/include/linux/i8253.h +++ b/include/linux/i8253.h @@ -21,7 +21,6 @@ #define PIT_LATCH ((PIT_TICK_RATE + HZ/2) / HZ) extern raw_spinlock_t i8253_lock; -extern bool i8253_clear_counter_on_shutdown; extern struct clock_event_device i8253_clockevent; extern void clockevent_i8253_init(bool oneshot); extern void clockevent_i8253_disable(void); From 4933606d0ac9ce03dd76965962b723823ac953a1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 11 Apr 2024 16:39:05 +0200 Subject: [PATCH 02/28] sched/isolation: Prevent boot crash when the boot CPU is nohz_full Documentation/timers/no_hz.rst states that the "nohz_full=" mask must not include the boot CPU, which is no longer true after: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full"). 
However after: aae17ebb53cd ("workqueue: Avoid using isolated cpus' timers on queue_delayed_work") the kernel will crash at boot time in this case; housekeeping_any_cpu() returns an invalid CPU number until smp_init() brings the first housekeeping CPU up. Change housekeeping_any_cpu() to check the result of cpumask_any_and() and return smp_processor_id() in this case. This is just the simple and backportable workaround which fixes the symptom, but smp_processor_id() at boot time should be safe at least for type == HK_TYPE_TIMER, this more or less matches the tick_do_timer_boot_cpu logic. There is no worry about cpu_down(); tick_nohz_cpu_down() will not allow to offline tick_do_timer_cpu (the 1st online housekeeping CPU). [ Apply only documentation changes as commit which causes boot crash when boot CPU is nohz_full is not backported to stable kernels - Krishanth ] Reported-by: Chris von Recklinghausen Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Phil Auld Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240411143905.GA19288@redhat.com Closes: https://lore.kernel.org/all/20240402105847.GA24832@redhat.com/ Signed-off-by: Krishanth Jagaduri [ strip out upstream commit and Fixes: so tools don't get confused that this commit actually does anything real - gregkh] Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 6e0447fa7d872c3557d75f355ccd1e4777ea437a) --- Documentation/timers/no_hz.rst | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Documentation/timers/no_hz.rst b/Documentation/timers/no_hz.rst index f8786be15183c..7fe8ef9718d8e 100644 --- a/Documentation/timers/no_hz.rst +++ b/Documentation/timers/no_hz.rst @@ -129,11 +129,8 @@ adaptive-tick CPUs: At least one non-adaptive-tick CPU must remain online to handle timekeeping tasks in order to ensure that system calls like gettimeofday() returns accurate values on adaptive-tick CPUs. (This is not an issue for CONFIG_NO_HZ_IDLE=y because there are no running -user processes to observe slight drifts in clock rate.) Therefore, the -boot CPU is prohibited from entering adaptive-ticks mode. Specifying a -"nohz_full=" mask that includes the boot CPU will result in a boot-time -error message, and the boot CPU will be removed from the mask. Note that -this means that your system must have at least two CPUs in order for +user processes to observe slight drifts in clock rate.) Note that this +means that your system must have at least two CPUs in order for CONFIG_NO_HZ_FULL=y to do anything for you. Finally, adaptive-ticks CPUs must have their RCU callbacks offloaded. From 9b34ec8fcadb1f24820bf55a3f1187ce95cf1e96 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 8 Nov 2024 18:01:47 +0800 Subject: [PATCH 03/28] zram: fix NULL pointer in comp_algorithm_show() commit f364cdeb38938f9d03061682b8ff3779dd1730e5 upstream. 
LTP reported a NULL pointer dereference as followed: CPU: 7 UID: 0 PID: 5995 Comm: cat Kdump: loaded Not tainted 6.12.0-rc6+ #3 Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __pi_strcmp+0x24/0x140 lr : zcomp_available_show+0x60/0x100 [zram] sp : ffff800088b93b90 x29: ffff800088b93b90 x28: 0000000000000001 x27: 0000000000400cc0 x26: 0000000000000ffe x25: ffff80007b3e2388 x24: 0000000000000000 x23: ffff80007b3e2390 x22: ffff0004041a9000 x21: ffff80007b3e2900 x20: 0000000000000000 x19: 0000000000000000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: ffff80007b3e2900 x9 : ffff80007b3cb280 x8 : 0101010101010101 x7 : 0000000000000000 x6 : 0000000000000000 x5 : 0000000000000040 x4 : 0000000000000000 x3 : 00656c722d6f7a6c x2 : 0000000000000000 x1 : ffff80007b3e2900 x0 : 0000000000000000 Call trace: __pi_strcmp+0x24/0x140 comp_algorithm_show+0x40/0x70 [zram] dev_attr_show+0x28/0x80 sysfs_kf_seq_show+0x90/0x140 kernfs_seq_show+0x34/0x48 seq_read_iter+0x1d4/0x4e8 kernfs_fop_read_iter+0x40/0x58 new_sync_read+0x9c/0x168 vfs_read+0x1a8/0x1f8 ksys_read+0x74/0x108 __arm64_sys_read+0x24/0x38 invoke_syscall+0x50/0x120 el0_svc_common.constprop.0+0xc8/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x38/0x138 el0t_64_sync_handler+0xc0/0xc8 el0t_64_sync+0x188/0x190 The zram->comp_algs[ZRAM_PRIMARY_COMP] can be NULL in zram_add() if comp_algorithm_set() has not been called. User can access the zram device by sysfs after device_add_disk(), so there is a time window to trigger the NULL pointer dereference. Move it ahead device_add_disk() to make sure when user can access the zram device, it is ready. comp_algorithm_set() is protected by zram->init_lock in other places and no such problem. 
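As a minimal illustration of the ordering rule the fix relies on (the names below are made up, this is not the zram code): every field that a sysfs ->show() handler dereferences must be valid before the device is published, because attributes become reachable as soon as device_add_disk() returns.

#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/sysfs.h>

struct mydev {
        const char *alg;                /* read by alg_show() via sysfs */
};

static ssize_t alg_show(struct device *dev, struct device_attribute *attr,
                        char *buf)
{
        struct mydev *md = dev_get_drvdata(dev);

        /* Would be a NULL dereference if ->alg were still unset here. */
        return sysfs_emit(buf, "%s\n", md->alg);
}
static DEVICE_ATTR_RO(alg);

static int mydev_publish(struct mydev *md, struct gendisk *disk)
{
        md->alg = "default";                      /* 1: make state valid */
        return device_add_disk(NULL, disk, NULL); /* 2: then expose sysfs */
}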
Link: https://lkml.kernel.org/r/20241108100147.3776123-1-liushixin2@huawei.com Fixes: 7ac07a26dea7 ("zram: preparation for multi-zcomp support") Signed-off-by: Liu Shixin Reviewed-by: Sergey Senozhatsky Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton [This fix does not backport zram_comp_params_reset which was introduced after v6.6, in commit f2bac7ad187d ("zram: introduce zcomp_params structure")] Signed-off-by: Jianqi Ren Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman (cherry picked from commit c7ee791e538537b281f60945298796f0a3971bbd) --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b73038ad86f7f..44cf0e51d7db6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2247,6 +2247,8 @@ static int zram_add(void) zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); + comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); + /* Actual capacity set using sysfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); /* zram devices sort of resembles non-rotational disks */ @@ -2281,8 +2283,6 @@ static int zram_add(void) if (ret) goto out_cleanup_disk; - comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); - zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; From 91eb186d03c25fd5011749f80cc8dcdb43681ea2 Mon Sep 17 00:00:00 2001 From: Felix Moessbauer Date: Wed, 14 Aug 2024 14:10:32 +0200 Subject: [PATCH 04/28] hrtimer: Use and report correct timerslack values for realtime tasks commit ed4fb6d7ef68111bb539283561953e5c6e9a6e38 upstream. The timerslack_ns setting is used to specify how much the hardware timers should be delayed, to potentially dispatch multiple timers in a single interrupt. This is a performance optimization. Timers of realtime tasks (having a realtime scheduling policy) should not be delayed. This logic was inconsitently applied to the hrtimers, leading to delays of realtime tasks which used timed waits for events (e.g. condition variables). Due to the downstream override of the slack for rt tasks, the procfs reported incorrect (non-zero) timerslack_ns values. This is changed by setting the timer_slack_ns task attribute to 0 for all tasks with a rt policy. By that, downstream users do not need to specially handle rt tasks (w.r.t. the slack), and the procfs entry shows the correct value of "0". Setting non-zero slack values (either via procfs or PR_SET_TIMERSLACK) on tasks with a rt policy is ignored, as stated in "man 2 PR_SET_TIMERSLACK": Timer slack is not applied to threads that are scheduled under a real-time scheduling policy (see sched_setscheduler(2)). The special handling of timerslack on rt tasks in downstream users is removed as well. 
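The user-visible effect can be checked from userspace with prctl(); a small test along these lines (illustrative only, and it needs the privileges required for SCHED_FIFO) should read back a slack of 0 once the task is realtime, with PR_SET_TIMERSLACK silently ignored:

#include <sched.h>
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 1 };

        printf("default slack: %d ns\n", prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0));

        if (sched_setscheduler(0, SCHED_FIFO, &sp))
                perror("sched_setscheduler");

        /* Per man 2 prctl, slack is not applied to realtime threads. */
        prctl(PR_SET_TIMERSLACK, 50000, 0, 0, 0);
        printf("rt slack: %d ns\n", prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0));
        return 0;
}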
Signed-off-by: Felix Moessbauer Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240814121032.368444-2-felix.moessbauer@siemens.com Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 472173544e74709590b3827a9a32e7ce0387624f) --- fs/proc/base.c | 9 +++++---- fs/select.c | 11 ++++------- kernel/sched/core.c | 8 ++++++++ kernel/sys.c | 2 ++ kernel/time/hrtimer.c | 18 +++--------------- 5 files changed, 22 insertions(+), 26 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 699f085d4de7d..91fe20b7657c0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2633,10 +2633,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, } task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; + if (task_is_realtime(p)) + slack_ns = 0; + else if (slack_ns == 0) + slack_ns = p->default_timer_slack_ns; + p->timer_slack_ns = slack_ns; task_unlock(p); out: diff --git a/fs/select.c b/fs/select.c index 3f730b8581f65..e66b6189845ea 100644 --- a/fs/select.c +++ b/fs/select.c @@ -77,19 +77,16 @@ u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; + u64 slack = current->timer_slack_ns; - /* - * Realtime tasks get a slack of 0 for obvious reasons. - */ - - if (rt_task(current)) + if (slack == 0) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); - if (ret < current->timer_slack_ns) - return current->timer_slack_ns; + if (ret < slack) + return slack; return ret; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f47a1c40c3f4e..c5ad7e72ef161 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7540,6 +7540,14 @@ static void __setscheduler_params(struct task_struct *p, else if (fair_policy(policy)) p->static_prio = NICE_TO_PRIO(attr->sched_nice); + /* rt-policy tasks do not have a timerslack */ + if (task_is_realtime(p)) { + p->timer_slack_ns = 0; + } else if (p->timer_slack_ns == 0) { + /* when switching back to non-rt policy, restore timerslack */ + p->timer_slack_ns = p->default_timer_slack_ns; + } + /* * __sched_setscheduler() ensures attr->sched_priority == 0 when * !rt_policy. 
Always setting this ensures that things like diff --git a/kernel/sys.c b/kernel/sys.c index 44b5759903332..355de0b65c235 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2535,6 +2535,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: + if (task_is_realtime(current)) + break; if (arg2 <= 0) current->timer_slack_ns = current->default_timer_slack_ns; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e99b1305e1a5f..5db6912b8f6e1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2093,14 +2093,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - u64 slack; - - slack = current->timer_slack_ns; - if (rt_task(current)) - slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); + hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@ -2281,7 +2276,7 @@ void __init hrtimers_init(void) /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks + * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used */ @@ -2308,13 +2303,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } - /* - * Override any slack passed by the user if under - * rt contraints. - */ - if (rt_task(current)) - delta = 0; - hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); @@ -2334,7 +2322,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks + * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has From 79bb7a64741ca7ac6be6d4b6071def7d2623e183 Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Fri, 20 Sep 2024 16:06:59 -0300 Subject: [PATCH 05/28] bpf: Use raw_spinlock_t in ringbuf commit 8b62645b09f870d70c7910e7550289d444239a46 upstream. The function __bpf_ringbuf_reserve is invoked from a tracepoint, which disables preemption. Using spinlock_t in this context can lead to a "sleep in atomic" warning in the RT variant. This issue is illustrated in the example below: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 556208, name: test_progs preempt_count: 1, expected: 0 RCU nest depth: 1, expected: 1 INFO: lockdep is turned off. 
Preemption disabled at: [] migrate_enable+0xc0/0x39c CPU: 7 PID: 556208 Comm: test_progs Tainted: G Hardware name: Qualcomm SA8775P Ride (DT) Call trace: dump_backtrace+0xac/0x130 show_stack+0x1c/0x30 dump_stack_lvl+0xac/0xe8 dump_stack+0x18/0x30 __might_resched+0x3bc/0x4fc rt_spin_lock+0x8c/0x1a4 __bpf_ringbuf_reserve+0xc4/0x254 bpf_ringbuf_reserve_dynptr+0x5c/0xdc bpf_prog_ac3d15160d62622a_test_read_write+0x104/0x238 trace_call_bpf+0x238/0x774 perf_call_bpf_enter.isra.0+0x104/0x194 perf_syscall_enter+0x2f8/0x510 trace_sys_enter+0x39c/0x564 syscall_trace_enter+0x220/0x3c0 do_el0_svc+0x138/0x1dc el0_svc+0x54/0x130 el0t_64_sync_handler+0x134/0x150 el0t_64_sync+0x17c/0x180 Switch the spinlock to raw_spinlock_t to avoid this error. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Brian Grech Signed-off-by: Wander Lairson Costa Signed-off-by: Wander Lairson Costa Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240920190700.617253-1-wander@redhat.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jianqi Ren Signed-off-by: He Zhe (cherry picked from commit f9543375d9b150b2bcf16bb182e6b62309db0888) --- kernel/bpf/ringbuf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 528f4d6342262..6aff5ee483b60 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -29,7 +29,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - spinlock_t spinlock ____cacheline_aligned_in_smp; + raw_spinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is @@ -173,7 +173,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - spin_lock_init(&rb->spinlock); + raw_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -417,10 +417,10 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { - if (!spin_trylock_irqsave(&rb->spinlock, flags)) + if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) return NULL; } else { - spin_lock_irqsave(&rb->spinlock, flags); + raw_spin_lock_irqsave(&rb->spinlock, flags); } pend_pos = rb->pending_pos; @@ -446,7 +446,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -458,7 +458,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } From 95d3e0a36d05d9ec6e6e177011b94dfcc485434d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 May 2024 12:52:52 +0000 Subject: [PATCH 06/28] tcp: fix races in tcp_abort() commit 5ce4645c23cf5f048eb8e9ce49e514bababdee85 upstream. tcp_abort() has the same issue than the one fixed in the prior patch in tcp_write_err(). 
In order to get consistent results from tcp_poll(), we must call sk_error_report() after tcp_done(). We can use tcp_done_with_error() to centralize this logic. Fixes: c1e64e298b8c ("net: diag: Support destroying TCP sockets.") Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Link: https://lore.kernel.org/r/20240528125253.1966136-4-edumazet@google.com Signed-off-by: Jakub Kicinski [youngmin: Resolved minor conflict in net/ipv4/tcp.c] Signed-off-by: Youngmin Nam Signed-off-by: Greg Kroah-Hartman (cherry picked from commit abadaa355730fbdcef902eb36983e6011fb91e49) --- net/ipv4/tcp.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f741f17345ff9..528c4005c95d5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4630,13 +4630,9 @@ int tcp_abort(struct sock *sk, int err) bh_lock_sock(sk); if (!sock_flag(sk, SOCK_DEAD)) { - WRITE_ONCE(sk->sk_err, err); - /* This barrier is coupled with smp_rmb() in tcp_poll() */ - smp_wmb(); - sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_done(sk); + tcp_done_with_error(sk, err); } bh_unlock_sock(sk); From 120f120bbf6ebdc5e879ef7d323de90005475b07 Mon Sep 17 00:00:00 2001 From: Xueming Feng Date: Mon, 26 Aug 2024 18:23:27 +0800 Subject: [PATCH 07/28] tcp: fix forever orphan socket caused by tcp_abort commit bac76cf89816bff06c4ec2f3df97dc34e150a1c4 upstream. We have some problem closing zero-window fin-wait-1 tcp sockets in our environment. This patch come from the investigation. Previously tcp_abort only sends out reset and calls tcp_done when the socket is not SOCK_DEAD, aka orphan. For orphan socket, it will only purging the write queue, but not close the socket and left it to the timer. While purging the write queue, tp->packets_out and sk->sk_write_queue is cleared along the way. However tcp_retransmit_timer have early return based on !tp->packets_out and tcp_probe_timer have early return based on !sk->sk_write_queue. This caused ICSK_TIME_RETRANS and ICSK_TIME_PROBE0 not being resched and socket not being killed by the timers, converting a zero-windowed orphan into a forever orphan. This patch removes the SOCK_DEAD check in tcp_abort, making it send reset to peer and close the socket accordingly. Preventing the timer-less orphan from happening. According to Lorenzo's email in the v1 thread, the check was there to prevent force-closing the same socket twice. That situation is handled by testing for TCP_CLOSE inside lock, and returning -ENOENT if it is already closed. The -ENOENT code comes from the associate patch Lorenzo made for iproute2-ss; link attached below, which also conform to RFC 9293. At the end of the patch, tcp_write_queue_purge(sk) is removed because it was already called in tcp_done_with_error(). p.s. This is the same patch with v2. Resent due to mis-labeled "changes requested" on patchwork.kernel.org. 
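For readability, a condensed sketch of the resulting tcp_abort() flow as if written inside net/ipv4/tcp.c (not the full function; the real change is in the diff below, which also keeps the local_bh and BPF-context handling). The early TCP_CLOSE test takes over the "don't abort twice" role from the removed SOCK_DEAD check, so orphans now get reset and torn down like any other socket instead of being left to timers that never rearm:

static int tcp_abort_sketch(struct sock *sk, int err)
{
        lock_sock(sk);

        if (sk->sk_state == TCP_CLOSE) {        /* already closed/aborted */
                release_sock(sk);
                return -ENOENT;
        }

        if (sk->sk_state == TCP_LISTEN) {
                tcp_set_state(sk, TCP_CLOSE);
                inet_csk_listen_stop(sk);
        }

        bh_lock_sock(sk);
        if (tcp_need_reset(sk->sk_state))
                tcp_send_active_reset(sk, GFP_ATOMIC);
        tcp_done_with_error(sk, err);           /* clears timers, reports err */
        bh_unlock_sock(sk);

        release_sock(sk);
        return 0;
}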
Link: https://patchwork.ozlabs.org/project/netdev/patch/1450773094-7978-3-git-send-email-lorenzo@google.com/ Fixes: c1e64e298b8c ("net: diag: Support destroying TCP sockets.") Signed-off-by: Xueming Feng Tested-by: Lorenzo Colitti Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240826102327.1461482-1-kuro@kuroa.me Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/lkml/Z9OZS%2Fhc+v5og6%2FU@perf/ [youngmin: Resolved minor conflict in net/ipv4/tcp.c] Signed-off-by: Youngmin Nam Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 372df1f2057c389490d4ae726eef9772a3a4d5f2) --- net/ipv4/tcp.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 528c4005c95d5..15becbbfa55e2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4620,6 +4620,13 @@ int tcp_abort(struct sock *sk, int err) /* Don't race with userspace socket closes such as tcp_close. */ lock_sock(sk); + /* Avoid closing the same socket twice. */ + if (sk->sk_state == TCP_CLOSE) { + if (!has_current_bpf_ctx()) + release_sock(sk); + return -ENOENT; + } + if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); inet_csk_listen_stop(sk); @@ -4629,15 +4636,12 @@ int tcp_abort(struct sock *sk, int err) local_bh_disable(); bh_lock_sock(sk); - if (!sock_flag(sk, SOCK_DEAD)) { - if (tcp_need_reset(sk->sk_state)) - tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_done_with_error(sk, err); - } + if (tcp_need_reset(sk->sk_state)) + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done_with_error(sk, err); bh_unlock_sock(sk); local_bh_enable(); - tcp_write_queue_purge(sk); if (!has_current_bpf_ctx()) release_sock(sk); return 0; From 3ea7d707d70bac449746fd51510a4580237d7e64 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 9 Feb 2025 15:52:52 -0800 Subject: [PATCH 08/28] fbdev: hyperv_fb: iounmap() the correct memory when removing a device [ Upstream commit 7241c886a71797cc51efc6fadec7076fcf6435c2 ] When a Hyper-V framebuffer device is removed, or the driver is unbound from a device, any allocated and/or mapped memory must be released. In particular, MMIO address space that was mapped to the framebuffer must be unmapped. Current code unmaps the wrong address, resulting in an error like: [ 4093.980597] iounmap: bad address 00000000c936c05c followed by a stack dump. Commit d21987d709e8 ("video: hyperv: hyperv_fb: Support deferred IO for Hyper-V frame buffer driver") changed the kind of address stored in info->screen_base, and the iounmap() call in hvfb_putmem() was not updated accordingly. Fix this by updating hvfb_putmem() to unmap the correct address. 
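A generic sketch of the rule the fix restores (structure and function names are illustrative, not the hyperv_fb ones): iounmap() must receive exactly the cookie that ioremap() returned, so the driver keeps that pointer in its private state rather than unmapping whatever derived pointer the fb core happens to hold in info->screen_base.

#include <linux/errno.h>
#include <linux/io.h>

struct fbpar_sketch {
        void __iomem *mmio_vp;          /* cookie returned by ioremap() */
};

static int map_fb_mmio(struct fbpar_sketch *par, phys_addr_t start, size_t len)
{
        par->mmio_vp = ioremap(start, len);
        return par->mmio_vp ? 0 : -ENOMEM;
}

static void unmap_fb_mmio(struct fbpar_sketch *par)
{
        iounmap(par->mmio_vp);          /* the same pointer, not a derived one */
        par->mmio_vp = NULL;
}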
Fixes: d21987d709e8 ("video: hyperv: hyperv_fb: Support deferred IO for Hyper-V frame buffer driver") Signed-off-by: Michael Kelley Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20250209235252.2987-1-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20250209235252.2987-1-mhklinux@outlook.com> Signed-off-by: Sasha Levin (cherry picked from commit c198157ae1585495908a23e7adf578c571e78323) --- drivers/video/fbdev/hyperv_fb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index b9965cbdd7642..80e8ec36b7db2 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -1106,7 +1106,7 @@ static void hvfb_putmem(struct hv_device *hdev, struct fb_info *info) if (par->need_docopy) { vfree(par->dio_vp); - iounmap(info->screen_base); + iounmap(par->mmio_vp); vmbus_free_mmio(par->mem->start, screen_fb_size); } else { hvfb_release_phymem(hdev, info->fix.smem_start, From ddc4b735b4ad69d278e4956d04a3341c17f778c5 Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Fri, 7 Feb 2025 21:02:41 +0100 Subject: [PATCH 09/28] pinctrl: bcm281xx: Fix incorrect regmap max_registers value [ Upstream commit 68283c1cb573143c0b7515e93206f3503616bc10 ] The max_registers value does not take into consideration the stride; currently, it's set to the number of the last pin, but this does not accurately represent the final register. Fix this by multiplying the current value by 4. Fixes: 54b1aa5a5b16 ("ARM: pinctrl: Add Broadcom Capri pinctrl driver") Signed-off-by: Artur Weber Link: https://lore.kernel.org/20250207-bcm21664-pinctrl-v1-2-e7cfac9b2d3b@gmail.com Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin (cherry picked from commit 21e65f1fef649cc2949aaaa3f2bb7c05931fb63d) --- drivers/pinctrl/bcm/pinctrl-bcm281xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/bcm/pinctrl-bcm281xx.c b/drivers/pinctrl/bcm/pinctrl-bcm281xx.c index 73dbf29c002f3..cf6efa9c0364a 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm281xx.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm281xx.c @@ -974,7 +974,7 @@ static const struct regmap_config bcm281xx_pinctrl_regmap_config = { .reg_bits = 32, .reg_stride = 4, .val_bits = 32, - .max_register = BCM281XX_PIN_VC_CAM3_SDA, + .max_register = BCM281XX_PIN_VC_CAM3_SDA * 4, }; static int bcm281xx_pinctrl_get_groups_count(struct pinctrl_dev *pctldev) From b40d2a844b588d54a7c0dfcf74507fdece5cff28 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 17 Feb 2025 17:02:42 +0100 Subject: [PATCH 10/28] netfilter: nft_ct: Use __refcount_inc() for per-CPU nft_ct_pcpu_template. [ Upstream commit 5cfe5612ca9590db69b9be29dc83041dbf001108 ] nft_ct_pcpu_template is a per-CPU variable and relies on disabled BH for its locking. The refcounter is read and if its value is set to one then the refcounter is incremented and variable is used - otherwise it is already in use and left untouched. Without per-CPU locking in local_bh_disable() on PREEMPT_RT the read-then-increment operation is not atomic and therefore racy. This can be avoided by using unconditionally __refcount_inc() which will increment counter and return the old value as an atomic operation. In case the returned counter is not one, the variable is in use and we need to decrement counter. Otherwise we can use it. Use __refcount_inc() instead of read and a conditional increment. 
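A minimal sketch of the pattern (object and field names are made up): the racy variant reads the count and increments in two steps, while the fixed variant increments unconditionally with __refcount_inc(), inspects the returned old value, and backs out if the object was already in use.

#include <linux/refcount.h>
#include <linux/types.h>

struct tmpl_sketch {
        refcount_t use;         /* 1 == idle, >1 == someone is using it */
};

static bool tmpl_try_claim(struct tmpl_sketch *t)
{
        int oldcnt;

        __refcount_inc(&t->use, &oldcnt);
        if (oldcnt != 1) {
                /* Already busy: undo our increment and fall back. */
                refcount_dec(&t->use);
                return false;
        }
        return true;            /* claimed atomically, no read/inc window */
}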
Fixes: edee4f1e9245 ("netfilter: nft_ct: add zone id set support") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin (cherry picked from commit 29bde9751cb708f0f7e2b4f21d95addc59711988) --- net/netfilter/nft_ct.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 255640013ab84..ab1214da99ff3 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -230,6 +230,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, enum ip_conntrack_info ctinfo; u16 value = nft_reg_load16(®s->data[priv->sreg]); struct nf_conn *ct; + int oldcnt; ct = nf_ct_get(skb, &ctinfo); if (ct) /* already tracked */ @@ -250,10 +251,11 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, ct = this_cpu_read(nft_ct_pcpu_template); - if (likely(refcount_read(&ct->ct_general.use) == 1)) { - refcount_inc(&ct->ct_general.use); + __refcount_inc(&ct->ct_general.use, &oldcnt); + if (likely(oldcnt == 1)) { nf_ct_zone_add(ct, &zone); } else { + refcount_dec(&ct->ct_general.use); /* previous skb got queued to userspace, allocate temporary * one until percpu template can be reused. */ From 164e97e70b80e4145a25cb4d03f8b508f89a1835 Mon Sep 17 00:00:00 2001 From: Grzegorz Nitka Date: Thu, 23 Jan 2025 09:15:39 +0100 Subject: [PATCH 11/28] ice: fix memory leak in aRFS after reset [ Upstream commit 23d97f18901ef5e4e264e3b1777fe65c760186b5 ] Fix aRFS (accelerated Receive Flow Steering) structures memory leak by adding a checker to verify if aRFS memory is already allocated while configuring VSI. aRFS objects are allocated in two cases: - as part of VSI initialization (at probe), and - as part of reset handling However, VSI reconfiguration executed during reset involves memory allocation one more time, without prior releasing already allocated resources. This led to the memory leak with the following signature: [root@os-delivery ~]# cat /sys/kernel/debug/kmemleak unreferenced object 0xff3c1ca7252e6000 (size 8192): comm "kworker/0:0", pid 8, jiffies 4296833052 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 0): [] __kmalloc_cache_noprof+0x275/0x340 [] ice_init_arfs+0x3a/0xe0 [ice] [] ice_vsi_cfg_def+0x607/0x850 [ice] [] ice_vsi_setup+0x5b/0x130 [ice] [] ice_init+0x1c1/0x460 [ice] [] ice_probe+0x2af/0x520 [ice] [] local_pci_probe+0x43/0xa0 [] work_for_cpu_fn+0x13/0x20 [] process_one_work+0x179/0x390 [] worker_thread+0x239/0x340 [] kthread+0xcc/0x100 [] ret_from_fork+0x2d/0x50 [] ret_from_fork_asm+0x1a/0x30 ... 
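The shape of the guard, reduced to a generic sketch (illustrative names, not the ice driver's structures): an init helper reachable from both probe and the reset/rebuild path has to notice that it already ran, otherwise the first allocation leaks exactly as in the kmemleak report above.

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/slab.h>

struct arfs_sketch {
        struct hlist_head *fltr_list;
        unsigned int nentries;
};

static int arfs_sketch_init(struct arfs_sketch *ctx, unsigned int n)
{
        unsigned int i;

        if (ctx->fltr_list)             /* already set up, e.g. at probe */
                return 0;

        ctx->fltr_list = kcalloc(n, sizeof(*ctx->fltr_list), GFP_KERNEL);
        if (!ctx->fltr_list)
                return -ENOMEM;

        ctx->nentries = n;
        for (i = 0; i < n; i++)
                INIT_HLIST_HEAD(&ctx->fltr_list[i]);
        return 0;
}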
Fixes: 28bf26724fdb ("ice: Implement aRFS") Reviewed-by: Michal Swiatkowski Signed-off-by: Grzegorz Nitka Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin (cherry picked from commit 5d30d256661fc11b6e73fac6c3783a702e1006a3) --- drivers/net/ethernet/intel/ice/ice_arfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c b/drivers/net/ethernet/intel/ice/ice_arfs.c index cca0e753f38ff..d7e0116f67737 100644 --- a/drivers/net/ethernet/intel/ice/ice_arfs.c +++ b/drivers/net/ethernet/intel/ice/ice_arfs.c @@ -510,7 +510,7 @@ void ice_init_arfs(struct ice_vsi *vsi) struct hlist_head *arfs_fltr_list; unsigned int i; - if (!vsi || vsi->type != ICE_VSI_PF) + if (!vsi || vsi->type != ICE_VSI_PF || ice_is_arfs_active(vsi)) return; arfs_fltr_list = kcalloc(ICE_MAX_ARFS_LIST, sizeof(*arfs_fltr_list), From 243e8b70f3b0936945cbba2c0d8b42c15a63738b Mon Sep 17 00:00:00 2001 From: Nicklas Bo Jensen Date: Thu, 27 Feb 2025 13:32:34 +0000 Subject: [PATCH 12/28] netfilter: nf_conncount: garbage collection is not skipped when jiffies wrap around [ Upstream commit df08c94baafb001de6cf44bb7098bb557f36c335 ] nf_conncount is supposed to skip garbage collection if it has already run garbage collection in the same jiffy. Unfortunately, this is broken when jiffies wrap around which this patch fixes. The problem is that last_gc in the nf_conncount_list struct is an u32, but jiffies is an unsigned long which is 8 bytes on my systems. When those two are compared it only works until last_gc wraps around. See bug report: https://bugzilla.netfilter.org/show_bug.cgi?id=1778 for more details. Fixes: d265929930e2 ("netfilter: nf_conncount: reduce unnecessary GC") Signed-off-by: Nicklas Bo Jensen Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin (cherry picked from commit 0a9f0cfd2ae8d0e61168eaf1648568692327bc0d) --- net/netfilter/nf_conncount.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 5885810da412f..71869ad466467 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -132,7 +132,7 @@ static int __nf_conncount_add(struct net *net, struct nf_conn *found_ct; unsigned int collect = 0; - if (time_is_after_eq_jiffies((unsigned long)list->last_gc)) + if ((u32)jiffies == list->last_gc) goto add_new_node; /* check the saved connections */ @@ -234,7 +234,7 @@ bool nf_conncount_gc_list(struct net *net, bool ret = false; /* don't bother if we just did GC */ - if (time_is_after_eq_jiffies((unsigned long)READ_ONCE(list->last_gc))) + if ((u32)jiffies == READ_ONCE(list->last_gc)) return false; /* don't bother if other cpu is already doing GC */ From fe940c7898b66097d3de54f72818682cdf16b3eb Mon Sep 17 00:00:00 2001 From: Jun Yang Date: Wed, 5 Mar 2025 23:44:10 +0800 Subject: [PATCH 13/28] sched: address a potential NULL pointer dereference in the GRED scheduler. [ Upstream commit 115ef44a98220fddfab37a39a19370497cd718b9 ] If kzalloc in gred_init returns a NULL pointer, the code follows the error handling path, invoking gred_destroy. This, in turn, calls gred_offload, where memset could receive a NULL pointer as input, potentially leading to a kernel crash. When table->opt is NULL in gred_init(), gred_change_table_def() is not called yet, so it is not necessary to call ->ndo_setup_tc() in gred_offload(). 
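Reduced to a sketch (names are illustrative): because a failed ->init() still goes through the common ->destroy() path, every teardown step has to tolerate state that was never set up.

#include <linux/slab.h>

struct gred_sketch {
        void *opt;              /* allocated only later during init */
};

static void sketch_offload_destroy(struct gred_sketch *t)
{
        /* would build a tc_gred_qopt_offload from t->opt and call
         * ->ndo_setup_tc(); must not run with t->opt == NULL */
}

static void sketch_destroy(struct gred_sketch *t)
{
        if (t->opt)             /* init failed before offload state existed */
                sketch_offload_destroy(t);
        kfree(t->opt);          /* kfree(NULL) is a harmless no-op */
}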
Signed-off-by: Jun Yang Reviewed-by: Cong Wang Fixes: f25c0515c521 ("net: sched: gred: dynamically allocate tc_gred_qopt_offload") Link: https://patch.msgid.link/20250305154410.3505642-1-juny24602@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin (cherry picked from commit 0f0a152957d64ce45b4c27c687e7d087e8f45079) --- net/sched/sch_gred.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 872d127c9db42..fa7a1b69c0f35 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -913,7 +913,8 @@ static void gred_destroy(struct Qdisc *sch) for (i = 0; i < table->DPs; i++) gred_destroy_vq(table->tab[i]); - gred_offload(sch, TC_GRED_DESTROY); + if (table->opt) + gred_offload(sch, TC_GRED_DESTROY); kfree(table->opt); } From 9ad43c90c02198900fe117200e04c4f9ce835c3f Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Thu, 6 Mar 2025 12:37:59 +0200 Subject: [PATCH 14/28] wifi: cfg80211: cancel wiphy_work before freeing wiphy [ Upstream commit 72d520476a2fab6f3489e8388ab524985d6c4b90 ] A wiphy_work can be queued from the moment the wiphy is allocated and initialized (i.e. wiphy_new_nm). When a wiphy_work is queued, the rdev::wiphy_work is getting queued. If wiphy_free is called before the rdev::wiphy_work had a chance to run, the wiphy memory will be freed, and then when it eventally gets to run it'll use invalid memory. Fix this by canceling the work before freeing the wiphy. Fixes: a3ee4dc84c4e ("wifi: cfg80211: add a work abstraction with special semantics") Signed-off-by: Miri Korenblit Reviewed-by: Johannes Berg Link: https://patch.msgid.link/20250306123626.efd1d19f6e07.I48229f96f4067ef73f5b87302335e2fd750136c9@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin (cherry picked from commit 75d262ad3c36d52852d764588fcd887f0fcd9138) --- net/wireless/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/wireless/core.c b/net/wireless/core.c index 63312f9c6e3ef..f5e5f9be875d1 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1149,6 +1149,13 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev) { struct cfg80211_internal_bss *scan, *tmp; struct cfg80211_beacon_registration *reg, *treg; + unsigned long flags; + + spin_lock_irqsave(&rdev->wiphy_work_lock, flags); + WARN_ON(!list_empty(&rdev->wiphy_work_list)); + spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); + cancel_work_sync(&rdev->wiphy_work); + rfkill_destroy(rdev->wiphy.rfkill); list_for_each_entry_safe(reg, treg, &rdev->beacon_registrations, list) { list_del(®->list); From 2b8bf37b8ed0916243adc6eadfd359dd2807def3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 28 Feb 2025 13:12:54 -0500 Subject: [PATCH 15/28] Bluetooth: hci_event: Fix enabling passive scanning [ Upstream commit 0bdd88971519cfa8a76d1a4dde182e74cfbd5d5c ] Passive scanning shall only be enabled when disconnecting LE links, otherwise it may start result in triggering scanning when e.g. 
an ISO link disconnects: > HCI Event: LE Meta Event (0x3e) plen 29 LE Connected Isochronous Stream Established (0x19) Status: Success (0x00) Connection Handle: 257 CIG Synchronization Delay: 0 us (0x000000) CIS Synchronization Delay: 0 us (0x000000) Central to Peripheral Latency: 10000 us (0x002710) Peripheral to Central Latency: 10000 us (0x002710) Central to Peripheral PHY: LE 2M (0x02) Peripheral to Central PHY: LE 2M (0x02) Number of Subevents: 1 Central to Peripheral Burst Number: 1 Peripheral to Central Burst Number: 1 Central to Peripheral Flush Timeout: 2 Peripheral to Central Flush Timeout: 2 Central to Peripheral MTU: 320 Peripheral to Central MTU: 160 ISO Interval: 10.00 msec (0x0008) ... > HCI Event: Disconnect Complete (0x05) plen 4 Status: Success (0x00) Handle: 257 Reason: Remote User Terminated Connection (0x13) < HCI Command: LE Set Extended Scan Enable (0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) Fixes: 9fcb18ef3acb ("Bluetooth: Introduce LE auto connect options") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin (cherry picked from commit 7e8cd2bc09b0ca57879fe30ad1e4d99658a39751) --- net/bluetooth/hci_event.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 3925f2aeea121..716f078cffd08 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3394,23 +3394,30 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, void *data, hci_update_scan(hdev); } - params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); - if (params) { - switch (params->auto_connect) { - case HCI_AUTO_CONN_LINK_LOSS: - if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT) + /* Re-enable passive scanning if disconnected device is marked + * as auto-connectable. + */ + if (conn->type == LE_LINK) { + params = hci_conn_params_lookup(hdev, &conn->dst, + conn->dst_type); + if (params) { + switch (params->auto_connect) { + case HCI_AUTO_CONN_LINK_LOSS: + if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT) + break; + fallthrough; + + case HCI_AUTO_CONN_DIRECT: + case HCI_AUTO_CONN_ALWAYS: + hci_pend_le_list_del_init(params); + hci_pend_le_list_add(params, + &hdev->pend_le_conns); + hci_update_passive_scan(hdev); break; - fallthrough; - case HCI_AUTO_CONN_DIRECT: - case HCI_AUTO_CONN_ALWAYS: - hci_pend_le_list_del_init(params); - hci_pend_le_list_add(params, &hdev->pend_le_conns); - hci_update_passive_scan(hdev); - break; - - default: - break; + default: + break; + } } } From 68f6da0f550df65cac37b01ad89941df8b9292ad Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 4 Mar 2025 10:06:10 -0500 Subject: [PATCH 16/28] Revert "Bluetooth: hci_core: Fix sleeping function called from invalid context" [ Upstream commit ab6ab707a4d060a51c45fc13e3b2228d5f7c0b87 ] This reverts commit 4d94f05558271654670d18c26c912da0c1c15549 which has problems (see [1]) and is no longer needed since 581dd2dc168f ("Bluetooth: hci_event: Fix using rcu_read_(un)lock while iterating") has reworked the code where the original bug has been found. 
[1] Link: https://lore.kernel.org/linux-bluetooth/877c55ci1r.wl-tiwai@suse.de/T/#t Fixes: 4d94f0555827 ("Bluetooth: hci_core: Fix sleeping function called from invalid context") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin (cherry picked from commit 5e8ce74fb04151ef415ecbedb3af17eed1ead1f5) --- include/net/bluetooth/hci_core.h | 108 +++++++++++-------------------- net/bluetooth/hci_core.c | 10 ++- net/bluetooth/iso.c | 6 -- net/bluetooth/l2cap_core.c | 12 ++-- net/bluetooth/rfcomm/core.c | 6 -- net/bluetooth/sco.c | 12 ++-- 6 files changed, 57 insertions(+), 97 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2373ea839b229..1fc8e843c1619 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -801,6 +801,7 @@ struct hci_conn_params { extern struct list_head hci_dev_list; extern struct list_head hci_cb_list; extern rwlock_t hci_dev_list_lock; +extern struct mutex hci_cb_list_lock; #define hci_dev_set_flag(hdev, nr) set_bit((nr), (hdev)->dev_flags) #define hci_dev_clear_flag(hdev, nr) clear_bit((nr), (hdev)->dev_flags) @@ -1949,47 +1950,24 @@ struct hci_cb { char *name; - bool (*match) (struct hci_conn *conn); void (*connect_cfm) (struct hci_conn *conn, __u8 status); void (*disconn_cfm) (struct hci_conn *conn, __u8 status); void (*security_cfm) (struct hci_conn *conn, __u8 status, - __u8 encrypt); + __u8 encrypt); void (*key_change_cfm) (struct hci_conn *conn, __u8 status); void (*role_switch_cfm) (struct hci_conn *conn, __u8 status, __u8 role); }; -static inline void hci_cb_lookup(struct hci_conn *conn, struct list_head *list) -{ - struct hci_cb *cb, *cpy; - - rcu_read_lock(); - list_for_each_entry_rcu(cb, &hci_cb_list, list) { - if (cb->match && cb->match(conn)) { - cpy = kmalloc(sizeof(*cpy), GFP_ATOMIC); - if (!cpy) - break; - - *cpy = *cb; - INIT_LIST_HEAD(&cpy->list); - list_add_rcu(&cpy->list, list); - } - } - rcu_read_unlock(); -} - static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status) { - struct list_head list; - struct hci_cb *cb, *tmp; - - INIT_LIST_HEAD(&list); - hci_cb_lookup(conn, &list); + struct hci_cb *cb; - list_for_each_entry_safe(cb, tmp, &list, list) { + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { if (cb->connect_cfm) cb->connect_cfm(conn, status); - kfree(cb); } + mutex_unlock(&hci_cb_list_lock); if (conn->connect_cfm_cb) conn->connect_cfm_cb(conn, status); @@ -1997,43 +1975,22 @@ static inline void hci_connect_cfm(struct hci_conn *conn, __u8 status) static inline void hci_disconn_cfm(struct hci_conn *conn, __u8 reason) { - struct list_head list; - struct hci_cb *cb, *tmp; - - INIT_LIST_HEAD(&list); - hci_cb_lookup(conn, &list); + struct hci_cb *cb; - list_for_each_entry_safe(cb, tmp, &list, list) { + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { if (cb->disconn_cfm) cb->disconn_cfm(conn, reason); - kfree(cb); } + mutex_unlock(&hci_cb_list_lock); if (conn->disconn_cfm_cb) conn->disconn_cfm_cb(conn, reason); } -static inline void hci_security_cfm(struct hci_conn *conn, __u8 status, - __u8 encrypt) -{ - struct list_head list; - struct hci_cb *cb, *tmp; - - INIT_LIST_HEAD(&list); - hci_cb_lookup(conn, &list); - - list_for_each_entry_safe(cb, tmp, &list, list) { - if (cb->security_cfm) - cb->security_cfm(conn, status, encrypt); - kfree(cb); - } - - if (conn->security_cfm_cb) - conn->security_cfm_cb(conn, status); -} - static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status) { + 
struct hci_cb *cb; __u8 encrypt; if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) @@ -2041,11 +1998,20 @@ static inline void hci_auth_cfm(struct hci_conn *conn, __u8 status) encrypt = test_bit(HCI_CONN_ENCRYPT, &conn->flags) ? 0x01 : 0x00; - hci_security_cfm(conn, status, encrypt); + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { + if (cb->security_cfm) + cb->security_cfm(conn, status, encrypt); + } + mutex_unlock(&hci_cb_list_lock); + + if (conn->security_cfm_cb) + conn->security_cfm_cb(conn, status); } static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status) { + struct hci_cb *cb; __u8 encrypt; if (conn->state == BT_CONFIG) { @@ -2072,38 +2038,40 @@ static inline void hci_encrypt_cfm(struct hci_conn *conn, __u8 status) conn->sec_level = conn->pending_sec_level; } - hci_security_cfm(conn, status, encrypt); + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { + if (cb->security_cfm) + cb->security_cfm(conn, status, encrypt); + } + mutex_unlock(&hci_cb_list_lock); + + if (conn->security_cfm_cb) + conn->security_cfm_cb(conn, status); } static inline void hci_key_change_cfm(struct hci_conn *conn, __u8 status) { - struct list_head list; - struct hci_cb *cb, *tmp; - - INIT_LIST_HEAD(&list); - hci_cb_lookup(conn, &list); + struct hci_cb *cb; - list_for_each_entry_safe(cb, tmp, &list, list) { + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { if (cb->key_change_cfm) cb->key_change_cfm(conn, status); - kfree(cb); } + mutex_unlock(&hci_cb_list_lock); } static inline void hci_role_switch_cfm(struct hci_conn *conn, __u8 status, __u8 role) { - struct list_head list; - struct hci_cb *cb, *tmp; - - INIT_LIST_HEAD(&list); - hci_cb_lookup(conn, &list); + struct hci_cb *cb; - list_for_each_entry_safe(cb, tmp, &list, list) { + mutex_lock(&hci_cb_list_lock); + list_for_each_entry(cb, &hci_cb_list, list) { if (cb->role_switch_cfm) cb->role_switch_cfm(conn, status, role); - kfree(cb); } + mutex_unlock(&hci_cb_list_lock); } static inline bool hci_bdaddr_is_rpa(bdaddr_t *bdaddr, u8 addr_type) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 948bf344cb9a8..076e4e7061c99 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -58,6 +58,7 @@ DEFINE_RWLOCK(hci_dev_list_lock); /* HCI callback list */ LIST_HEAD(hci_cb_list); +DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); @@ -2972,7 +2973,9 @@ int hci_register_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); - list_add_tail_rcu(&cb->list, &hci_cb_list); + mutex_lock(&hci_cb_list_lock); + list_add_tail(&cb->list, &hci_cb_list); + mutex_unlock(&hci_cb_list_lock); return 0; } @@ -2982,8 +2985,9 @@ int hci_unregister_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); - list_del_rcu(&cb->list); - synchronize_rcu(); + mutex_lock(&hci_cb_list_lock); + list_del(&cb->list); + mutex_unlock(&hci_cb_list_lock); return 0; } diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index f165cafa3aa98..b94d202bf3745 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1929,11 +1929,6 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) return lm; } -static bool iso_match(struct hci_conn *hcon) -{ - return hcon->type == ISO_LINK || hcon->type == LE_LINK; -} - static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) { if (hcon->type != ISO_LINK) { @@ -2115,7 +2110,6 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) static 
struct hci_cb iso_cb = { .name = "ISO", - .match = iso_match, .connect_cfm = iso_connect_cfm, .disconn_cfm = iso_disconn_cfm, }; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 304ebb31cebba..76a85d8b17574 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7228,11 +7228,6 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c, return NULL; } -static bool l2cap_match(struct hci_conn *hcon) -{ - return hcon->type == ACL_LINK || hcon->type == LE_LINK; -} - static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) { struct hci_dev *hdev = hcon->hdev; @@ -7240,6 +7235,9 @@ static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) struct l2cap_chan *pchan; u8 dst_type; + if (hcon->type != ACL_LINK && hcon->type != LE_LINK) + return; + BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); if (status) { @@ -7304,6 +7302,9 @@ int l2cap_disconn_ind(struct hci_conn *hcon) static void l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason) { + if (hcon->type != ACL_LINK && hcon->type != LE_LINK) + return; + BT_DBG("hcon %p reason %d", hcon, reason); l2cap_conn_del(hcon, bt_to_errno(reason)); @@ -7582,7 +7583,6 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) static struct hci_cb l2cap_cb = { .name = "L2CAP", - .match = l2cap_match, .connect_cfm = l2cap_connect_cfm, .disconn_cfm = l2cap_disconn_cfm, .security_cfm = l2cap_security_cfm, diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 9d46afb24caf0..1d34d84970332 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -2134,11 +2134,6 @@ static int rfcomm_run(void *unused) return 0; } -static bool rfcomm_match(struct hci_conn *hcon) -{ - return hcon->type == ACL_LINK; -} - static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) { struct rfcomm_session *s; @@ -2185,7 +2180,6 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) static struct hci_cb rfcomm_cb = { .name = "RFCOMM", - .match = rfcomm_match, .security_cfm = rfcomm_security_cfm }; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index c4c36ff25fb20..64d4d57c7033a 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1353,13 +1353,11 @@ int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) return lm; } -static bool sco_match(struct hci_conn *hcon) -{ - return hcon->type == SCO_LINK || hcon->type == ESCO_LINK; -} - static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) { + if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) + return; + BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status); if (!status) { @@ -1374,6 +1372,9 @@ static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) { + if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) + return; + BT_DBG("hcon %p reason %d", hcon, reason); sco_conn_del(hcon, bt_to_errno(reason)); @@ -1399,7 +1400,6 @@ void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) static struct hci_cb sco_cb = { .name = "SCO", - .match = sco_match, .connect_cfm = sco_connect_cfm, .disconn_cfm = sco_disconn_cfm, }; From fab03ab16dec1014d8652b1e5aa12a8503b517a5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 6 Mar 2025 23:25:29 +0200 Subject: [PATCH 17/28] net/mlx5: Fill out devlink dev info only for PFs [ Upstream commit d749d901b2168389f060b654fdaa08acf6b367d2 ] Firmware version query is supported on the 
PFs. Due to this following kernel warning log is observed: [ 188.590344] mlx5_core 0000:08:00.2: mlx5_fw_version_query:816:(pid 1453): fw query isn't supported by the FW Fix it by restricting the query and devlink info to the PF. Fixes: 8338d9378895 ("net/mlx5: Added devlink info callback") Signed-off-by: Jiri Pirko Reviewed-by: Kalesh AP Signed-off-by: Tariq Toukan Reviewed-by: Parav Pandit Link: https://patch.msgid.link/20250306212529.429329-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin (cherry picked from commit 20b667285306386e485fb3734a82e052bfa00114) --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 1bccb5633ab4b..f66788a2ed77e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -46,6 +46,9 @@ mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, u32 running_fw, stored_fw; int err; + if (!mlx5_core_is_pf(dev)) + return 0; + err = devlink_info_version_fixed_put(req, "fw.psid", dev->board_id); if (err) return err; From 82f7ba473f6ac7818503026a665f6e2ab8a7b201 Mon Sep 17 00:00:00 2001 From: Joseph Huang Date: Thu, 6 Mar 2025 12:23:05 -0500 Subject: [PATCH 18/28] net: dsa: mv88e6xxx: Verify after ATU Load ops [ Upstream commit dc5340c3133a3ebe54853fd299116149e528cfaa ] ATU Load operations could fail silently if there's not enough space on the device to hold the new entry. When this happens, the symptom depends on the unknown flood settings. If unknown multicast flood is disabled, the multicast packets are dropped when the ATU table is full. If unknown multicast flood is enabled, the multicast packets will be flooded to all ports. Either way, IGMP snooping is broken when the ATU Load operation fails silently. Do a Read-After-Write verification after each fdb/mdb add operation to make sure that the operation was really successful, and return -ENOSPC otherwise. Fixes: defb05b9b9b4 ("net: dsa: mv88e6xxx: Add support for fdb_add, fdb_del, and fdb_getnext") Signed-off-by: Joseph Huang Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250306172306.3859214-1-Joseph.Huang@garmin.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin (cherry picked from commit 78f83ea6b81a3daef3f37488ce85bb538bcace46) --- drivers/net/dsa/mv88e6xxx/chip.c | 59 ++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 062bcbe6255cf..a39b33353ca6c 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2125,13 +2125,11 @@ mv88e6xxx_port_vlan_prepare(struct dsa_switch *ds, int port, return err; } -static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port, - const unsigned char *addr, u16 vid, - u8 state) +static int mv88e6xxx_port_db_get(struct mv88e6xxx_chip *chip, + const unsigned char *addr, u16 vid, + u16 *fid, struct mv88e6xxx_atu_entry *entry) { - struct mv88e6xxx_atu_entry entry; struct mv88e6xxx_vtu_entry vlan; - u16 fid; int err; /* Ports have two private address databases: one for when the port is @@ -2142,7 +2140,7 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port, * VLAN ID into the port's database used for VLAN-unaware bridging. 
*/ if (vid == 0) { - fid = MV88E6XXX_FID_BRIDGED; + *fid = MV88E6XXX_FID_BRIDGED; } else { err = mv88e6xxx_vtu_get(chip, vid, &vlan); if (err) @@ -2152,14 +2150,39 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port, if (!vlan.valid) return -EOPNOTSUPP; - fid = vlan.fid; + *fid = vlan.fid; } - entry.state = 0; - ether_addr_copy(entry.mac, addr); - eth_addr_dec(entry.mac); + entry->state = 0; + ether_addr_copy(entry->mac, addr); + eth_addr_dec(entry->mac); + + return mv88e6xxx_g1_atu_getnext(chip, *fid, entry); +} + +static bool mv88e6xxx_port_db_find(struct mv88e6xxx_chip *chip, + const unsigned char *addr, u16 vid) +{ + struct mv88e6xxx_atu_entry entry; + u16 fid; + int err; - err = mv88e6xxx_g1_atu_getnext(chip, fid, &entry); + err = mv88e6xxx_port_db_get(chip, addr, vid, &fid, &entry); + if (err) + return false; + + return entry.state && ether_addr_equal(entry.mac, addr); +} + +static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port, + const unsigned char *addr, u16 vid, + u8 state) +{ + struct mv88e6xxx_atu_entry entry; + u16 fid; + int err; + + err = mv88e6xxx_port_db_get(chip, addr, vid, &fid, &entry); if (err) return err; @@ -2757,6 +2780,13 @@ static int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, mv88e6xxx_reg_lock(chip); err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC); + if (err) + goto out; + + if (!mv88e6xxx_port_db_find(chip, addr, vid)) + err = -ENOSPC; + +out: mv88e6xxx_reg_unlock(chip); return err; @@ -6454,6 +6484,13 @@ static int mv88e6xxx_port_mdb_add(struct dsa_switch *ds, int port, mv88e6xxx_reg_lock(chip); err = mv88e6xxx_port_db_load_purge(chip, port, mdb->addr, mdb->vid, MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC); + if (err) + goto out; + + if (!mv88e6xxx_port_db_find(chip, mdb->addr, mdb->vid)) + err = -ENOSPC; + +out: mv88e6xxx_reg_unlock(chip); return err; From 572e41611ca88beb6aa7d35baa9b188c995cd70d Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Thu, 6 Mar 2025 10:33:20 +0800 Subject: [PATCH 19/28] net: mctp i2c: Copy headers if cloned [ Upstream commit df8ce77ba8b7c012a3edd1ca7368b46831341466 ] Use skb_cow_head() prior to modifying the TX SKB. This is necessary when the SKB has been cloned, to avoid modifying other shared clones. 
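As a hedged aside for readers of this backport, the general pattern the fix follows is sketched below with illustrative names (example_push_header() and hlen are not from the driver): skb_cow_head() unshares or expands the skb head, so the subsequent skb_push() only writes memory private to this skb.

=====
#include <linux/skbuff.h>
#include <linux/string.h>

/* Sketch only: cow the header before prepending, so clones that share
 * the same data area are not modified behind their owners' backs.
 */
static int example_push_header(struct sk_buff *skb, unsigned int hlen)
{
	int rc;

	/* Reallocate the head if it is shared or lacks hlen of headroom. */
	rc = skb_cow_head(skb, hlen);
	if (rc)
		return rc;	/* typically -ENOMEM */

	/* Only now is the area in front of the data safe to write. */
	memset(skb_push(skb, hlen), 0, hlen);
	skb_reset_mac_header(skb);

	return 0;
}
=====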
Signed-off-by: Matt Johnston Fixes: f5b8abf9fc3d ("mctp i2c: MCTP I2C binding driver") Link: https://patch.msgid.link/20250306-matt-mctp-i2c-cow-v1-1-293827212681@codeconstruct.com.au Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin (cherry picked from commit 6c5bb3f7acb7bf4970019a9e7e0151a13c952771) --- drivers/net/mctp/mctp-i2c.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c index 20b8d7d528baf..fbe8483a07b58 100644 --- a/drivers/net/mctp/mctp-i2c.c +++ b/drivers/net/mctp/mctp-i2c.c @@ -543,6 +543,7 @@ static int mctp_i2c_header_create(struct sk_buff *skb, struct net_device *dev, struct mctp_i2c_hdr *hdr; struct mctp_hdr *mhdr; u8 lldst, llsrc; + int rc; if (len > MCTP_I2C_MAXMTU) return -EMSGSIZE; @@ -553,6 +554,10 @@ static int mctp_i2c_header_create(struct sk_buff *skb, struct net_device *dev, lldst = *((u8 *)daddr); llsrc = *((u8 *)saddr); + rc = skb_cow_head(skb, sizeof(struct mctp_i2c_hdr)); + if (rc) + return rc; + skb_push(skb, sizeof(struct mctp_i2c_hdr)); skb_reset_mac_header(skb); hdr = (void *)skb_mac_header(skb); From 5fff25da6b97e387dbc21d898d15a17236336d25 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 6 Mar 2025 05:16:18 -0800 Subject: [PATCH 20/28] netpoll: hold rcu read lock in __netpoll_send_skb() [ Upstream commit 505ead7ab77f289f12d8a68ac83da068e4d4408b ] The function __netpoll_send_skb() is being invoked without holding the RCU read lock. This oversight triggers a warning message when CONFIG_PROVE_RCU_LIST is enabled: net/core/netpoll.c:330 suspicious rcu_dereference_check() usage! netpoll_send_skb netpoll_send_udp write_ext_msg console_flush_all console_unlock vprintk_emit To prevent npinfo from disappearing unexpectedly, ensure that __netpoll_send_skb() is protected with the RCU read lock. Fixes: 2899656b494dcd1 ("netpoll: take rcu_read_lock_bh() in netpoll_send_skb_on_dev()") Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250306-netpoll_rcu_v2-v2-1-bc4f5c51742a@debian.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin (cherry picked from commit 9d1966bdaf76f8887ffc8326e13db7bd09efa354) --- net/core/netpoll.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 1791462f1600a..1a4d2a61b060b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -326,6 +326,7 @@ static int netpoll_owner_active(struct net_device *dev) static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; + netdev_tx_t ret = NET_XMIT_DROP; struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. 
*/ @@ -334,11 +335,12 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) lockdep_assert_irqs_disabled(); dev = np->dev; + rcu_read_lock(); npinfo = rcu_dereference_bh(dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); - return NET_XMIT_DROP; + goto out; } /* don't get messages out of order, and no recursion */ @@ -377,7 +379,10 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } - return NETDEV_TX_OK; + ret = NETDEV_TX_OK; +out: + rcu_read_unlock(); + return ret; } netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) From 1f39848919022bd64d83670cca2cace92cda3f89 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 10 Feb 2025 11:34:41 -0800 Subject: [PATCH 21/28] drm/hyperv: Fix address space leak when Hyper-V DRM device is removed [ Upstream commit aed709355fd05ef747e1af24a1d5d78cd7feb81e ] When a Hyper-V DRM device is probed, the driver allocates MMIO space for the vram, and maps it cacheable. If the device removed, or in the error path for device probing, the MMIO space is released but no unmap is done. Consequently the kernel address space for the mapping is leaked. Fix this by adding iounmap() calls in the device removal path, and in the error path during device probing. Fixes: f1f63cbb705d ("drm/hyperv: Fix an error handling path in hyperv_vmbus_probe()") Fixes: a0ab5abced55 ("drm/hyperv : Removing the restruction of VRAM allocation with PCI bar size") Signed-off-by: Michael Kelley Reviewed-by: Saurabh Sengar Tested-by: Saurabh Sengar Link: https://lore.kernel.org/r/20250210193441.2414-1-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20250210193441.2414-1-mhklinux@outlook.com> Signed-off-by: Sasha Levin (cherry picked from commit ad27b4a51495490b815580d9b935e8eee14d1a9c) --- drivers/gpu/drm/hyperv/hyperv_drm_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/hyperv/hyperv_drm_drv.c b/drivers/gpu/drm/hyperv/hyperv_drm_drv.c index 8026118c6e033..8a7933f5c6ebe 100644 --- a/drivers/gpu/drm/hyperv/hyperv_drm_drv.c +++ b/drivers/gpu/drm/hyperv/hyperv_drm_drv.c @@ -157,6 +157,7 @@ static int hyperv_vmbus_probe(struct hv_device *hdev, return 0; err_free_mmio: + iounmap(hv->vram); vmbus_free_mmio(hv->mem->start, hv->fb_size); err_vmbus_close: vmbus_close(hdev->channel); @@ -175,6 +176,7 @@ static void hyperv_vmbus_remove(struct hv_device *hdev) vmbus_close(hdev->channel); hv_set_drvdata(hdev, NULL); + iounmap(hv->vram); vmbus_free_mmio(hv->mem->start, hv->fb_size); } From e6ab5674997111d377af3236ffdcf579b1e586db Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 9 Mar 2025 20:52:08 -0700 Subject: [PATCH 22/28] Drivers: hv: vmbus: Don't release fb_mmio resource in vmbus_free_mmio() [ Upstream commit 73fe9073c0cc28056cb9de0c8a516dac070f1d1f ] The VMBus driver manages the MMIO space it owns via the hyperv_mmio resource tree. Because the synthetic video framebuffer portion of the MMIO space is initially setup by the Hyper-V host for each guest, the VMBus driver does an early reserve of that portion of MMIO space in the hyperv_mmio resource tree. It saves a pointer to that resource in fb_mmio. When a VMBus driver requests MMIO space and passes "true" for the "fb_overlap_ok" argument, the reserved framebuffer space is used if possible. 
In that case it's not necessary to do another request against the "shadow" hyperv_mmio resource tree because that resource was already requested in the early reserve steps. However, the vmbus_free_mmio() function currently does no special handling for the fb_mmio resource. When a framebuffer device is removed, or the driver is unbound, the current code for vmbus_free_mmio() releases the reserved resource, leaving fb_mmio pointing to memory that has been freed. If the same or another driver is subsequently bound to the device, vmbus_allocate_mmio() checks against fb_mmio, and potentially gets garbage. Furthermore a second unbind operation produces this "nonexistent resource" error because of the unbalanced behavior between vmbus_allocate_mmio() and vmbus_free_mmio(): [ 55.499643] resource: Trying to free nonexistent resource <0x00000000f0000000-0x00000000f07fffff> Fix this by adding logic to vmbus_free_mmio() to recognize when MMIO space in the fb_mmio reserved area would be released, and don't release it. This filtering ensures the fb_mmio resource always exists, and makes vmbus_free_mmio() more parallel with vmbus_allocate_mmio(). Fixes: be000f93e5d7 ("drivers:hv: Track allocations of children of hv_vmbus in private resource tree") Signed-off-by: Michael Kelley Tested-by: Saurabh Sengar Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20250310035208.275764-1-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20250310035208.275764-1-mhklinux@outlook.com> Signed-off-by: Sasha Levin (cherry picked from commit 466ae740f88cda3ba541e09eec1ae75b1a7793c9) --- drivers/hv/vmbus_drv.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 756aebf324735..c54d759b07384 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2242,12 +2242,25 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size) struct resource *iter; mutex_lock(&hyperv_mmio_lock); + + /* + * If all bytes of the MMIO range to be released are within the + * special case fb_mmio shadow region, skip releasing the shadow + * region since no corresponding __request_region() was done + * in vmbus_allocate_mmio(). + */ + if (fb_mmio && start >= fb_mmio->start && + (start + size - 1 <= fb_mmio->end)) + goto skip_shadow_release; + for (iter = hyperv_mmio; iter; iter = iter->sibling) { if ((iter->start >= start + size) || (iter->end <= start)) continue; __release_region(iter, start, size); } + +skip_shadow_release: release_mem_region(start, size); mutex_unlock(&hyperv_mmio_lock); From 500eac4a6e535a0cfc1d28cff5b63b39f859ede4 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Fri, 7 Mar 2025 10:18:20 +0800 Subject: [PATCH 23/28] net/mlx5: handle errors in mlx5_chains_create_table() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit eab0396353be1c778eba1c0b5180176f04dd21ce ] In mlx5_chains_create_table(), the return value of mlx5_get_fdb_sub_ns() and mlx5_get_flow_namespace() must be checked to prevent NULL pointer dereferences. If either function fails, the function should log error message with mlx5_core_warn() and return error pointer. 
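As a hedged, illustrative aside (struct example_table and example_create() are stand-ins, not mlx5 APIs): returning ERR_PTR(-EOPNOTSUPP) instead of dereferencing a NULL namespace keeps the usual kernel error-pointer contract, which callers consume with IS_ERR()/PTR_ERR().

=====
#include <linux/err.h>

struct example_table;
struct example_table *example_create(void);	/* may return ERR_PTR() */

static int example_caller(void)
{
	struct example_table *ft = example_create();

	if (IS_ERR(ft))
		return PTR_ERR(ft);	/* e.g. -EOPNOTSUPP when no namespace */

	/* ... use the table ... */
	return 0;
}
=====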
Fixes: 39ac237ce009 ("net/mlx5: E-Switch, Refactor chains and priorities") Signed-off-by: Wentao Liang Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250307021820.2646-1-vulab@iscas.ac.cn Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin (cherry picked from commit 637105ef0d46fe5beac15aceb431da3ec832bb00) --- drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c index a80ecb672f33d..711d14dea2485 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c @@ -196,6 +196,11 @@ mlx5_chains_create_table(struct mlx5_fs_chains *chains, ns = mlx5_get_flow_namespace(chains->dev, chains->ns); } + if (!ns) { + mlx5_core_warn(chains->dev, "Failed to get flow namespace\n"); + return ERR_PTR(-EOPNOTSUPP); + } + ft_attr.autogroup.num_reserved_entries = 2; ft_attr.autogroup.max_num_groups = chains->group_num; ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); From 2252aa25bc29a6bc4edcb243a30bbfef500b2f74 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 9 Mar 2025 13:42:15 +0000 Subject: [PATCH 24/28] eth: bnxt: do not update checksum in bnxt_xdp_build_skb() [ Upstream commit c03e7d05aa0e2f7e9a9ce5ad8a12471a53f941dc ] The bnxt_rx_pkt() updates ip_summed value at the end if checksum offload is enabled. When the XDP-MB program is attached and it returns XDP_PASS, the bnxt_xdp_build_skb() is called to update skb_shared_info. The main purpose of bnxt_xdp_build_skb() is to update skb_shared_info, but it updates ip_summed value too if checksum offload is enabled. This is actually duplicate work. When the bnxt_rx_pkt() updates ip_summed value, it checks if ip_summed is CHECKSUM_NONE or not. It means that ip_summed should be CHECKSUM_NONE at this moment. But ip_summed may already be updated to CHECKSUM_UNNECESSARY in the XDP-MB-PASS path. So the by skb_checksum_none_assert() WARNS about it. This is duplicate work and updating ip_summed in the bnxt_xdp_build_skb() is not needed. Splat looks like: WARNING: CPU: 3 PID: 5782 at ./include/linux/skbuff.h:5155 bnxt_rx_pkt+0x479b/0x7610 [bnxt_en] Modules linked in: bnxt_re bnxt_en rdma_ucm rdma_cm iw_cm ib_cm ib_uverbs veth xt_nat xt_tcpudp xt_conntrack nft_chain_nat xt_MASQUERADE nf_] CPU: 3 UID: 0 PID: 5782 Comm: socat Tainted: G W 6.14.0-rc4+ #27 Tainted: [W]=WARN Hardware name: ASUS System Product Name/PRIME Z690-P D4, BIOS 0603 11/01/2021 RIP: 0010:bnxt_rx_pkt+0x479b/0x7610 [bnxt_en] Code: 54 24 0c 4c 89 f1 4c 89 ff c1 ea 1f ff d3 0f 1f 00 49 89 c6 48 85 c0 0f 84 4c e5 ff ff 48 89 c7 e8 ca 3d a0 c8 e9 8f f4 ff ff <0f> 0b f RSP: 0018:ffff88881ba09928 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 00000000c7590303 RCX: 0000000000000000 RDX: 1ffff1104e7d1610 RSI: 0000000000000001 RDI: ffff8881c91300b8 RBP: ffff88881ba09b28 R08: ffff888273e8b0d0 R09: ffff888273e8b070 R10: ffff888273e8b010 R11: ffff888278b0f000 R12: ffff888273e8b080 R13: ffff8881c9130e00 R14: ffff8881505d3800 R15: ffff888273e8b000 FS: 00007f5a2e7be080(0000) GS:ffff88881ba00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fff2e708ff8 CR3: 000000013e3b0000 CR4: 00000000007506f0 PKRU: 55555554 Call Trace: ? __warn+0xcd/0x2f0 ? bnxt_rx_pkt+0x479b/0x7610 ? report_bug+0x326/0x3c0 ? handle_bug+0x53/0xa0 ? exc_invalid_op+0x14/0x50 ? asm_exc_invalid_op+0x16/0x20 ? bnxt_rx_pkt+0x479b/0x7610 ? 
bnxt_rx_pkt+0x3e41/0x7610 ? __pfx_bnxt_rx_pkt+0x10/0x10 ? napi_complete_done+0x2cf/0x7d0 __bnxt_poll_work+0x4e8/0x1220 ? __pfx___bnxt_poll_work+0x10/0x10 ? __pfx_mark_lock.part.0+0x10/0x10 bnxt_poll_p5+0x36a/0xfa0 ? __pfx_bnxt_poll_p5+0x10/0x10 __napi_poll.constprop.0+0xa0/0x440 net_rx_action+0x899/0xd00 ... Following ping.py patch adds xdp-mb-pass case. so ping.py is going to be able to reproduce this issue. Fixes: 1dc4c557bfed ("bnxt: adding bnxt_xdp_build_skb to build skb from multibuffer xdp_buff") Signed-off-by: Taehee Yoo Reviewed-by: Somnath Kotur Link: https://patch.msgid.link/20250309134219.91670-5-ap420073@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin (cherry picked from commit ee086c8e775f9690282e3d26471dbcfd5dad5a6a) --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 11 ++--------- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 3 +-- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index f32e8460856b0..b903c98ef6c9d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1958,7 +1958,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (!skb) goto oom_next_rx; } else { - skb = bnxt_xdp_build_skb(bp, skb, agg_bufs, rxr->page_pool, &xdp, rxcmp1); + skb = bnxt_xdp_build_skb(bp, skb, agg_bufs, + rxr->page_pool, &xdp); if (!skb) { /* we should be able to free the old skb here */ bnxt_xdp_buff_frags_free(rxr, &xdp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 2845796f782c2..758f51366ef03 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -462,20 +462,13 @@ int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) struct sk_buff * bnxt_xdp_build_skb(struct bnxt *bp, struct sk_buff *skb, u8 num_frags, - struct page_pool *pool, struct xdp_buff *xdp, - struct rx_cmp_ext *rxcmp1) + struct page_pool *pool, struct xdp_buff *xdp) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); if (!skb) return NULL; - skb_checksum_none_assert(skb); - if (RX_CMP_L4_CS_OK(rxcmp1)) { - if (bp->dev->features & NETIF_F_RXCSUM) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = RX_CMP_ENCAP(rxcmp1); - } - } + xdp_update_skb_shared_info(skb, num_frags, sinfo->xdp_frags_size, BNXT_RX_PAGE_SIZE * sinfo->nr_frags, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h index 5e412c5655ba5..9f5829a0adeb1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -33,6 +33,5 @@ void bnxt_xdp_buff_frags_free(struct bnxt_rx_ring_info *rxr, struct xdp_buff *xdp); struct sk_buff *bnxt_xdp_build_skb(struct bnxt *bp, struct sk_buff *skb, u8 num_frags, struct page_pool *pool, - struct xdp_buff *xdp, - struct rx_cmp_ext *rxcmp1); + struct xdp_buff *xdp); #endif From 747ccd803d58fdb02b762761e3f16aa3bc6523ab Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 5 Mar 2025 14:15:09 +0200 Subject: [PATCH 25/28] net: switchdev: Convert blocking notification chain to a raw one [ Upstream commit 62531a1effa87bdab12d5104015af72e60d926ff ] A blocking notification chain uses a read-write semaphore to protect the integrity of the chain. 
The semaphore is acquired for writing when adding / removing notifiers to / from the chain and acquired for reading when traversing the chain and informing notifiers about an event. In case of the blocking switchdev notification chain, recursive notifications are possible which leads to the semaphore being acquired twice for reading and to lockdep warnings being generated [1]. Specifically, this can happen when the bridge driver processes a SWITCHDEV_BRPORT_UNOFFLOADED event which causes it to emit notifications about deferred events when calling switchdev_deferred_process(). Fix this by converting the notification chain to a raw notification chain in a similar fashion to the netdev notification chain. Protect the chain using the RTNL mutex by acquiring it when modifying the chain. Events are always informed under the RTNL mutex, but add an assertion in call_switchdev_blocking_notifiers() to make sure this is not violated in the future. Maintain the "blocking" prefix as events are always emitted from process context and listeners are allowed to block. [1]: WARNING: possible recursive locking detected 6.14.0-rc4-custom-g079270089484 #1 Not tainted -------------------------------------------- ip/52731 is trying to acquire lock: ffffffff850918d8 ((switchdev_blocking_notif_chain).rwsem){++++}-{4:4}, at: blocking_notifier_call_chain+0x58/0xa0 but task is already holding lock: ffffffff850918d8 ((switchdev_blocking_notif_chain).rwsem){++++}-{4:4}, at: blocking_notifier_call_chain+0x58/0xa0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock((switchdev_blocking_notif_chain).rwsem); lock((switchdev_blocking_notif_chain).rwsem); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by ip/52731: #0: ffffffff84f795b0 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_newlink+0x727/0x1dc0 #1: ffffffff8731f628 (&net->rtnl_mutex){+.+.}-{4:4}, at: rtnl_newlink+0x790/0x1dc0 #2: ffffffff850918d8 ((switchdev_blocking_notif_chain).rwsem){++++}-{4:4}, at: blocking_notifier_call_chain+0x58/0xa0 stack backtrace: ... ? __pfx_down_read+0x10/0x10 ? __pfx_mark_lock+0x10/0x10 ? __pfx_switchdev_port_attr_set_deferred+0x10/0x10 blocking_notifier_call_chain+0x58/0xa0 switchdev_port_attr_notify.constprop.0+0xb3/0x1b0 ? __pfx_switchdev_port_attr_notify.constprop.0+0x10/0x10 ? mark_held_locks+0x94/0xe0 ? switchdev_deferred_process+0x11a/0x340 switchdev_port_attr_set_deferred+0x27/0xd0 switchdev_deferred_process+0x164/0x340 br_switchdev_port_unoffload+0xc8/0x100 [bridge] br_switchdev_blocking_event+0x29f/0x580 [bridge] notifier_call_chain+0xa2/0x440 blocking_notifier_call_chain+0x6e/0xa0 switchdev_bridge_port_unoffload+0xde/0x1a0 ... 
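A minimal sketch of the locking discipline described above, with illustrative names (example_chain, example_register(), example_notify()) rather than the switchdev code itself: a raw notifier chain has no internal locking, so registration, unregistration and traversal must all run under the same caller-owned lock, here RTNL.

=====
#include <linux/notifier.h>
#include <linux/rtnetlink.h>

static RAW_NOTIFIER_HEAD(example_chain);

static int example_register(struct notifier_block *nb)
{
	int err;

	rtnl_lock();			/* chain modification under RTNL */
	err = raw_notifier_chain_register(&example_chain, nb);
	rtnl_unlock();

	return err;
}

static int example_notify(unsigned long event, void *data)
{
	ASSERT_RTNL();	/* traversal relies on the caller holding RTNL */
	return raw_notifier_call_chain(&example_chain, event, data);
}
=====

Because the chain itself takes no lock during traversal, a listener that triggers a nested notification no longer re-acquires the chain's rwsem, which is what produced the lockdep report above.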
Fixes: f7a70d650b0b6 ("net: bridge: switchdev: Ensure deferred event delivery on unoffload") Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Reviewed-by: Simon Horman Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Link: https://patch.msgid.link/20250305121509.631207-1-amcohen@nvidia.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin (cherry picked from commit 1f7d051814e7a0cb1f0717ed5527c1059992129d) --- net/switchdev/switchdev.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index c9189a970eec3..fb0e65c89525d 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -381,7 +381,7 @@ bool switchdev_port_obj_act_is_deferred(struct net_device *dev, EXPORT_SYMBOL_GPL(switchdev_port_obj_act_is_deferred); static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain); -static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain); +static RAW_NOTIFIER_HEAD(switchdev_blocking_notif_chain); /** * register_switchdev_notifier - Register notifier @@ -427,17 +427,27 @@ EXPORT_SYMBOL_GPL(call_switchdev_notifiers); int register_switchdev_blocking_notifier(struct notifier_block *nb) { - struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; + struct raw_notifier_head *chain = &switchdev_blocking_notif_chain; + int err; + + rtnl_lock(); + err = raw_notifier_chain_register(chain, nb); + rtnl_unlock(); - return blocking_notifier_chain_register(chain, nb); + return err; } EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier); int unregister_switchdev_blocking_notifier(struct notifier_block *nb) { - struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; + struct raw_notifier_head *chain = &switchdev_blocking_notif_chain; + int err; - return blocking_notifier_chain_unregister(chain, nb); + rtnl_lock(); + err = raw_notifier_chain_unregister(chain, nb); + rtnl_unlock(); + + return err; } EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier); @@ -445,10 +455,11 @@ int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info, struct netlink_ext_ack *extack) { + ASSERT_RTNL(); info->dev = dev; info->extack = extack; - return blocking_notifier_call_chain(&switchdev_blocking_notif_chain, - val, info); + return raw_notifier_call_chain(&switchdev_blocking_notif_chain, + val, info); } EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); From 4f4a21632ebcfbdfcc2bd3054cc13122f9cc0a46 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 6 Mar 2025 02:39:22 +0000 Subject: [PATCH 26/28] bonding: fix incorrect MAC address setting to receive NS messages [ Upstream commit 0c5e145a350de3b38cd5ae77a401b12c46fb7c1d ] When validation on the backup slave is enabled, we need to validate the Neighbor Solicitation (NS) messages received on the backup slave. To receive these messages, the correct destination MAC address must be added to the slave. However, the target in bonding is a unicast address, which we cannot use directly. Instead, we should first convert it to a Solicited-Node Multicast Address and then derive the corresponding MAC address. Fix the incorrect MAC address setting on both slave_set_ns_maddr() and slave_set_ns_maddrs(). Since the two function names are similar. Add some description for the functions. Also only use one mac_addr variable in slave_set_ns_maddr() to save some code and logic. 
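As a hedged, standalone illustration of the address mapping involved (plain user-space C, not the kernel helpers): the unicast NS target is first mapped to its solicited-node multicast address ff02::1:ffXX:XXXX (RFC 4291), and the Ethernet destination is 33:33 followed by the low 32 bits of that multicast address (RFC 2464), which is what the addrconf_addr_solict_mult() plus ndisc_mc_map() combination in the fix computes.

=====
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct in6_addr target, snm;
	unsigned char mac[6];
	char buf[INET6_ADDRSTRLEN];

	/* Example NS target (a unicast address). */
	inet_pton(AF_INET6, "2001:db8::1234:5678", &target);

	/* Solicited-node multicast: ff02::1:ff + low 24 bits of the target. */
	memset(&snm, 0, sizeof(snm));
	snm.s6_addr[0] = 0xff;
	snm.s6_addr[1] = 0x02;
	snm.s6_addr[11] = 0x01;
	snm.s6_addr[12] = 0xff;
	memcpy(&snm.s6_addr[13], &target.s6_addr[13], 3);

	/* Ethernet multicast MAC: 33:33 + low 32 bits of the IPv6 group. */
	mac[0] = 0x33;
	mac[1] = 0x33;
	memcpy(&mac[2], &snm.s6_addr[12], 4);

	inet_ntop(AF_INET6, &snm, buf, sizeof(buf));
	printf("solicited-node: %s\n", buf);
	printf("dest mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
	       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);

	return 0;
}
=====

For 2001:db8::1234:5678 this prints ff02::1:ff34:5678 and 33:33:ff:34:56:78; deriving the MAC directly from the unicast target, as the old code did, yields a group address no NS sender actually transmits to.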
Fixes: 8eb36164d1a6 ("bonding: add ns target multicast address to slave device") Acked-by: Jay Vosburgh Reviewed-by: Nikolay Aleksandrov Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250306023923.38777-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin (cherry picked from commit 6e4edd9e2deb3425c49da46f7c6a805809d7b227) --- drivers/net/bonding/bond_options.c | 55 +++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 8c326e41b8d63..6d003c0ef6698 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1226,10 +1226,28 @@ static bool slave_can_set_ns_maddr(const struct bonding *bond, struct slave *sla slave->dev->flags & IFF_MULTICAST; } +/** + * slave_set_ns_maddrs - add/del all NS mac addresses for slave + * @bond: bond device + * @slave: slave device + * @add: add or remove all the NS mac addresses + * + * This function tries to add or delete all the NS mac addresses on the slave + * + * Note, the IPv6 NS target address is the unicast address in Neighbor + * Solicitation (NS) message. The dest address of NS message should be + * solicited-node multicast address of the target. The dest mac of NS message + * is converted from the solicited-node multicast address. + * + * This function is called when + * * arp_validate changes + * * enslaving, releasing new slaves + */ static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) { struct in6_addr *targets = bond->params.ns_targets; char slot_maddr[MAX_ADDR_LEN]; + struct in6_addr mcaddr; int i; if (!slave_can_set_ns_maddr(bond, slave)) @@ -1239,7 +1257,8 @@ static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool if (ipv6_addr_any(&targets[i])) break; - if (!ndisc_mc_map(&targets[i], slot_maddr, slave->dev, 0)) { + addrconf_addr_solict_mult(&targets[i], &mcaddr); + if (!ndisc_mc_map(&mcaddr, slot_maddr, slave->dev, 0)) { if (add) dev_mc_add(slave->dev, slot_maddr); else @@ -1262,23 +1281,43 @@ void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) slave_set_ns_maddrs(bond, slave, false); } +/** + * slave_set_ns_maddr - set new NS mac address for slave + * @bond: bond device + * @slave: slave device + * @target: the new IPv6 target + * @slot: the old IPv6 target in the slot + * + * This function tries to replace the old mac address to new one on the slave. + * + * Note, the target/slot IPv6 address is the unicast address in Neighbor + * Solicitation (NS) message. The dest address of NS message should be + * solicited-node multicast address of the target. The dest mac of NS message + * is converted from the solicited-node multicast address. + * + * This function is called when + * * An IPv6 NS target is added or removed. 
+ */ static void slave_set_ns_maddr(struct bonding *bond, struct slave *slave, struct in6_addr *target, struct in6_addr *slot) { - char target_maddr[MAX_ADDR_LEN], slot_maddr[MAX_ADDR_LEN]; + char mac_addr[MAX_ADDR_LEN]; + struct in6_addr mcast_addr; if (!bond->params.arp_validate || !slave_can_set_ns_maddr(bond, slave)) return; - /* remove the previous maddr from slave */ + /* remove the previous mac addr from slave */ + addrconf_addr_solict_mult(slot, &mcast_addr); if (!ipv6_addr_any(slot) && - !ndisc_mc_map(slot, slot_maddr, slave->dev, 0)) - dev_mc_del(slave->dev, slot_maddr); + !ndisc_mc_map(&mcast_addr, mac_addr, slave->dev, 0)) + dev_mc_del(slave->dev, mac_addr); - /* add new maddr on slave if target is set */ + /* add new mac addr on slave if target is set */ + addrconf_addr_solict_mult(target, &mcast_addr); if (!ipv6_addr_any(target) && - !ndisc_mc_map(target, target_maddr, slave->dev, 0)) - dev_mc_add(slave->dev, target_maddr); + !ndisc_mc_map(&mcast_addr, mac_addr, slave->dev, 0)) + dev_mc_add(slave->dev, mac_addr); } static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot, From 3bd164f1b49ac049238249890f71408cea6d7283 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Sun, 9 Mar 2025 17:07:38 +0900 Subject: [PATCH 27/28] netfilter: nf_conncount: Fully initialize struct nf_conncount_tuple in insert_tree() [ Upstream commit d653bfeb07ebb3499c403404c21ac58a16531607 ] Since commit b36e4523d4d5 ("netfilter: nf_conncount: fix garbage collection confirm race"), `cpu` and `jiffies32` were introduced to the struct nf_conncount_tuple. The commit made nf_conncount_add() initialize `conn->cpu` and `conn->jiffies32` when allocating the struct. In contrast, count_tree() was not changed to initialize them. By commit 34848d5c896e ("netfilter: nf_conncount: Split insert and traversal"), count_tree() was split and the relevant allocation code now resides in insert_tree(). Initialize `conn->cpu` and `conn->jiffies32` in insert_tree(). 
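As a hedged sketch (not the upstream fix itself), one way to keep the two allocation sites from drifting again is to funnel them through a single helper that sets every garbage-collection bookkeeping field; the field names mirror the commit, the helper is illustrative, and it assumes the definitions local to net/netfilter/nf_conncount.c. The KMSAN report that exposed the missing initialization follows.

=====
/* Sketch: allocate a tracked connection and initialize the GC fields
 * in one place, so nf_conncount_add() and insert_tree() cannot diverge.
 * The caller still fills conn->tuple and conn->zone before insertion.
 */
static struct nf_conncount_tuple *conn_alloc(struct kmem_cache *cachep,
					     gfp_t gfp)
{
	struct nf_conncount_tuple *conn = kmem_cache_alloc(cachep, gfp);

	if (!conn)
		return NULL;

	conn->cpu = raw_smp_processor_id();
	conn->jiffies32 = (u32)jiffies;

	return conn;
}
=====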
BUG: KMSAN: uninit-value in find_or_evict net/netfilter/nf_conncount.c:117 [inline] BUG: KMSAN: uninit-value in __nf_conncount_add+0xd9c/0x2850 net/netfilter/nf_conncount.c:143 find_or_evict net/netfilter/nf_conncount.c:117 [inline] __nf_conncount_add+0xd9c/0x2850 net/netfilter/nf_conncount.c:143 count_tree net/netfilter/nf_conncount.c:438 [inline] nf_conncount_count+0x82f/0x1e80 net/netfilter/nf_conncount.c:521 connlimit_mt+0x7f6/0xbd0 net/netfilter/xt_connlimit.c:72 __nft_match_eval net/netfilter/nft_compat.c:403 [inline] nft_match_eval+0x1a5/0x300 net/netfilter/nft_compat.c:433 expr_call_ops_eval net/netfilter/nf_tables_core.c:240 [inline] nft_do_chain+0x426/0x2290 net/netfilter/nf_tables_core.c:288 nft_do_chain_ipv4+0x1a5/0x230 net/netfilter/nft_chain_filter.c:23 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xf4/0x400 net/netfilter/core.c:626 nf_hook_slow_list+0x24d/0x860 net/netfilter/core.c:663 NF_HOOK_LIST include/linux/netfilter.h:350 [inline] ip_sublist_rcv+0x17b7/0x17f0 net/ipv4/ip_input.c:633 ip_list_rcv+0x9ef/0xa40 net/ipv4/ip_input.c:669 __netif_receive_skb_list_ptype net/core/dev.c:5936 [inline] __netif_receive_skb_list_core+0x15c5/0x1670 net/core/dev.c:5983 __netif_receive_skb_list net/core/dev.c:6035 [inline] netif_receive_skb_list_internal+0x1085/0x1700 net/core/dev.c:6126 netif_receive_skb_list+0x5a/0x460 net/core/dev.c:6178 xdp_recv_frames net/bpf/test_run.c:280 [inline] xdp_test_run_batch net/bpf/test_run.c:361 [inline] bpf_test_run_xdp_live+0x2e86/0x3480 net/bpf/test_run.c:390 bpf_prog_test_run_xdp+0xf1d/0x1ae0 net/bpf/test_run.c:1316 bpf_prog_test_run+0x5e5/0xa30 kernel/bpf/syscall.c:4407 __sys_bpf+0x6aa/0xd90 kernel/bpf/syscall.c:5813 __do_sys_bpf kernel/bpf/syscall.c:5902 [inline] __se_sys_bpf kernel/bpf/syscall.c:5900 [inline] __ia32_sys_bpf+0xa0/0xe0 kernel/bpf/syscall.c:5900 ia32_sys_call+0x394d/0x4180 arch/x86/include/generated/asm/syscalls_32.h:358 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0xb0/0x110 arch/x86/entry/common.c:387 do_fast_syscall_32+0x38/0x80 arch/x86/entry/common.c:412 do_SYSENTER_32+0x1f/0x30 arch/x86/entry/common.c:450 entry_SYSENTER_compat_after_hwframe+0x84/0x8e Uninit was created at: slab_post_alloc_hook mm/slub.c:4121 [inline] slab_alloc_node mm/slub.c:4164 [inline] kmem_cache_alloc_noprof+0x915/0xe10 mm/slub.c:4171 insert_tree net/netfilter/nf_conncount.c:372 [inline] count_tree net/netfilter/nf_conncount.c:450 [inline] nf_conncount_count+0x1415/0x1e80 net/netfilter/nf_conncount.c:521 connlimit_mt+0x7f6/0xbd0 net/netfilter/xt_connlimit.c:72 __nft_match_eval net/netfilter/nft_compat.c:403 [inline] nft_match_eval+0x1a5/0x300 net/netfilter/nft_compat.c:433 expr_call_ops_eval net/netfilter/nf_tables_core.c:240 [inline] nft_do_chain+0x426/0x2290 net/netfilter/nf_tables_core.c:288 nft_do_chain_ipv4+0x1a5/0x230 net/netfilter/nft_chain_filter.c:23 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xf4/0x400 net/netfilter/core.c:626 nf_hook_slow_list+0x24d/0x860 net/netfilter/core.c:663 NF_HOOK_LIST include/linux/netfilter.h:350 [inline] ip_sublist_rcv+0x17b7/0x17f0 net/ipv4/ip_input.c:633 ip_list_rcv+0x9ef/0xa40 net/ipv4/ip_input.c:669 __netif_receive_skb_list_ptype net/core/dev.c:5936 [inline] __netif_receive_skb_list_core+0x15c5/0x1670 net/core/dev.c:5983 __netif_receive_skb_list net/core/dev.c:6035 [inline] netif_receive_skb_list_internal+0x1085/0x1700 net/core/dev.c:6126 netif_receive_skb_list+0x5a/0x460 net/core/dev.c:6178 xdp_recv_frames 
net/bpf/test_run.c:280 [inline] xdp_test_run_batch net/bpf/test_run.c:361 [inline] bpf_test_run_xdp_live+0x2e86/0x3480 net/bpf/test_run.c:390 bpf_prog_test_run_xdp+0xf1d/0x1ae0 net/bpf/test_run.c:1316 bpf_prog_test_run+0x5e5/0xa30 kernel/bpf/syscall.c:4407 __sys_bpf+0x6aa/0xd90 kernel/bpf/syscall.c:5813 __do_sys_bpf kernel/bpf/syscall.c:5902 [inline] __se_sys_bpf kernel/bpf/syscall.c:5900 [inline] __ia32_sys_bpf+0xa0/0xe0 kernel/bpf/syscall.c:5900 ia32_sys_call+0x394d/0x4180 arch/x86/include/generated/asm/syscalls_32.h:358 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0xb0/0x110 arch/x86/entry/common.c:387 do_fast_syscall_32+0x38/0x80 arch/x86/entry/common.c:412 do_SYSENTER_32+0x1f/0x30 arch/x86/entry/common.c:450 entry_SYSENTER_compat_after_hwframe+0x84/0x8e Reported-by: syzbot+83fed965338b573115f7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=83fed965338b573115f7 Fixes: b36e4523d4d5 ("netfilter: nf_conncount: fix garbage collection confirm race") Signed-off-by: Kohei Enju Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin (cherry picked from commit fda50302a13701d47fbe01e1739c7a51114144fb) --- net/netfilter/nf_conncount.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 71869ad466467..6156c0751056c 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -377,6 +377,8 @@ insert_tree(struct net *net, conn->tuple = *tuple; conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; memcpy(rbconn->key, key, sizeof(u32) * data->keylen); nf_conncount_list_init(&rbconn->list); From aa7e41b65800726a072dfc7fb71c0fa9fc9e2b12 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 10 Mar 2025 10:45:53 +0300 Subject: [PATCH 28/28] ipvs: prevent integer overflow in do_ip_vs_get_ctl() [ Upstream commit 80b78c39eb86e6b55f56363b709eb817527da5aa ] The get->num_services variable is an unsigned int which is controlled by the user. The struct_size() function ensures that the size calculation does not overflow an unsigned long, however, we are saving the result to an int so the calculation can overflow. Both "len" and "get->num_services" come from the user. This check is just a sanity check to help the user and ensure they are using the API correctly. An integer overflow here is not a big deal. This has no security impact. Save the result from struct_size() type size_t to fix this integer overflow bug. 
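As a hedged, user-space illustration of the truncation (plain C arithmetic stands in for struct_size(); the struct names are stand-ins): storing a size_t-sized result in an int wraps once the product exceeds INT_MAX, so a huge user-supplied count could make the *len sanity check compare against a tiny, truncated value.

=====
#include <stdint.h>
#include <stdio.h>

struct entry { uint32_t data[8]; };	/* stand-in for an ipvs entry */
struct get_like {
	uint32_t num;			/* user-controlled count */
	struct entry entrytable[];
};

int main(void)
{
	uint32_t num = 0x08000000;	/* large user-supplied value */
	size_t size = sizeof(struct get_like) +
		      (size_t)num * sizeof(struct entry);
	int truncated = (int)size;	/* the old bug: int storage */

	printf("size_t result: %zu bytes\n", size);
	/* On common LP64 ABIs the conversion wraps modulo 2^32. */
	printf("as int:        %d\n", truncated);

	return 0;
}
=====

Keeping the result in a size_t (and printing it with %zu) preserves the exact value, so the *len mismatch is reported instead of wrapping silently.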
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Dan Carpenter Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin (cherry picked from commit 1bd2a8bb1ccb85378a0d2dde0318d7b237c8cbb7) --- net/netfilter/ipvs/ip_vs_ctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index dec5309d9f1f5..ae76542de3e98 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3091,12 +3091,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_SERVICES: { struct ip_vs_get_services *get; - int size; + size_t size; get = (struct ip_vs_get_services *)arg; size = struct_size(get, entrytable, get->num_services); if (*len != size) { - pr_err("length: %u != %u\n", *len, size); + pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } @@ -3132,12 +3132,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_DESTS: { struct ip_vs_get_dests *get; - int size; + size_t size; get = (struct ip_vs_get_dests *)arg; size = struct_size(get, entrytable, get->num_dests); if (*len != size) { - pr_err("length: %u != %u\n", *len, size); + pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; }