Skip to content

Commit 40afb39

Browse files
committed
Fix snapshot automount expiry cancellation deadlock
A deadlock occurs when snapshot expiry tasks are cancelled while holding locks. The snapshot expiry task (snapentry_expire) spawns an umount process and waits for it to complete. Concurrently, ARC memory pressure triggers arc_prune which calls zfs_exit_fs(), attempting to cancel the expiry task while holding locks. The umount process spawned by the expiry task blocks trying to acquire locks held by arc_prune, which is blocked waiting for the expiry task to complete. This creates a circular dependency: expiry task waits for umount, umount waits for arc_prune, arc_prune waits for expiry task. Fix by adding non-blocking cancellation support to taskq_cancel_id(). The zfs_exit_fs() path calls zfsctl_snapshot_unmount_delay() to reschedule the unmount, which needs to cancel any existing expiry task. It now uses non-blocking cancellation to avoid waiting while holding locks, breaking the deadlock by returning immediately when the task is already running. The per-entry se_taskqid_lock has been removed, with all taskqid operations now protected by the global zfs_snapshot_lock held as WRITER. Additionally, an se_in_umount flag prevents recursive waits when zfsctl_destroy() is called during unmount. The taskqid is now only cleared by the caller on successful cancellation; running tasks clear their own taskqid upon completion. Signed-off-by: Ameer Hamza <[email protected]>
1 parent 74b50a7 commit 40afb39

File tree

14 files changed

+74
-51
lines changed

14 files changed

+74
-51
lines changed

include/os/freebsd/spl/sys/taskq.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ extern void taskq_destroy(taskq_t *);
107107
extern void taskq_wait_id(taskq_t *, taskqid_t);
108108
extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
109109
extern void taskq_wait(taskq_t *);
110-
extern int taskq_cancel_id(taskq_t *, taskqid_t);
110+
extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
111111
extern int taskq_member(taskq_t *, kthread_t *);
112112
extern taskq_t *taskq_of_curthread(void);
113113
void taskq_suspend(taskq_t *);

include/os/linux/spl/sys/taskq.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ extern void taskq_destroy(taskq_t *);
198198
extern void taskq_wait_id(taskq_t *, taskqid_t);
199199
extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
200200
extern void taskq_wait(taskq_t *);
201-
extern int taskq_cancel_id(taskq_t *, taskqid_t);
201+
extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
202202
extern int taskq_member(taskq_t *, kthread_t *);
203203
extern taskq_t *taskq_of_curthread(void);
204204

lib/libspl/include/sys/taskq.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ extern void taskq_wait_id(taskq_t *, taskqid_t);
112112
extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
113113
extern int taskq_member(taskq_t *, kthread_t *);
114114
extern taskq_t *taskq_of_curthread(void);
115-
extern int taskq_cancel_id(taskq_t *, taskqid_t);
115+
extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
116116
extern void system_taskq_init(void);
117117
extern void system_taskq_fini(void);
118118

lib/libspl/taskq.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -398,9 +398,9 @@ taskq_of_curthread(void)
398398
}
399399

400400
int
401-
taskq_cancel_id(taskq_t *tq, taskqid_t id)
401+
taskq_cancel_id(taskq_t *tq, taskqid_t id, boolean_t wait)
402402
{
403-
(void) tq, (void) id;
403+
(void) tq, (void) id, (void) wait;
404404
return (ENOENT);
405405
}
406406

lib/libzfs/libzfs.abi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2279,6 +2279,7 @@
22792279
<function-decl name='taskq_cancel_id' mangled-name='taskq_cancel_id' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='taskq_cancel_id'>
22802280
<parameter type-id='4f8ed29a' name='tq'/>
22812281
<parameter type-id='de0ea20e' name='id'/>
2282+
<parameter type-id='c19b74c3' name='wait'/>
22822283
<return type-id='95e97e5e'/>
22832284
</function-decl>
22842285
<function-decl name='system_taskq_init' mangled-name='system_taskq_init' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='system_taskq_init'>

lib/libzfs_core/libzfs_core.abi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2123,6 +2123,7 @@
21232123
<function-decl name='taskq_cancel_id' mangled-name='taskq_cancel_id' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='taskq_cancel_id'>
21242124
<parameter type-id='4f8ed29a' name='tq'/>
21252125
<parameter type-id='de0ea20e' name='id'/>
2126+
<parameter type-id='c19b74c3' name='wait'/>
21262127
<return type-id='95e97e5e'/>
21272128
</function-decl>
21282129
<function-decl name='system_taskq_init' mangled-name='system_taskq_init' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='system_taskq_init'>

module/os/freebsd/spl/spl_taskq.c

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ taskq_free(taskq_ent_t *task)
351351
}
352352

353353
int
354-
taskq_cancel_id(taskq_t *tq, taskqid_t tid)
354+
taskq_cancel_id(taskq_t *tq, taskqid_t tid, boolean_t wait)
355355
{
356356
uint32_t pend;
357357
int rc;
@@ -362,12 +362,12 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
362362

363363
if (ent->tqent_type == NORMAL_TASK) {
364364
rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
365-
if (rc == EBUSY)
365+
if (rc == EBUSY && wait)
366366
taskqueue_drain(tq->tq_queue, &ent->tqent_task);
367367
} else {
368368
rc = taskqueue_cancel_timeout(tq->tq_queue,
369369
&ent->tqent_timeout_task, &pend);
370-
if (rc == EBUSY) {
370+
if (rc == EBUSY && wait) {
371371
taskqueue_drain_timeout(tq->tq_queue,
372372
&ent->tqent_timeout_task);
373373
}
@@ -381,6 +381,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
381381
}
382382
/* Free the extra reference we added with taskq_lookup. */
383383
taskq_free(ent);
384+
385+
/*
386+
* If task was running and we didn't wait, return EBUSY.
387+
* Otherwise return 0 if cancelled or ENOENT if not found.
388+
*/
389+
if (rc == EBUSY && !wait)
390+
return (EBUSY);
384391
return (pend ? 0 : ENOENT);
385392
}
386393

module/os/linux/spl/spl-kmem-cache.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -840,7 +840,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
840840
id = skc->skc_taskqid;
841841
spin_unlock(&skc->skc_lock);
842842

843-
taskq_cancel_id(spl_kmem_cache_taskq, id);
843+
taskq_cancel_id(spl_kmem_cache_taskq, id, B_TRUE);
844844

845845
/*
846846
* Wait until all current callers complete, this is mainly

module/os/linux/spl/spl-taskq.c

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -598,13 +598,22 @@ taskq_of_curthread(void)
598598
EXPORT_SYMBOL(taskq_of_curthread);
599599

600600
/*
601-
* Cancel an already dispatched task given the task id. Still pending tasks
602-
* will be immediately canceled, and if the task is active the function will
603-
* block until it completes. Preallocated tasks which are canceled must be
604-
* freed by the caller.
601+
* Cancel a dispatched task. Pending tasks are cancelled immediately.
602+
* If the task is running, behavior depends on wait parameter:
603+
* - wait=B_TRUE: Block until task completes
604+
* - wait=B_FALSE: Return EBUSY immediately
605+
*
606+
* Return values:
607+
* 0 - Cancelled before execution. Caller must release resources.
608+
* EBUSY - Task running (wait=B_FALSE only). Will self-cleanup.
609+
* ENOENT - Not found, or completed after waiting. Already cleaned up.
610+
*
611+
* Note: wait=B_TRUE returns ENOENT (not EBUSY) after waiting because
612+
* the task no longer exists. This distinguishes "cancelled before run"
613+
* from "completed naturally" for proper resource management.
605614
*/
606615
int
607-
taskq_cancel_id(taskq_t *tq, taskqid_t id)
616+
taskq_cancel_id(taskq_t *tq, taskqid_t id, boolean_t wait)
608617
{
609618
taskq_ent_t *t;
610619
int rc = ENOENT;
@@ -650,8 +659,12 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id)
650659
spin_unlock_irqrestore(&tq->tq_lock, flags);
651660

652661
if (t == ERR_PTR(-EBUSY)) {
653-
taskq_wait_id(tq, id);
654-
rc = EBUSY;
662+
if (wait) {
663+
taskq_wait_id(tq, id);
664+
rc = ENOENT; /* Completed, no longer exists */
665+
} else {
666+
rc = EBUSY; /* Still running */
667+
}
655668
}
656669

657670
return (rc);

module/os/linux/zfs/zfs_ctldir.c

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@ typedef struct {
120120
spa_t *se_spa; /* pool spa */
121121
uint64_t se_objsetid; /* snapshot objset id */
122122
struct dentry *se_root_dentry; /* snapshot root dentry */
123-
krwlock_t se_taskqid_lock; /* scheduled unmount taskqid lock */
124123
taskqid_t se_taskqid; /* scheduled unmount taskqid */
125124
avl_node_t se_node_name; /* zfs_snapshots_by_name link */
126125
avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
@@ -147,7 +146,6 @@ zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
147146
se->se_objsetid = objsetid;
148147
se->se_root_dentry = root_dentry;
149148
se->se_taskqid = TASKQID_INVALID;
150-
rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL);
151149

152150
zfs_refcount_create(&se->se_refcount);
153151

@@ -164,7 +162,6 @@ zfsctl_snapshot_free(zfs_snapentry_t *se)
164162
zfs_refcount_destroy(&se->se_refcount);
165163
kmem_strfree(se->se_name);
166164
kmem_strfree(se->se_path);
167-
rw_destroy(&se->se_taskqid_lock);
168165

169166
kmem_free(se, sizeof (zfs_snapentry_t));
170167
}
@@ -340,17 +337,15 @@ snapentry_expire(void *data)
340337
return;
341338
}
342339

343-
rw_enter(&se->se_taskqid_lock, RW_WRITER);
344-
se->se_taskqid = TASKQID_INVALID;
345-
rw_exit(&se->se_taskqid_lock);
346340
(void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
347-
zfsctl_snapshot_rele(se);
348341

349342
/*
350-
* Reschedule the unmount if the zfs_snapentry_t wasn't removed.
343+
* Clear taskqid and reschedule if the snapshot wasn't removed.
351344
* This can occur when the snapshot is busy.
352345
*/
353-
rw_enter(&zfs_snapshot_lock, RW_READER);
346+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
347+
se->se_taskqid = TASKQID_INVALID;
348+
zfsctl_snapshot_rele(se);
354349
if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
355350
zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
356351
zfsctl_snapshot_rele(se);
@@ -364,20 +359,20 @@ snapentry_expire(void *data)
364359
* during dispatch.
365360
*/
366361
static void
367-
zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
362+
zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se, boolean_t wait)
368363
{
369364
int err = 0;
370-
rw_enter(&se->se_taskqid_lock, RW_WRITER);
371-
err = taskq_cancel_id(system_delay_taskq, se->se_taskqid);
365+
366+
ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
367+
368+
err = taskq_cancel_id(system_delay_taskq, se->se_taskqid, wait);
372369
/*
373-
* if we get ENOENT, the taskq couldn't be found to be
374-
* canceled, so we can just mark it as invalid because
375-
* it's already gone. If we got EBUSY, then we already
376-
* blocked until it was gone _anyway_, so we don't care.
370+
* Clear taskqid only if we successfully cancelled before execution.
371+
* For ENOENT, task already cleared it. For EBUSY, task will clear
372+
* it when done.
377373
*/
378-
se->se_taskqid = TASKQID_INVALID;
379-
rw_exit(&se->se_taskqid_lock);
380374
if (err == 0) {
375+
se->se_taskqid = TASKQID_INVALID;
381376
zfsctl_snapshot_rele(se);
382377
}
383378
}
@@ -388,12 +383,11 @@ zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
388383
static void
389384
zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
390385
{
386+
ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
391387

392388
if (delay <= 0)
393389
return;
394390

395-
zfsctl_snapshot_hold(se);
396-
rw_enter(&se->se_taskqid_lock, RW_WRITER);
397391
/*
398392
* If this condition happens, we managed to:
399393
* - dispatch once
@@ -404,13 +398,12 @@ zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
404398
* no problem.
405399
*/
406400
if (se->se_taskqid != TASKQID_INVALID) {
407-
rw_exit(&se->se_taskqid_lock);
408-
zfsctl_snapshot_rele(se);
409401
return;
410402
}
403+
404+
zfsctl_snapshot_hold(se);
411405
se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
412406
snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
413-
rw_exit(&se->se_taskqid_lock);
414407
}
415408

416409
/*
@@ -425,9 +418,9 @@ zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
425418
zfs_snapentry_t *se;
426419
int error = ENOENT;
427420

428-
rw_enter(&zfs_snapshot_lock, RW_READER);
421+
rw_enter(&zfs_snapshot_lock, RW_WRITER);
429422
if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
430-
zfsctl_snapshot_unmount_cancel(se);
423+
zfsctl_snapshot_unmount_cancel(se, B_FALSE);
431424
zfsctl_snapshot_unmount_delay_impl(se, delay);
432425
zfsctl_snapshot_rele(se);
433426
error = 0;
@@ -614,13 +607,18 @@ zfsctl_destroy(zfsvfs_t *zfsvfs)
614607

615608
rw_enter(&zfs_snapshot_lock, RW_WRITER);
616609
se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
617-
if (se != NULL)
618-
zfsctl_snapshot_remove(se);
619-
rw_exit(&zfs_snapshot_lock);
620610
if (se != NULL) {
621-
zfsctl_snapshot_unmount_cancel(se);
611+
zfsctl_snapshot_remove(se);
612+
/*
613+
* Don't wait if snapentry_expire task is calling
614+
* umount, which may have resulted in this destroy
615+
* call. Waiting would deadlock: snapentry_expire
616+
* waits for umount while umount waits for task.
617+
*/
618+
zfsctl_snapshot_unmount_cancel(se, B_FALSE);
622619
zfsctl_snapshot_rele(se);
623620
}
621+
rw_exit(&zfs_snapshot_lock);
624622
} else if (zfsvfs->z_ctldir) {
625623
iput(zfsvfs->z_ctldir);
626624
zfsvfs->z_ctldir = NULL;

0 commit comments

Comments
 (0)