Merged

19 commits
7a4bcf1
radix tree test suite: fix allocation calculation in kmem_cache_alloc…
howlett Sep 29, 2023
4d045f4
maple_tree: add mt_free_one() and mt_attr() helpers
Oct 27, 2023
4c2ec2e
maple_tree: introduce {mtree,mas}_lock_nested()
Oct 27, 2023
254698e
maple_tree: introduce interfaces __mt_dup() and mtree_dup()
Oct 27, 2023
56808c3
radix tree test suite: align kmem_cache_alloc_bulk() with kernel beha…
Oct 27, 2023
c6faa0c
maple_tree: add test for mtree_dup()
Oct 27, 2023
51ff451
maple_tree: update the documentation of maple tree
Oct 27, 2023
e92996e
maple_tree: skip other tests when BENCH is enabled
Oct 27, 2023
a53da0c
maple_tree: update check_forking() and bench_forking()
Oct 27, 2023
1071f31
maple_tree: preserve the tree attributes when destroying maple tree
Oct 27, 2023
cbcc9b0
fork: use __mt_dup() to duplicate maple tree in dup_mmap()
Oct 27, 2023
d67897c
fork: do not invoke uffd on fork if error occurs
lorenzo-stoakes Oct 15, 2024
c0f1869
mm/ksm: support fork/exec for prctl
Sep 22, 2023
4e3bd43
mm/ksm: test case for prctl fork/exec workflow
Sep 22, 2023
30944a7
mm/ksm: fix ksm exec support for prctl
tujinjiang11 Mar 28, 2024
dca7653
mm/ksm: remove redundant code in ksm_fork
tujinjiang11 Apr 2, 2024
d3d44d8
fork: only invoke khugepaged, ksm hooks if no error
lorenzo-stoakes Oct 15, 2024
9d6a9f8
fork: avoid inappropriate uprobe access to invalid mm
lorenzo-stoakes Dec 10, 2024
08dcc40
kernel: be more careful about dup_mmap() failures and uprobe registering
howlett Jan 27, 2025
4 changes: 4 additions & 0 deletions Documentation/core-api/maple_tree.rst
@@ -81,6 +81,9 @@ section.
Sometimes it is necessary to ensure the next call to store to a maple tree does
not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case.

You can use mtree_dup() to duplicate an entire maple tree. It is a more
efficient way than inserting all elements one by one into a new tree.

Finally, you can remove all entries from a maple tree by calling
mtree_destroy(). If the maple tree entries are pointers, you may wish to free
the entries first.
@@ -112,6 +115,7 @@ Takes ma_lock internally:
* mtree_insert()
* mtree_insert_range()
* mtree_erase()
* mtree_dup()
* mtree_destroy()
* mt_set_in_rcu()
* mt_clear_in_rcu()
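As an aside, here is a minimal sketch of the duplication interface described in the documentation hunk above. It is not part of this patch; the entries, ranges and GFP flags are arbitrary, and error handling is reduced to early returns.

#include <linux/gfp.h>
#include <linux/maple_tree.h>
#include <linux/xarray.h>

static DEFINE_MTREE(src_tree);
static DEFINE_MTREE(dst_tree);

static int mtree_dup_example(void)
{
	int ret;

	/* Populate the source tree with two value entries. */
	ret = mtree_store_range(&src_tree, 0, 9, xa_mk_value(1), GFP_KERNEL);
	if (ret)
		return ret;
	ret = mtree_store_range(&src_tree, 10, 19, xa_mk_value(2), GFP_KERNEL);
	if (ret)
		return ret;

	/*
	 * Duplicate the whole tree in one call instead of re-inserting each
	 * entry; mtree_dup() takes the ma_lock of both trees internally.
	 */
	ret = mtree_dup(&src_tree, &dst_tree, GFP_KERNEL);
	if (ret)
		return ret;

	mtree_destroy(&src_tree);
	mtree_destroy(&dst_tree);
	return 0;
}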
12 changes: 12 additions & 0 deletions fs/exec.c
@@ -66,6 +66,8 @@
#include <linux/coredump.h>
#include <linux/time_namespace.h>
#include <linux/user_events.h>
#include <linux/rseq.h>
#include <linux/ksm.h>

#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -264,6 +266,14 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
goto err_free;
}

/*
* Need to be called with mmap write lock
* held, to avoid race with ksmd.
*/
err = ksm_execve(mm);
if (err)
goto err_ksm;

/*
* Place the stack at the largest stack address the architecture
* supports. Later, we'll move this to an appropriate place. We don't
@@ -285,6 +295,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
bprm->p = vma->vm_end - sizeof(void *);
return 0;
err:
ksm_exit(mm);
err_ksm:
mmap_write_unlock(mm);
err_free:
bprm->vma = NULL;
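The ksm_execve() call added above, together with the KSM prctl commits in this series, is what lets a process-wide merge setting survive both fork and execve. A rough userspace sketch of that workflow follows; it assumes a kernel with PR_SET_MEMORY_MERGE support plus this series, and the fallback prctl constants below are only for older headers.

#include <stdio.h>
#include <sys/prctl.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE 67
#define PR_GET_MEMORY_MERGE 68
#endif

int main(int argc, char **argv)
{
	if (argc > 1) {
		/* Re-executed child: with this series the flag is still set. */
		printf("after exec: merge flag = %d\n",
		       (int)prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));
		return 0;
	}

	/* Opt the whole process into KSM merging. */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0)) {
		perror("PR_SET_MEMORY_MERGE");
		return 1;
	}

	pid_t child = fork();
	if (child == 0) {
		/* MMF_VM_MERGE_ANY is inherited, so this should print 1. */
		printf("after fork: merge flag = %d\n",
		       (int)prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));
		/* ksm_execve() re-enters KSM for the fresh mm on exec. */
		execl("/proc/self/exe", argv[0], "after-exec", (char *)NULL);
		_exit(127);
	}
	waitpid(child, NULL, 0);
	return 0;
}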
28 changes: 28 additions & 0 deletions fs/userfaultfd.c
@@ -759,6 +759,34 @@ void dup_userfaultfd_complete(struct list_head *fcs)
}
}

void dup_userfaultfd_fail(struct list_head *fcs)
{
struct userfaultfd_fork_ctx *fctx, *n;

/*
* An error has occurred on fork, we will tear memory down, but have
* allocated memory for fctx's and raised reference counts for both the
* original and child contexts (and on the mm for each as a result).
*
* These would ordinarily be taken care of by a user handling the event,
* but we are no longer doing so, so manually clean up here.
*
* mm tear down will take care of cleaning up VMA contexts.
*/
list_for_each_entry_safe(fctx, n, fcs, list) {
struct userfaultfd_ctx *octx = fctx->orig;
struct userfaultfd_ctx *ctx = fctx->new;

atomic_dec(&octx->mmap_changing);
VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
userfaultfd_ctx_put(octx);
userfaultfd_ctx_put(ctx);

list_del(&fctx->list);
kfree(fctx);
}
}

void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *vm_ctx)
{
25 changes: 14 additions & 11 deletions include/linux/ksm.h
@@ -54,18 +54,17 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm)
return atomic_long_read(&mm->ksm_zero_pages);
}

static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
int ret;

if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) {
ret = __ksm_enter(mm);
if (ret)
return ret;
}
/* Adding mm to ksm is best effort on fork. */
if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
__ksm_enter(mm);
}

if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags))
set_bit(MMF_VM_MERGE_ANY, &mm->flags);
static inline int ksm_execve(struct mm_struct *mm)
{
if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
return __ksm_enter(mm);

return 0;
}
@@ -113,7 +112,11 @@ static inline int ksm_disable(struct mm_struct *mm)
return 0;
}

static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
}

static inline int ksm_execve(struct mm_struct *mm)
{
return 0;
}
7 changes: 7 additions & 0 deletions include/linux/maple_tree.h
@@ -256,6 +258,8 @@ struct maple_tree {
struct maple_tree name = MTREE_INIT(name, 0)

#define mtree_lock(mt) spin_lock((&(mt)->ma_lock))
#define mtree_lock_nested(mas, subclass) \
spin_lock_nested((&(mt)->ma_lock), subclass)
#define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock))

/*
@@ -327,6 +329,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index,
void *entry, gfp_t gfp);
void *mtree_erase(struct maple_tree *mt, unsigned long index);

int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);

void mtree_destroy(struct maple_tree *mt);
void __mt_destroy(struct maple_tree *mt);

@@ -406,6 +411,8 @@ struct ma_wr_state {
};

#define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock))
#define mas_lock_nested(mas, subclass) \
spin_lock_nested(&((mas)->tree->ma_lock), subclass)
#define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock))


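For the __mt_dup()/mtree_dup() pair declared above, the wrapper is the one that handles locking, so a caller of __mt_dup() is left to take the locks itself. The sketch below is an assumption about how that can look for spinlock-protected trees, using the new mas_lock_nested() helper so lockdep does not treat the two ma_locks (same lock class) as a deadlock; it is not kernel code from this series, and cleanup of a partially built destination on failure is omitted.

#include <linux/lockdep.h>
#include <linux/maple_tree.h>

static int dup_tree_caller_locked(struct maple_tree *src,
				  struct maple_tree *dst, gfp_t gfp)
{
	int ret;
	MA_STATE(mas, src, 0, 0);
	MA_STATE(new_mas, dst, 0, 0);

	/* Take the destination lock first, then the source lock nested. */
	mas_lock(&new_mas);
	mas_lock_nested(&mas, SINGLE_DEPTH_NESTING);
	ret = __mt_dup(src, dst, gfp);
	mas_unlock(&mas);
	mas_unlock(&new_mas);
	return ret;
}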
11 changes: 11 additions & 0 deletions include/linux/mm.h
@@ -998,6 +998,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
return mas_expected_entries(&vmi->mas, count);
}

static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
unsigned long start, unsigned long end, gfp_t gfp)
{
__mas_set_range(&vmi->mas, start, end - 1);
mas_store_gfp(&vmi->mas, NULL, gfp);
if (unlikely(mas_is_err(&vmi->mas)))
return -ENOMEM;

return 0;
}

/* Free any unused preallocations */
static inline void vma_iter_free(struct vma_iterator *vmi)
{
13 changes: 8 additions & 5 deletions include/linux/sched/coredump.h
@@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
/*
@@ -85,13 +86,15 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_HAS_MDWE 28
#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)

#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)

#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
#define MMF_HAS_MDWE_NO_INHERIT 29

#define MMF_VM_MERGE_ANY 29
#define MMF_HAS_MDWE_NO_INHERIT 30
#define MMF_VM_MERGE_ANY 30
#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY)

#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
MMF_VM_MERGE_ANY_MASK)

static inline unsigned long mmf_init_flags(unsigned long flags)
{
5 changes: 5 additions & 0 deletions include/linux/userfaultfd_k.h
@@ -181,6 +181,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,

extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *);
void dup_userfaultfd_fail(struct list_head *);

extern void mremap_userfaultfd_prep(struct vm_area_struct *,
struct vm_userfaultfd_ctx *);
@@ -256,6 +257,10 @@ static inline void dup_userfaultfd_complete(struct list_head *l)
{

suggestion (bug_risk): Inline stub for dup_userfaultfd_fail may conflict.

There is a non-inline implementation of dup_userfaultfd_fail in fs/userfaultfd.c. The inline stub in the header might override or conflict with that implementation. Consider consolidating to a single definition to avoid potential link-time or behavioral inconsistencies.

Suggested implementation:

Make sure the non-inline definition (implementation) of dup_userfaultfd_fail in fs/userfaultfd.c is maintained and that no duplicate declarations exist.

}

static inline void dup_userfaultfd_fail(struct list_head *l)
{
}

static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *ctx)
{
7 changes: 7 additions & 0 deletions kernel/events/uprobes.c
@@ -26,6 +26,10 @@
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
#include <linux/rcupdate_trace.h>
#include <linux/workqueue.h>
#include <linux/srcu.h>
#include <linux/oom.h> /* check_stable_address_space */

#include <linux/uprobes.h>

@@ -1053,6 +1057,9 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
goto free;

mmap_write_lock(mm);
if (check_stable_address_space(mm))
goto unlock;

vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
file_inode(vma->vm_file) != uprobe->inode)
76 changes: 52 additions & 24 deletions kernel/fork.c
@@ -651,14 +651,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
int retval;
unsigned long charge = 0;
LIST_HEAD(uf);
VMA_ITERATOR(old_vmi, oldmm, 0);
VMA_ITERATOR(vmi, mm, 0);

uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
retval = -EINTR;
goto fail_uprobe_end;
}
if (mmap_write_lock_killable(oldmm))
return -EINTR;
flush_cache_dup_mm(oldmm);
uprobe_dup_mmap(oldmm, mm);
/*
@@ -674,21 +670,22 @@
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;

retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
khugepaged_fork(mm, oldmm);

retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
if (retval)
/* Use __mt_dup() to efficiently build an identical maple tree. */
retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
if (unlikely(retval))
goto out;

mt_clear_in_rcu(vmi.mas.tree);
for_each_vma(old_vmi, mpnt) {
for_each_vma(vmi, mpnt) {
struct file *file;

vma_start_write(mpnt);
if (mpnt->vm_flags & VM_DONTCOPY) {
retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
mpnt->vm_end, GFP_KERNEL);
if (retval)
goto loop_out;

vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
@@ -750,9 +747,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (is_vm_hugetlb_page(tmp))
hugetlb_dup_vma_private(tmp);

/* Link the vma into the MT */
if (vma_iter_bulk_store(&vmi, tmp))
goto fail_nomem_vmi_store;
/*
* Link the vma into the MT. After using __mt_dup(), memory
* allocation is not necessary here, so it cannot fail.
*/
vma_iter_bulk_store(&vmi, tmp);

mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -761,26 +760,51 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);

if (retval)
if (retval) {
mpnt = vma_next(&vmi);
goto loop_out;
}
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
if (!retval)
if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
ksm_fork(mm, oldmm);
khugepaged_fork(mm, oldmm);
} else {

/*
* The entire maple tree has already been duplicated. If the
* mmap duplication fails, mark the failure point with
* XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
* stop releasing VMAs that have not been duplicated after this
* point.
*/
if (mpnt) {
mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
mas_store(&vmi.mas, XA_ZERO_ENTRY);
/* Avoid OOM iterating a broken tree */
set_bit(MMF_OOM_SKIP, &mm->flags);
}
/*
* The mm_struct is going to exit, but the locks will be dropped
* first. Set the mm_struct as unstable is advisable as it is
* not fully initialised.
*/
set_bit(MMF_UNSTABLE, &mm->flags);
}
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
mmap_write_unlock(oldmm);
dup_userfaultfd_complete(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
if (!retval)
dup_userfaultfd_complete(&uf);
else
dup_userfaultfd_fail(&uf);
return retval;

fail_nomem_vmi_store:
unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
@@ -1684,9 +1708,11 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;

uprobe_start_dup_mmap();
err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;
uprobe_end_dup_mmap();

mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
@@ -1701,6 +1727,8 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
mm->binfmt = NULL;
mm_init_owner(mm, NULL);
mmput(mm);
if (err)
uprobe_end_dup_mmap();

fail_nomem:
return NULL;
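For context on the XA_ZERO_ENTRY marker stored in the dup_mmap() error path above: the exit path is expected to stop walking at that marker so it never touches VMAs that were never duplicated. The sketch below is illustrative only (the helper name is made up and the real teardown lives elsewhere); it simply shows how a VMA walk can detect the marker with xa_is_zero().

#include <linux/mm.h>
#include <linux/xarray.h>

/* Caller is assumed to hold the mmap lock of @mm. */
static int count_duplicated_vmas(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);
	int nr = 0;

	for_each_vma(vmi, vma) {
		/* dup_mmap() stored XA_ZERO_ENTRY at the failure point. */
		if (xa_is_zero(vma))
			break;
		nr++;
	}
	return nr;
}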