From d67e03e361619b20c51aaef3b7dd1497617c371d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Sep 2021 11:23:38 -0500 Subject: exit: Factor coredump_exit_mm out of exit_mm Separate the coredump logic from the ordinary exit_mm logic by moving the coredump logic out of exit_mm into it's own function coredump_exit_mm. Link: https://lkml.kernel.org/r/87a6k2x277.fsf@disp2133 Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 76 ++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 35 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 91a43e57a32e..cb1619d8fd64 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -339,6 +339,46 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } +static void coredump_exit_mm(struct mm_struct *mm) +{ + struct core_state *core_state; + + /* + * Serialize with any possible pending coredump. + * We must hold mmap_lock around checking core_state + * and clearing tsk->mm. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group with ->mm != NULL. + */ + core_state = mm->core_state; + if (core_state) { + struct core_thread self; + + mmap_read_unlock(mm); + + self.task = current; + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + freezable_schedule(); + } + __set_current_state(TASK_RUNNING); + mmap_read_lock(mm); + } +} + #ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. @@ -434,47 +474,13 @@ assign_new_owner: static void exit_mm(void) { struct mm_struct *mm = current->mm; - struct core_state *core_state; exit_mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); - /* - * Serialize with any possible pending coredump. - * We must hold mmap_lock around checking core_state - * and clearing tsk->mm. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group with ->mm != NULL. - */ mmap_read_lock(mm); - core_state = mm->core_state; - if (core_state) { - struct core_thread self; - - mmap_read_unlock(mm); - - self.task = current; - if (self.task->flags & PF_SIGNALED) - self.next = xchg(&core_state->dumper.next, &self); - else - self.task = NULL; - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!self.task) /* see coredump_finish() */ - break; - freezable_schedule(); - } - __set_current_state(TASK_RUNNING); - mmap_read_lock(mm); - } + coredump_exit_mm(mm); mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ -- cgit v1.2.3 From 92307383082daff5df884a25df9e283efb7ef261 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Sep 2021 11:33:50 -0500 Subject: coredump: Don't perform any cleanups before dumping core Rename coredump_exit_mm to coredump_task_exit and call it from do_exit before PTRACE_EVENT_EXIT, and before any cleanup work for a task happens. This ensures that an accurate copy of the process can be captured in the coredump as no cleanup for the process happens before the coredump completes. This also ensures that PTRACE_EVENT_EXIT will not be visited by any thread until the coredump is complete. Add a new flag PF_POSTCOREDUMP so that tasks that have passed through coredump_task_exit can be recognized and ignored in zap_process. Now that all of the coredumping happens before exit_mm remove code to test for a coredump in progress from mm_release. Replace "may_ptrace_stop()" with a simple test of "current->ptrace". The other tests in may_ptrace_stop all concern avoiding stopping during a coredump. These tests are no longer necessary as it is now guaranteed that fatal_signal_pending will be set if the code enters ptrace_stop during a coredump. The code in ptrace_stop is guaranteed not to stop if fatal_signal_pending returns true. Until this change "ptrace_event(PTRACE_EVENT_EXIT)" could call ptrace_stop without fatal_signal_pending being true, as signals are dequeued in get_signal before calling do_exit. This is no longer an issue as "ptrace_event(PTRACE_EVENT_EXIT)" is no longer reached until after the coredump completes. Link: https://lkml.kernel.org/r/874kaax26c.fsf@disp2133 Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index cb1619d8fd64..774e6b5061b8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -339,23 +339,29 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -static void coredump_exit_mm(struct mm_struct *mm) +static void coredump_task_exit(struct task_struct *tsk) { struct core_state *core_state; + struct mm_struct *mm; + + mm = tsk->mm; + if (!mm) + return; /* * Serialize with any possible pending coredump. * We must hold mmap_lock around checking core_state - * and clearing tsk->mm. The core-inducing thread + * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the - * group with ->mm != NULL. + * group without PF_POSTCOREDUMP set. */ + mmap_read_lock(mm); + tsk->flags |= PF_POSTCOREDUMP; core_state = mm->core_state; + mmap_read_unlock(mm); if (core_state) { struct core_thread self; - mmap_read_unlock(mm); - self.task = current; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); @@ -375,7 +381,6 @@ static void coredump_exit_mm(struct mm_struct *mm) freezable_schedule(); } __set_current_state(TASK_RUNNING); - mmap_read_lock(mm); } } @@ -480,7 +485,6 @@ static void exit_mm(void) return; sync_mm_rss(mm); mmap_read_lock(mm); - coredump_exit_mm(mm); mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ @@ -768,6 +772,7 @@ void __noreturn do_exit(long code) profile_task_exit(tsk); kcov_task_exit(tsk); + coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); -- cgit v1.2.3 From 0258b5fd7c7124b87e185a1a9322d2c66b1876b7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 22 Sep 2021 11:24:02 -0500 Subject: coredump: Limit coredumps to a single thread group Today when a signal is delivered with a handler of SIG_DFL whose default behavior is to generate a core dump not only that process but every process that shares the mm is killed. In the case of vfork this looks like a real world problem. Consider the following well defined sequence. if (vfork() == 0) { execve(...); _exit(EXIT_FAILURE); } If a signal that generates a core dump is received after vfork but before the execve changes the mm the process that called vfork will also be killed (as the mm is shared). Similarly if the execve fails after the point of no return the kernel delivers SIGSEGV which will kill both the exec'ing process and because the mm is shared the process that called vfork as well. As far as I can tell this behavior is a violation of people's reasonable expectations, POSIX, and is unnecessarily fragile when the system is low on memory. Solve this by making a userspace visible change to only kill a single process/thread group. This is possible because Jann Horn recently modified[1] the coredump code so that the mm can safely be modified while the coredump is happening. With LinuxThreads long gone I don't expect anyone to have a notice this behavior change in practice. To accomplish this move the core_state pointer from mm_struct to signal_struct, which allows different thread groups to coredump simultatenously. In zap_threads remove the work to kill anything except for the current thread group. v2: Remove core_state from the VM_BUG_ON_MM print to fix compile failure when CONFIG_DEBUG_VM is enabled. Reported-by: Stephen Rothwell [1] a07279c9a8cd ("binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot") Fixes: d89f3847def4 ("[PATCH] thread-aware coredumps, 2.5.43-C3") History-tree: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Link: https://lkml.kernel.org/r/87y27mvnke.fsf@disp2133 Link: https://lkml.kernel.org/r/20211007144701.67592574@canb.auug.org.au Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 774e6b5061b8..2b355e926c13 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -342,23 +342,18 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) static void coredump_task_exit(struct task_struct *tsk) { struct core_state *core_state; - struct mm_struct *mm; - - mm = tsk->mm; - if (!mm) - return; /* * Serialize with any possible pending coredump. - * We must hold mmap_lock around checking core_state + * We must hold siglock around checking core_state * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the * group without PF_POSTCOREDUMP set. */ - mmap_read_lock(mm); + spin_lock_irq(&tsk->sighand->siglock); tsk->flags |= PF_POSTCOREDUMP; - core_state = mm->core_state; - mmap_read_unlock(mm); + core_state = tsk->signal->core_state; + spin_unlock_irq(&tsk->sighand->siglock); if (core_state) { struct core_thread self; -- cgit v1.2.3