* [PATCH v2] exit: perform randomness and pid work without tasklist_lock
@ 2025-01-28 16:07 Mateusz Guzik
From: Mateusz Guzik @ 2025-01-28 16:07 UTC
To: brauner, oleg, akpm; +Cc: linux-mm, linux-kernel, Mateusz Guzik
Both add_device_randomness() and attach_pid()/detach_pid() have their
own synchronisation mechanisms.
The clone side calls them *without* the tasklist_lock held, meaning
parallel calls can contend on their locks.
The exit side calls them *with* tasklist_lock held, which means the
hold time is avoidably extended by waiting on either of the two locks,
in turn exacerbating contention on tasklist_lock itself.
Postponing the work until after the lock is dropped bumps the thread
creation/destruction rate by 15% on a 24-core VM.
Bench (plop into will-it-scale):
$ cat tests/threadspawn1.c

#include <assert.h>
#include <pthread.h>
#include <stddef.h>

char *testcase_description = "Thread creation and teardown";

static void *worker(void *arg)
{
	return (NULL);
}

void testcase(unsigned long long *iterations, unsigned long nr)
{
	pthread_t thread;
	int error;

	while (1) {
		error = pthread_create(&thread, NULL, worker, NULL);
		assert(error == 0);
		error = pthread_join(thread, NULL);
		assert(error == 0);
		(*iterations)++;
	}
}
Run:
$ ./threadspawn1_processes -t 24
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
v2:
- introduce a struct to collect work
- move out pids as well
There is more which can be pulled out.

This may look suspicious:
+ proc_flush_pid(p->thread_pid);

AFAICS this is constant for the duration of the task's lifetime, so I
don't think there is a problem.
include/linux/pid.h | 1 +
kernel/exit.c | 53 +++++++++++++++++++++++++++++++++------------
kernel/pid.c | 23 +++++++++++++++-----
3 files changed, 58 insertions(+), 19 deletions(-)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 98837a1ff0f3..6e9fcacd02cd 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -101,6 +101,7 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
* these helpers must be called with the tasklist_lock write-held.
*/
extern void attach_pid(struct task_struct *task, enum pid_type);
+extern struct pid *detach_pid_return(struct task_struct *task, enum pid_type);
extern void detach_pid(struct task_struct *task, enum pid_type);
extern void change_pid(struct task_struct *task, enum pid_type,
struct pid *pid);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1dcddfe537ee..4e452d3e3a89 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -122,14 +122,23 @@ static __init int kernel_exit_sysfs_init(void)
late_initcall(kernel_exit_sysfs_init);
#endif
-static void __unhash_process(struct task_struct *p, bool group_dead)
+/*
+ * For things release_task would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+ unsigned long long randomness;
+ struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+ bool group_dead)
{
nr_threads--;
- detach_pid(p, PIDTYPE_PID);
+ post->pids[PIDTYPE_PID] = detach_pid_return(p, PIDTYPE_PID);
if (group_dead) {
- detach_pid(p, PIDTYPE_TGID);
- detach_pid(p, PIDTYPE_PGID);
- detach_pid(p, PIDTYPE_SID);
+ post->pids[PIDTYPE_TGID] = detach_pid_return(p, PIDTYPE_TGID);
+ post->pids[PIDTYPE_PGID] = detach_pid_return(p, PIDTYPE_PGID);
+ post->pids[PIDTYPE_SID] = detach_pid_return(p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
@@ -141,7 +150,8 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
/*
* This function expects the tasklist_lock write-locked.
*/
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post,
+ struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
@@ -174,8 +184,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->curr_target = next_thread(tsk);
}
- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
- sizeof(unsigned long long));
+ post->randomness = tsk->se.sum_exec_runtime;
/*
* Accumulate here the counters for all threads as they die. We could
@@ -197,7 +206,7 @@ static void __exit_signal(struct task_struct *tsk)
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
- __unhash_process(tsk, group_dead);
+ __unhash_process(post, tsk, group_dead);
write_sequnlock(&sig->stats_lock);
/*
@@ -240,9 +249,13 @@ void __weak release_thread(struct task_struct *dead_task)
void release_task(struct task_struct *p)
{
struct task_struct *leader;
- struct pid *thread_pid;
int zap_leader;
+ struct release_task_post post;
repeat:
+ memset(&post, 0, sizeof(post));
+
+ proc_flush_pid(p->thread_pid);
+
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
@@ -253,8 +266,7 @@ void release_task(struct task_struct *p)
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
- thread_pid = get_pid(p->thread_pid);
- __exit_signal(p);
+ __exit_signal(&post, p);
/*
* If we are the last non-leader member of the thread
@@ -276,8 +288,21 @@ void release_task(struct task_struct *p)
}
write_unlock_irq(&tasklist_lock);
- proc_flush_pid(thread_pid);
- put_pid(thread_pid);
+
+ /*
+ * Process clean up deferred to after we drop the tasklist lock.
+ */
+ add_device_randomness((const void*) &post.randomness,
+ sizeof(unsigned long long));
+ if (post.pids[PIDTYPE_PID])
+ free_pid(post.pids[PIDTYPE_PID]);
+ if (post.pids[PIDTYPE_TGID])
+ free_pid(post.pids[PIDTYPE_TGID]);
+ if (post.pids[PIDTYPE_PGID])
+ free_pid(post.pids[PIDTYPE_PGID]);
+ if (post.pids[PIDTYPE_SID])
+ free_pid(post.pids[PIDTYPE_SID]);
+
release_thread(p);
put_task_struct_rcu_user(p);
diff --git a/kernel/pid.c b/kernel/pid.c
index 3a10a7b6fcf8..047cdbcef5cf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -343,7 +343,7 @@ void attach_pid(struct task_struct *task, enum pid_type type)
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}
-static void __change_pid(struct task_struct *task, enum pid_type type,
+static struct pid *__change_pid(struct task_struct *task, enum pid_type type,
struct pid *new)
{
struct pid **pid_ptr = task_pid_ptr(task, type);
@@ -362,20 +362,33 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (pid_has_task(pid, tmp))
- return;
+ return NULL;
- free_pid(pid);
+ return pid;
+}
+
+struct pid *detach_pid_return(struct task_struct *task, enum pid_type type)
+{
+ return __change_pid(task, type, NULL);
}
void detach_pid(struct task_struct *task, enum pid_type type)
{
- __change_pid(task, type, NULL);
+ struct pid *pid;
+
+ pid = detach_pid_return(task, type);
+ if (pid)
+ free_pid(pid);
}
void change_pid(struct task_struct *task, enum pid_type type,
struct pid *pid)
{
- __change_pid(task, type, pid);
+ struct pid *opid;
+
+ opid = __change_pid(task, type, pid);
+ if (opid)
+ free_pid(opid);
attach_pid(task, type);
}
--
2.43.0
* Re: [PATCH v2] exit: perform randomness and pid work without tasklist_lock
From: Oleg Nesterov @ 2025-01-28 18:29 UTC
To: Mateusz Guzik; +Cc: brauner, akpm, linux-mm, linux-kernel, Eric W. Biederman
(Add Eric).
On 01/28, Mateusz Guzik wrote:
>
> Both add_device_randomness() and attach_pid()/detach_pid()
So afaics this patch does 2 different things, and I do think this needs
2 separate patches. Can you split this change please?
As for add_device_randomness(). I must have missed something, but I still can't
understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
to release_task() and avoid release_task_post->randomness.
You said:
I wanted to keep the load where it was
but why??? Again, I must have missed something, but to me this simply adds the
unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
task-to-release != current, so what is the point?
Oleg.
> [... full v2 patch quoted, snipped ...]
* Re: [PATCH v2] exit: perform randomness and pid work without tasklist_lock
From: Mateusz Guzik @ 2025-01-28 18:38 UTC
To: Oleg Nesterov; +Cc: brauner, akpm, linux-mm, linux-kernel, Eric W. Biederman
On Tue, Jan 28, 2025 at 7:30 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> (Add Eric).
>
>
> On 01/28, Mateusz Guzik wrote:
> >
> > Both add_device_randomness() and attach_pid()/detach_pid()
>
> So afaics this patch does 2 different things, and I do think this needs
> 2 separate patches. Can you split this change please?
>
No problem, I will send a v3 provided there are no issues reported
concerning the pid stuff.
Maybe I'll add a few more things pulled out to further justify the struct.
> As for add_device_randomness(). I must have missed something, but I still can't
> understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
> to release_task() and avoid release_task_post->randomness.
>
> You said:
>
> I wanted to keep the load where it was
>
> but why??? Again, I must have missed something, but to me this simply adds the
> unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
> task-to-release != current, so what is the point?
>
Perhaps I should preface this by saying it is not a hill I'm going to die on. :->
This is the spot which is known to work and release_task does not
access the area otherwise. So for all I know someone will change it
later to be freeable, zeroed for "hardening" or some other crap, and
the read moved to later will quietly break and always add the same
value. So by default I don't want to change this aspect.
However, if you insist on just moving the read, I'll be more than
happy to do that in v3.
--
Mateusz Guzik <mjguzik gmail.com>
* Re: [PATCH v2] exit: perform randomness and pid work without tasklist_lock
From: Oleg Nesterov @ 2025-01-28 19:22 UTC
To: Mateusz Guzik; +Cc: brauner, akpm, linux-mm, linux-kernel, Eric W. Biederman
On 01/28, Mateusz Guzik wrote:
>
> On Tue, Jan 28, 2025 at 7:30 PM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> no problem, will send a v3 provided there are no issues reported
> concerning the pid stuff
Great, thanks.
BTW, I didn't look at the pid stuff yet, I _feel_ that this can be simplified
too, but I am already sleeping, most probably I am wrong.
> > As for add_device_randomness(). I must have missed something, but I still can't
> > understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
> > to release_task() and avoid release_task_post->randomness.
> >
> > You said:
> >
> > I wanted to keep the load where it was
> >
> > but why??? Again, I must have missed something, but to me this simply adds the
> > unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
> > task-to-release != current, so what is the point?
> >
>
> Perhaps I should preface this by saying it is not a hill I'm going to die on. :->
>
> This is the spot which is known to work and release_task does not
> access the area otherwise. So for all I know someone will change it
> later to be freeable, zeroed for "hardening"
sum_exec_runtime can't be freed/zeroed/etc in any case.
Again, please note that task-to-release can still be running, especially
if it is current.
> always add the same value.
I don't think that "add the same value" does matter at all in this case,
but please correct me.
> So by default I don't want to change this aspect.
>
> However, if you insist on just moving the read, I'll be more than
> happy to do that in v3.
Thanks, see above ;)
Oleg.
* Re: [PATCH v2] exit: perform randomness and pid work without tasklist_lock
From: Mateusz Guzik @ 2025-01-30 11:01 UTC
To: Oleg Nesterov; +Cc: brauner, akpm, linux-mm, linux-kernel, Eric W. Biederman
On Tue, Jan 28, 2025 at 8:22 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 01/28, Mateusz Guzik wrote:
> >
> > On Tue, Jan 28, 2025 at 7:30 PM Oleg Nesterov <oleg@redhat.com> wrote:
> > >
> > No problem, I will send a v3 provided there are no issues reported
> > concerning the pid stuff.
>
> Great, thanks.
>
> BTW, I didn't look at the pid stuff yet, I _feel_ that this can be simplified
> too, but I am already sleeping, most probably I am wrong.
>
I looked at the pid code apart from the issue at hand.
The lock protecting it uses irq disablement to guard against
tasklist_lock users coming from an interrupt.
AFAICS this can be legally arranged so that the pidmap_lock is *never*
taken while tasklist_lock is held.
So far the problematic ordering only stems from free_pid calls (not
only on exit), which can all be moved out.
This will reduce the total tasklist_lock hold time *and* whack the irq
trip, speeding this up even single-threaded.
I'll hack it up when I get around to it, maybe this week.
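Roughly the shape that would take -- a hypothetical sketch, not an
actual patch; the body of free_pid() is elided and the
lockdep_assert_not_held() annotation is only illustrative:

/*
 * Hypothetical end state: once no caller invokes free_pid() with
 * tasklist_lock held, pidmap_lock never nests inside it and no longer
 * needs to disable interrupts on this path.
 */
void free_pid(struct pid *pid)
{
	/* illustrative assertion of the new lock ordering rule */
	lockdep_assert_not_held(&tasklist_lock);

	spin_lock(&pidmap_lock);	/* was: spin_lock_irqsave() */
	/* ... unhash the pid numbers as before ... */
	spin_unlock(&pidmap_lock);
}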
BTW, with the current patch, when rolling with highly parallel thread
creation/destruction it is pidmap_lock which is the main bottleneck
instead of tasklist_lock.
--
Mateusz Guzik <mjguzik gmail.com>
* Re: [PATCH v2] exit: perform randomness and pid work without tasklist_lock
From: Eric W. Biederman @ 2025-01-31 20:55 UTC
To: Mateusz Guzik; +Cc: brauner, oleg, akpm, linux-mm, linux-kernel
Oleg asked that I take a look, and I took one.
I very much agree with Oleg that this should be one patch per thing
you want to effect, as the issues can be intricate in this part of
the code.
Moving proc_flush_pid inside of tasklist_lock is a bad idea. The code
has previously been several kinds of a sore spot. If you look at
proc_invalidate_siblings_dcache you can see calls to d_invalidate,
deactivate_super, and a few other vfs calls that could potentially do
quite a lot of work and potentially take a number of locks. It has been
a long time, but I remember from when we used to flush the proc entries
under the tasklist_lock that there were actual deadlocks, caused by some
rare code paths that were trying to free memory in order to allocate
memory and make progress.
It is wrong that attach_pid/detach_pid can be performed without the
tasklist_lock. There are reasonable guarantees provided by the posix
standard that the set of processes sent a signal is the set of
processes at a point in time. The tasklist_lock is how we provide
those guarantees currently.
There are two more layers to pids. The pid number allocation of
alloc_pid/free_pid, and the struct pid layer maintained by get_pid,
put_pid. Those two layers don't need the tasklist_lock.
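For orientation, those two layers plus the tasklist_lock one above,
sketched with their current entry points -- this assumes the modern
three-argument alloc_pid() signature, and the surrounding locking is
purely illustrative:

/* Layer 1: pid number allocation -- pidmap_lock only, no tasklist_lock. */
struct pid *pid = alloc_pid(ns, NULL, 0);	/* paired with free_pid(pid) */

/* Layer 2: struct pid lifetime -- plain refcounting, no tasklist_lock. */
get_pid(pid);
put_pid(pid);

/* Layer 3: task <-> pid linkage -- this is what tasklist_lock guards,
 * and what backs the point-in-time signal delivery guarantee above. */
write_lock_irq(&tasklist_lock);
attach_pid(task, PIDTYPE_PID);
write_unlock_irq(&tasklist_lock);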
It is safe to move free_pid out of tasklist_lock. I am not certain
how sane it is.
Eric
* Re: [PATCH v2] exit: perform randomness and pid work without tasklist_lock
From: Mateusz Guzik @ 2025-01-31 22:31 UTC
To: Eric W. Biederman; +Cc: brauner, oleg, akpm, linux-mm, linux-kernel
On Fri, Jan 31, 2025 at 9:56 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
> Moving proc_flush_pid inside of tasklist_lock is a bad idea.
The patch does not make such a change though.
The call is still performed without the lock, but it also dodges the
additional refcount dance (and notably eliminates an atomic from an
area protected by tasklist_lock).
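For reference, the relevant lines from the diff above, condensed:

-	thread_pid = get_pid(p->thread_pid);	/* ref bump under tasklist_lock */
	...
-	proc_flush_pid(thread_pid);
-	put_pid(thread_pid);			/* atomic drop after unlock */
+	proc_flush_pid(p->thread_pid);		/* now runs before the lock, no get/put */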
>
> It is wrong that attach_pid/detach_pid can be performed without the
> tasklist_lock. There are reasonable guarantees provided by the posix
> standard that the set of processes sent a signal is the set of
> processes at a point in time. The tasklist_lock is how we provide
> those guarantees currently.
>
I don't see anything calling these without the lock, and neither my
patch nor a follow-up about pids suggests anything of the sort.
> There are two more layers to pids. The pid number allocation of
> alloc_pid/free_pid, and the struct pid layer maintained by get_pid,
> put_pid. Those two layers don't need the tasklist_lock.
>
>
> It is safe to move free_pid out of tasklist_lock. I am not certain
> how sane it is.
>
Where is the sanity problem here? AFAICS this just delays some wakeups
in the worst case.
Regardless, it looks like I have enough to send a v3 for further commentary.
--
Mateusz Guzik <mjguzik gmail.com>
* Re: [PATCH v2] exit: perform randomness and pid work without tasklist_lock
From: Eric W. Biederman @ 2025-01-31 23:09 UTC
To: Mateusz Guzik; +Cc: brauner, oleg, akpm, linux-mm, linux-kernel
Mateusz Guzik <mjguzik@gmail.com> writes:
> On Fri, Jan 31, 2025 at 9:56 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>> Moving proc_flush_pid inside of tasklist_lock is a bad idea.
>
> The patch does not make such a change though.
>
> The call is still performed without the lock, but it also dodges the
> additional refcount dance (and notably eliminates an atomic from an
> area protected by tasklist_lock).
My mistake; I saw you had moved it up, but I had missed just how
far.
It is still a bad idea to move it early, as that has caused problems
with lingering proc entries for reaped tasks clogging up the dcache.
>> It is wrong that attach_pid/detach_pid can be performed without the
>> tasklist_lock. There are reasonable guarantees provided by the posix
>> standard that the set of processes sent a signal is the set of
>> processes at a point in time. The tasklist_lock is how we provide
>> those guarantees currently.
>>
>
> I don't see anything calling these without the lock, and neither my
> patch nor a follow-up about pids suggests anything of the sort.
No. You simply said fork called attach_pid without the lock and
your description implied it was safe to call attach_pid/detach_pid
without the lock.
>> There are two more layers to pids. The pid number allocation of
>> alloc_pid/free_pid, and the struct pid layer maintained by get_pid,
>> put_pid. Those two layers don't need the tasklist_lock.
>>
>>
>> It is safe to move free_pid out of tasklist_lock. I am not certain
>> how sane it is.
>>
>
> Where is the sanity problem here? AFAICS this just delays some wakeups
> in the worst case.
At the end of the day it might be a sensible way to go. I just haven't
found the arguments to convince my gut of that yet. It is important for
me at least to convince my gut because it usually notices problems
before the rest of me does.
Eric
* Re: [PATCH v2] exit: perform randomness and pid work without tasklist_lock
From: Mateusz Guzik @ 2025-02-01 14:03 UTC
To: Eric W. Biederman; +Cc: brauner, oleg, akpm, linux-mm, linux-kernel
On Sat, Feb 1, 2025 at 12:09 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Mateusz Guzik <mjguzik@gmail.com> writes:
>
> > On Fri, Jan 31, 2025 at 9:56 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >> Moving proc_flush_pid inside of tasklist_lock is a bad idea.
> >
> > The patch does not make such a change though.
> >
> > The call is still performed without the lock, but it also dodges the
> > additional refcount dance (and notably eliminates an atomic from an
> > area protected by tasklist_lock).
>
> My mistake; I saw you had moved it up, but I had missed just how
> far.
>
> It is still a bad idea to move it early, as that has caused problems
> with lingering proc entries for reaped tasks clogging up the dcache.
>
I would argue the time window to find the about-to-be-whacked task is
not big, but this part is not important enough for me to push for it.
So I'm going to drop this bit for now.
> >> It is wrong that attach_pid/detach_pid can be performed without the
> >> tasklist_lock. There are reasonable guarantees provided by the posix
> >> standard that the set of processes sent a signal is the set of
> >> processes at a point in time. The tasklist_lock is how we provide
> >> those guarantees currently.
> >>
> >
> > I don't see anything calling these without the lock and neither my
> > patch nor a follow up about pids suggest anything of the sort.
>
> No. You simply said fork called attach_pid without the lock and
> your description implied it was safe to call attach_pid/detach_pid
> without the lock.
>
Huh, indeed that's how it reads. That's very poorly stated at best, my bad.
The key point was that *allocating* a pid happens without the
tasklist_lock (but with pidmap_lock), and the part which gets rid of it
(detach_pid -> free_pid) operates under both.
As you can see the patch keeps detach_pid inside the
tasklist_lock-protected area.
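I.e. the clone side today looks roughly like this (paraphrased from
copy_process(), details elided):

	/* the pid *number* is allocated with only pidmap_lock held */
	pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
			args->set_tid_size);
	...
	write_lock_irq(&tasklist_lock);
	/* the task <-> pid linkage is still made under tasklist_lock */
	attach_pid(p, PIDTYPE_PID);
	write_unlock_irq(&tasklist_lock);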
> >> It is safe to move free_pid out of tasklist_lock. I am not certain
> >> how sane it is.
> >>
> >
> > Where is the sanity problem here? AFAICS this just delays some wakeups
> > in the worst case.
>
> At the end of the day it might be a sensible way to go. I just haven't
> found the arguments to convince my gut of that yet. It is important for
> me at least to convince my gut because it usually notices problems
> before the rest of me does.
>
There is definitely no rush.
I'm going to cook v3 if only just for fun.
--
Mateusz Guzik <mjguzik gmail.com>