From: Oleg Nesterov <oleg@redhat.com>
To: Mateusz Guzik <mjguzik@gmail.com>
Cc: brauner@kernel.org, akpm@linux-foundation.org,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	"Eric W. Biederman" <ebiederm@xmission.com>
Subject: Re: [PATCH v2] exit: perform randomness and pid work without tasklist_lock
Date: Tue, 28 Jan 2025 19:29:33 +0100	[thread overview]
Message-ID: <20250128182932.GC24845@redhat.com> (raw)
In-Reply-To: <20250128160743.3142544-1-mjguzik@gmail.com>

(Add Eric).


On 01/28, Mateusz Guzik wrote:
>
> Both add_device_randomness() and attach_pid()/detach_pid()

So afaics this patch does 2 different things, and I do think this needs
2 separate patches. Can you split this change please?

As for add_device_randomness(). I must have missed something, but I still can't
understand why we can't simply shift add_device_randomness(p->se.sum_exec_runtime)
to release_task() and avoid release_task_post->randomness.

You said:

	I wanted to keep the load where it was

but why??? Again, I must have missed something, but to me this simply adds
unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
task-to-release != current, so what is the point?
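
IOW, can't release_task() just do something like the (untested and incomplete)
sketch below right after write_unlock_irq(), and kill the ->randomness member?

	write_unlock_irq(&tasklist_lock);

	/* ->sum_exec_runtime is not stable anyway, so read it after the unlock */
	add_device_randomness((const void *)&p->se.sum_exec_runtime,
			      sizeof(p->se.sum_exec_runtime));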

Oleg.

> have their
> own synchronisation mechanisms.
>
> The clone side calls them *without* the tasklist_lock held, meaning
> parallel calls can contend on their locks.
>
> The exit side calls them *with* the tasklist_lock held, which means the
> hold time is avoidably extended by waiting on either of the 2 locks, in
> turn exacerbating contention on tasklist_lock itself.
>
> Postponing the work until after the lock is dropped bumps the thread
> creation/destruction rate by 15% on a 24-core VM.
>
> Bench (plop into will-it-scale):
> $ cat tests/threadspawn1.c
>
> #include <pthread.h>
> #include <assert.h>
> #include <stddef.h>
>
> char *testcase_description = "Thread creation and teardown";
>
> static void *worker(void *arg)
> {
>         return (NULL);
> }
>
> void testcase(unsigned long long *iterations, unsigned long nr)
> {
>         pthread_t thread;
>         int error;
>
>         while (1) {
>                 error = pthread_create(&thread, NULL, worker, NULL);
>                 assert(error == 0);
>                 error = pthread_join(thread, NULL);
>                 assert(error == 0);
>                 (*iterations)++;
>         }
> }
>
> Run:
> $ ./threadspawn1_processes -t 24
>
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
>
> v2:
> - introduce a struct to collect work
> - move out pids as well
>
> There is more that can be pulled out.
>
> This may look suspicious:
> +	proc_flush_pid(p->thread_pid);
>
> AFAICS p->thread_pid is constant for the duration of the task's lifetime, so I
> don't think there is a problem.
>
>  include/linux/pid.h |  1 +
>  kernel/exit.c       | 53 +++++++++++++++++++++++++++++++++------------
>  kernel/pid.c        | 23 +++++++++++++++-----
>  3 files changed, 58 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 98837a1ff0f3..6e9fcacd02cd 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -101,6 +101,7 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
>   * these helpers must be called with the tasklist_lock write-held.
>   */
>  extern void attach_pid(struct task_struct *task, enum pid_type);
> +extern struct pid *detach_pid_return(struct task_struct *task, enum pid_type);
>  extern void detach_pid(struct task_struct *task, enum pid_type);
>  extern void change_pid(struct task_struct *task, enum pid_type,
>  			struct pid *pid);
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 1dcddfe537ee..4e452d3e3a89 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -122,14 +122,23 @@ static __init int kernel_exit_sysfs_init(void)
>  late_initcall(kernel_exit_sysfs_init);
>  #endif
>
> -static void __unhash_process(struct task_struct *p, bool group_dead)
> +/*
> + * For things release_task would like to do *after* tasklist_lock is released.
> + */
> +struct release_task_post {
> +	unsigned long long randomness;
> +	struct pid *pids[PIDTYPE_MAX];
> +};
> +
> +static void __unhash_process(struct release_task_post *post, struct task_struct *p,
> +			     bool group_dead)
>  {
>  	nr_threads--;
> -	detach_pid(p, PIDTYPE_PID);
> +	post->pids[PIDTYPE_PID] = detach_pid_return(p, PIDTYPE_PID);
>  	if (group_dead) {
> -		detach_pid(p, PIDTYPE_TGID);
> -		detach_pid(p, PIDTYPE_PGID);
> -		detach_pid(p, PIDTYPE_SID);
> +		post->pids[PIDTYPE_TGID] = detach_pid_return(p, PIDTYPE_TGID);
> +		post->pids[PIDTYPE_PGID] = detach_pid_return(p, PIDTYPE_PGID);
> +		post->pids[PIDTYPE_SID] = detach_pid_return(p, PIDTYPE_SID);
>
>  		list_del_rcu(&p->tasks);
>  		list_del_init(&p->sibling);
> @@ -141,7 +150,8 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
>  /*
>   * This function expects the tasklist_lock write-locked.
>   */
> -static void __exit_signal(struct task_struct *tsk)
> +static void __exit_signal(struct release_task_post *post,
> +			  struct task_struct *tsk)
>  {
>  	struct signal_struct *sig = tsk->signal;
>  	bool group_dead = thread_group_leader(tsk);
> @@ -174,8 +184,7 @@ static void __exit_signal(struct task_struct *tsk)
>  			sig->curr_target = next_thread(tsk);
>  	}
>
> -	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
> -			      sizeof(unsigned long long));
> +	post->randomness = tsk->se.sum_exec_runtime;
>
>  	/*
>  	 * Accumulate here the counters for all threads as they die. We could
> @@ -197,7 +206,7 @@ static void __exit_signal(struct task_struct *tsk)
>  	task_io_accounting_add(&sig->ioac, &tsk->ioac);
>  	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
>  	sig->nr_threads--;
> -	__unhash_process(tsk, group_dead);
> +	__unhash_process(post, tsk, group_dead);
>  	write_sequnlock(&sig->stats_lock);
>
>  	/*
> @@ -240,9 +249,13 @@ void __weak release_thread(struct task_struct *dead_task)
>  void release_task(struct task_struct *p)
>  {
>  	struct task_struct *leader;
> -	struct pid *thread_pid;
>  	int zap_leader;
> +	struct release_task_post post;
>  repeat:
> +	memset(&post, 0, sizeof(post));
> +
> +	proc_flush_pid(p->thread_pid);
> +
>  	/* don't need to get the RCU readlock here - the process is dead and
>  	 * can't be modifying its own credentials. But shut RCU-lockdep up */
>  	rcu_read_lock();
> @@ -253,8 +266,7 @@ void release_task(struct task_struct *p)
>
>  	write_lock_irq(&tasklist_lock);
>  	ptrace_release_task(p);
> -	thread_pid = get_pid(p->thread_pid);
> -	__exit_signal(p);
> +	__exit_signal(&post, p);
>
>  	/*
>  	 * If we are the last non-leader member of the thread
> @@ -276,8 +288,21 @@ void release_task(struct task_struct *p)
>  	}
>
>  	write_unlock_irq(&tasklist_lock);
> -	proc_flush_pid(thread_pid);
> -	put_pid(thread_pid);
> +
> +	/*
> +	 * Process clean up deferred to after we drop the tasklist lock.
> +	 */
> +	add_device_randomness((const void*) &post.randomness,
> +			      sizeof(unsigned long long));
> +	if (post.pids[PIDTYPE_PID])
> +		free_pid(post.pids[PIDTYPE_PID]);
> +	if (post.pids[PIDTYPE_TGID])
> +		free_pid(post.pids[PIDTYPE_TGID]);
> +	if (post.pids[PIDTYPE_PGID])
> +		free_pid(post.pids[PIDTYPE_PGID]);
> +	if (post.pids[PIDTYPE_SID])
> +		free_pid(post.pids[PIDTYPE_SID]);
> +
>  	release_thread(p);
>  	put_task_struct_rcu_user(p);
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 3a10a7b6fcf8..047cdbcef5cf 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -343,7 +343,7 @@ void attach_pid(struct task_struct *task, enum pid_type type)
>  	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
>  }
>
> -static void __change_pid(struct task_struct *task, enum pid_type type,
> +static struct pid *__change_pid(struct task_struct *task, enum pid_type type,
>  			struct pid *new)
>  {
>  	struct pid **pid_ptr = task_pid_ptr(task, type);
> @@ -362,20 +362,33 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
>
>  	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
>  		if (pid_has_task(pid, tmp))
> -			return;
> +			return NULL;
>
> -	free_pid(pid);
> +	return pid;
> +}
> +
> +struct pid *detach_pid_return(struct task_struct *task, enum pid_type type)
> +{
> +	return __change_pid(task, type, NULL);
>  }
>
>  void detach_pid(struct task_struct *task, enum pid_type type)
>  {
> -	__change_pid(task, type, NULL);
> +	struct pid *pid;
> +
> +	pid = detach_pid_return(task, type);
> +	if (pid)
> +		free_pid(pid);
>  }
>
>  void change_pid(struct task_struct *task, enum pid_type type,
>  		struct pid *pid)
>  {
> -	__change_pid(task, type, pid);
> +	struct pid *opid;
> +
> +	opid = __change_pid(task, type, pid);
> +	if (opid)
> +		free_pid(opid);
>  	attach_pid(task, type);
>  }
>
> --
> 2.43.0
>


