linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
@ 2023-12-25  8:19 Maria Yu
  2023-12-25  8:26 ` Aiqun Yu (Maria)
  2024-01-03 14:04 ` Jarkko Sakkinen
  0 siblings, 2 replies; 13+ messages in thread
From: Maria Yu @ 2023-12-25  8:19 UTC (permalink / raw)
  To: ebiederm
  Cc: Maria Yu, kernel, quic_pkondeti, keescook, viro, brauner, oleg,
	dhowells, jarkko, paul, jmorris, serge, linux-mm, linux-fsdevel,
	linux-kernel, keyrings, linux-security-module, linux-arm-msm

tasklist_lock is an rwlock, and there are multiple scenarios in which
readers hold the read lock and a writer has to wait for them.
In freeze_processes/thaw_processes the read lock of tasklist_lock can be
held for 200+ms while walking and freezing/thawing tasks on commercial
devices. Meanwhile write_lock_irq disables preemption and local irqs and
spins until tasklist_lock can be acquired. This leads to poor
responsiveness of the system.
Take an example:
1. cpu0 is holding read lock of tasklist_lock to thaw_processes.
2. cpu1 is waiting write lock of tasklist_lock to exec a new thread with
   preempt_disabled and local irq disabled.
3. cpu2 is waiting write lock of tasklist_lock to do_exit with
   preempt_disabled and local irq disabled.
4. cpu3 is waiting write lock of tasklist_lock to do_exit with
   preempt_disabled and local irq disabled.
So introduce a write lock/unlock wrapper specifically for tasklist_lock.
All current tasklist_lock writers use write_lock_irq to take
tasklist_lock and write_unlock_irq to release it, which means that no
writer waits on tasklist_lock with irqs already disabled. So the write
lock/unlock wrapper here follows the current design and uses
local_irq_disable and local_irq_enable directly, and does not take
callers that already have irqs disabled into account.
Use write_trylock in a loop and re-enable irqs between attempts so that
the cpu can respond to interrupts while the lock cannot be taken.

Signed-off-by: Maria Yu <quic_aiquny@quicinc.com>
---
 fs/exec.c                  | 10 +++++-----
 include/linux/sched/task.h | 29 +++++++++++++++++++++++++++++
 kernel/exit.c              | 16 ++++++++--------
 kernel/fork.c              |  6 +++---
 kernel/ptrace.c            | 12 ++++++------
 kernel/sys.c               |  8 ++++----
 security/keys/keyctl.c     |  4 ++--
 7 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 4aa19b24f281..030eef6852eb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1086,7 +1086,7 @@ static int de_thread(struct task_struct *tsk)
 
 		for (;;) {
 			cgroup_threadgroup_change_begin(tsk);
-			write_lock_irq(&tasklist_lock);
+			write_lock_tasklist_lock();
 			/*
 			 * Do this under tasklist_lock to ensure that
 			 * exit_notify() can't miss ->group_exec_task
@@ -1095,7 +1095,7 @@ static int de_thread(struct task_struct *tsk)
 			if (likely(leader->exit_state))
 				break;
 			__set_current_state(TASK_KILLABLE);
-			write_unlock_irq(&tasklist_lock);
+			write_unlock_tasklist_lock();
 			cgroup_threadgroup_change_end(tsk);
 			schedule();
 			if (__fatal_signal_pending(tsk))
@@ -1150,7 +1150,7 @@ static int de_thread(struct task_struct *tsk)
 		 */
 		if (unlikely(leader->ptrace))
 			__wake_up_parent(leader, leader->parent);
-		write_unlock_irq(&tasklist_lock);
+		write_unlock_tasklist_lock();
 		cgroup_threadgroup_change_end(tsk);
 
 		release_task(leader);
@@ -1198,13 +1198,13 @@ static int unshare_sighand(struct task_struct *me)
 
 		refcount_set(&newsighand->count, 1);
 
-		write_lock_irq(&tasklist_lock);
+		write_lock_tasklist_lock();
 		spin_lock(&oldsighand->siglock);
 		memcpy(newsighand->action, oldsighand->action,
 		       sizeof(newsighand->action));
 		rcu_assign_pointer(me->sighand, newsighand);
 		spin_unlock(&oldsighand->siglock);
-		write_unlock_irq(&tasklist_lock);
+		write_unlock_tasklist_lock();
 
 		__cleanup_sighand(oldsighand);
 	}
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a23af225c898..6f69d9a3c868 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -50,6 +50,35 @@ struct kernel_clone_args {
  * a separate lock).
  */
 extern rwlock_t tasklist_lock;
+
+/*
+ * tasklist_lock is a special lock: tasklist_lock readers can take a
+ * long time to finish, and the plain write_lock_irq api does
+ * local_irq_disable first and then leaves the current cpu spinning
+ * for the lock while it is non-responsive to interrupts.
+ *
+ * All current tasklist_lock writers use write_lock_irq to take
+ * tasklist_lock and write_unlock_irq to release it, which means
+ * the writers never wait on tasklist_lock with irqs already
+ * disabled. So the write lock/unlock wrappers here follow the
+ * current design and use local_irq_disable and local_irq_enable
+ * directly.
+ */
+static inline void write_lock_tasklist_lock(void)
+{
+	while (1) {
+		local_irq_disable();
+		if (write_trylock(&tasklist_lock))
+			break;
+		local_irq_enable();
+		cpu_relax();
+	}
+}
+static inline void write_unlock_tasklist_lock(void)
+{
+	write_unlock_irq(&tasklist_lock);
+}
+
 extern spinlock_t mmlist_lock;
 
 extern union thread_union init_thread_union;
diff --git a/kernel/exit.c b/kernel/exit.c
index ee9f43bed49a..18b00f477079 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -251,7 +251,7 @@ void release_task(struct task_struct *p)
 
 	cgroup_release(p);
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	ptrace_release_task(p);
 	thread_pid = get_pid(p->thread_pid);
 	__exit_signal(p);
@@ -275,7 +275,7 @@ void release_task(struct task_struct *p)
 			leader->exit_state = EXIT_DEAD;
 	}
 
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 	seccomp_filter_release(p);
 	proc_flush_pid(thread_pid);
 	put_pid(thread_pid);
@@ -598,7 +598,7 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
 		return reaper;
 	}
 
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 
 	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
 		list_del_init(&p->ptrace_entry);
@@ -606,7 +606,7 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
 	}
 
 	zap_pid_ns_processes(pid_ns);
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 
 	return father;
 }
@@ -730,7 +730,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	struct task_struct *p, *n;
 	LIST_HEAD(dead);
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	forget_original_parent(tsk, &dead);
 
 	if (group_dead)
@@ -758,7 +758,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	/* mt-exec, de_thread() is waiting for group leader */
 	if (unlikely(tsk->signal->notify_count < 0))
 		wake_up_process(tsk->signal->group_exec_task);
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 
 	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
 		list_del_init(&p->ptrace_entry);
@@ -1172,7 +1172,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	wo->wo_stat = status;
 
 	if (state == EXIT_TRACE) {
-		write_lock_irq(&tasklist_lock);
+		write_lock_tasklist_lock();
 		/* We dropped tasklist, ptracer could die and untrace */
 		ptrace_unlink(p);
 
@@ -1181,7 +1181,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		if (do_notify_parent(p, p->exit_signal))
 			state = EXIT_DEAD;
 		p->exit_state = state;
-		write_unlock_irq(&tasklist_lock);
+		write_unlock_tasklist_lock();
 	}
 	if (state == EXIT_DEAD)
 		release_task(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..06c4b4ab9102 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2623,7 +2623,7 @@ __latent_entropy struct task_struct *copy_process(
 	 * Make it visible to the rest of the system, but dont wake it up yet.
 	 * Need tasklist lock for parent etc handling!
 	 */
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
@@ -2714,7 +2714,7 @@ __latent_entropy struct task_struct *copy_process(
 	hlist_del_init(&delayed.node);
 	spin_unlock(&current->sighand->siglock);
 	syscall_tracepoint_update(p);
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 
 	if (pidfile)
 		fd_install(pidfd, pidfile);
@@ -2735,7 +2735,7 @@ __latent_entropy struct task_struct *copy_process(
 bad_fork_cancel_cgroup:
 	sched_core_free(p);
 	spin_unlock(&current->sighand->siglock);
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 	cgroup_cancel_fork(p, args);
 bad_fork_put_pidfd:
 	if (clone_flags & CLONE_PIDFD) {
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d8b5e13a2229..a8d7e2d06f3e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -435,7 +435,7 @@ static int ptrace_attach(struct task_struct *task, long request,
 	if (retval)
 		goto unlock_creds;
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	retval = -EPERM;
 	if (unlikely(task->exit_state))
 		goto unlock_tasklist;
@@ -479,7 +479,7 @@ static int ptrace_attach(struct task_struct *task, long request,
 
 	retval = 0;
 unlock_tasklist:
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 unlock_creds:
 	mutex_unlock(&task->signal->cred_guard_mutex);
 out:
@@ -508,7 +508,7 @@ static int ptrace_traceme(void)
 {
 	int ret = -EPERM;
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	/* Are we already being traced? */
 	if (!current->ptrace) {
 		ret = security_ptrace_traceme(current->parent);
@@ -522,7 +522,7 @@ static int ptrace_traceme(void)
 			ptrace_link(current, current->real_parent);
 		}
 	}
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 
 	return ret;
 }
@@ -588,7 +588,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
 	/* Architecture-specific hardware disable .. */
 	ptrace_disable(child);
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	/*
 	 * We rely on ptrace_freeze_traced(). It can't be killed and
 	 * untraced by another thread, it can't be a zombie.
@@ -600,7 +600,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
 	 */
 	child->exit_code = data;
 	__ptrace_detach(current, child);
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 
 	proc_ptrace_connector(child, PTRACE_DETACH);
 
diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d..0b1647d3ed32 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1088,7 +1088,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 	/* From this point forward we keep holding onto the tasklist lock
 	 * so that our parent does not change from under us. -DaveM
 	 */
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 
 	err = -ESRCH;
 	p = find_task_by_vpid(pid);
@@ -1136,7 +1136,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 	err = 0;
 out:
 	/* All paths lead to here, thus we are safe. -DaveM */
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 	rcu_read_unlock();
 	return err;
 }
@@ -1229,7 +1229,7 @@ int ksys_setsid(void)
 	pid_t session = pid_vnr(sid);
 	int err = -EPERM;
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	/* Fail if I am already a session leader */
 	if (group_leader->signal->leader)
 		goto out;
@@ -1247,7 +1247,7 @@ int ksys_setsid(void)
 
 	err = session;
 out:
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 	if (err > 0) {
 		proc_sid_connector(group_leader);
 		sched_autogroup_create_attach(group_leader);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 19be69fa4d05..dd8aed20486a 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1652,7 +1652,7 @@ long keyctl_session_to_parent(void)
 
 	me = current;
 	rcu_read_lock();
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 
 	ret = -EPERM;
 	oldwork = NULL;
@@ -1702,7 +1702,7 @@ long keyctl_session_to_parent(void)
 	if (!ret)
 		newwork = NULL;
 unlock:
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 	rcu_read_unlock();
 	if (oldwork)
 		put_cred(container_of(oldwork, struct cred, rcu));

base-commit: 88035e5694a86a7167d490bb95e9df97a9bb162b
-- 
2.17.1



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2023-12-25  8:19 [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock Maria Yu
@ 2023-12-25  8:26 ` Aiqun Yu (Maria)
  2024-01-03 14:04 ` Jarkko Sakkinen
  1 sibling, 0 replies; 13+ messages in thread
From: Aiqun Yu (Maria) @ 2023-12-25  8:26 UTC (permalink / raw)
  To: ebiederm
  Cc: kernel, quic_pkondeti, keescook, viro, brauner, oleg, dhowells,
	jarkko, paul, jmorris, serge, linux-mm, linux-fsdevel,
	linux-kernel, keyrings, linux-security-module, linux-arm-msm



On 12/25/2023 4:19 PM, Maria Yu wrote:
> As a rwlock for tasklist_lock, there are multiple scenarios to acquire
> read lock which write lock needed to be waiting for.
> In freeze_process/thaw_processes it can take about 200+ms for holding read
> lock of tasklist_lock by walking and freezing/thawing tasks in commercial
> devices. And write_lock_irq will have preempt disabled and local irq
> disabled to spin until the tasklist_lock can be acquired. This leading to
> a bad responsive performance of current system.
> Take an example:
> 1. cpu0 is holding read lock of tasklist_lock to thaw_processes.
> 2. cpu1 is waiting write lock of tasklist_lock to exec a new thread with
>     preempt_disabled and local irq disabled.
> 3. cpu2 is waiting write lock of tasklist_lock to do_exit with
>     preempt_disabled and local irq disabled.
> 4. cpu3 is waiting write lock of tasklist_lock to do_exit with
>     preempt_disabled and local irq disabled.
> So introduce a write lock/unlock wrapper for tasklist_lock specificly.
> The current taskslist_lock writers all have write_lock_irq to hold
> tasklist_lock, and write_unlock_irq to release tasklist_lock, that means
> the writers are not suitable or workable to wait on tasklist_lock in irq
> disabled scenarios. So the write lock/unlock wrapper here only follow the
> current design of directly use local_irq_disable and local_irq_enable,
> and not take already irq disabled writer callers into account.
> Use write_trylock in the loop and enabled irq for cpu to repsond if lock
> cannot be taken.
Please ignore this patch.
The change is not ready for review.
It was re-sent by mistake.
> 
> Signed-off-by: Maria Yu <quic_aiquny@quicinc.com>
> ---
>   fs/exec.c                  | 10 +++++-----
>   include/linux/sched/task.h | 29 +++++++++++++++++++++++++++++
>   kernel/exit.c              | 16 ++++++++--------
>   kernel/fork.c              |  6 +++---
>   kernel/ptrace.c            | 12 ++++++------
>   kernel/sys.c               |  8 ++++----
>   security/keys/keyctl.c     |  4 ++--
>   7 files changed, 57 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/exec.c b/fs/exec.c
> index 4aa19b24f281..030eef6852eb 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1086,7 +1086,7 @@ static int de_thread(struct task_struct *tsk)
>   
>   		for (;;) {
>   			cgroup_threadgroup_change_begin(tsk);
> -			write_lock_irq(&tasklist_lock);
> +			write_lock_tasklist_lock();
>   			/*
>   			 * Do this under tasklist_lock to ensure that
>   			 * exit_notify() can't miss ->group_exec_task
> @@ -1095,7 +1095,7 @@ static int de_thread(struct task_struct *tsk)
>   			if (likely(leader->exit_state))
>   				break;
>   			__set_current_state(TASK_KILLABLE);
> -			write_unlock_irq(&tasklist_lock);
> +			write_unlock_tasklist_lock();
>   			cgroup_threadgroup_change_end(tsk);
>   			schedule();
>   			if (__fatal_signal_pending(tsk))
> @@ -1150,7 +1150,7 @@ static int de_thread(struct task_struct *tsk)
>   		 */
>   		if (unlikely(leader->ptrace))
>   			__wake_up_parent(leader, leader->parent);
> -		write_unlock_irq(&tasklist_lock);
> +		write_unlock_tasklist_lock();
>   		cgroup_threadgroup_change_end(tsk);
>   
>   		release_task(leader);
> @@ -1198,13 +1198,13 @@ static int unshare_sighand(struct task_struct *me)
>   
>   		refcount_set(&newsighand->count, 1);
>   
> -		write_lock_irq(&tasklist_lock);
> +		write_lock_tasklist_lock();
>   		spin_lock(&oldsighand->siglock);
>   		memcpy(newsighand->action, oldsighand->action,
>   		       sizeof(newsighand->action));
>   		rcu_assign_pointer(me->sighand, newsighand);
>   		spin_unlock(&oldsighand->siglock);
> -		write_unlock_irq(&tasklist_lock);
> +		write_unlock_tasklist_lock();
>   
>   		__cleanup_sighand(oldsighand);
>   	}
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index a23af225c898..6f69d9a3c868 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -50,6 +50,35 @@ struct kernel_clone_args {
>    * a separate lock).
>    */
>   extern rwlock_t tasklist_lock;
> +
> +/*
> + * Tasklist_lock is a special lock, it takes a good amount of time of
> + * taskslist_lock readers to finish, and the pure write_irq_lock api
> + * will do local_irq_disable at the very first, and put the current cpu
> + * waiting for the lock while is non-responsive for interrupts.
> + *
> + * The current taskslist_lock writers all have write_lock_irq to hold
> + * tasklist_lock, and write_unlock_irq to release tasklist_lock, that
> + * means the writers are not suitable or workable to wait on
> + * tasklist_lock in irq disabled scenarios. So the write lock/unlock
> + * wrapper here only follow the current design of directly use
> + * local_irq_disable and local_irq_enable.
> + */
> +static inline void write_lock_tasklist_lock(void)
> +{
> +	while (1) {
> +		local_irq_disable();
> +		if (write_trylock(&tasklist_lock))
> +			break;
> +		local_irq_enable();
> +		cpu_relax();
> +	}
> +}
> +static inline void write_unlock_tasklist_lock(void)
> +{
> +	write_unlock_irq(&tasklist_lock);
> +}
> +
>   extern spinlock_t mmlist_lock;
>   
>   extern union thread_union init_thread_union;
> diff --git a/kernel/exit.c b/kernel/exit.c
> index ee9f43bed49a..18b00f477079 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -251,7 +251,7 @@ void release_task(struct task_struct *p)
>   
>   	cgroup_release(p);
>   
> -	write_lock_irq(&tasklist_lock);
> +	write_lock_tasklist_lock();
>   	ptrace_release_task(p);
>   	thread_pid = get_pid(p->thread_pid);
>   	__exit_signal(p);
> @@ -275,7 +275,7 @@ void release_task(struct task_struct *p)
>   			leader->exit_state = EXIT_DEAD;
>   	}
>   
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   	seccomp_filter_release(p);
>   	proc_flush_pid(thread_pid);
>   	put_pid(thread_pid);
> @@ -598,7 +598,7 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
>   		return reaper;
>   	}
>   
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   
>   	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
>   		list_del_init(&p->ptrace_entry);
> @@ -606,7 +606,7 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
>   	}
>   
>   	zap_pid_ns_processes(pid_ns);
> -	write_lock_irq(&tasklist_lock);
> +	write_lock_tasklist_lock();
>   
>   	return father;
>   }
> @@ -730,7 +730,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
>   	struct task_struct *p, *n;
>   	LIST_HEAD(dead);
>   
> -	write_lock_irq(&tasklist_lock);
> +	write_lock_tasklist_lock();
>   	forget_original_parent(tsk, &dead);
>   
>   	if (group_dead)
> @@ -758,7 +758,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
>   	/* mt-exec, de_thread() is waiting for group leader */
>   	if (unlikely(tsk->signal->notify_count < 0))
>   		wake_up_process(tsk->signal->group_exec_task);
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   
>   	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
>   		list_del_init(&p->ptrace_entry);
> @@ -1172,7 +1172,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
>   	wo->wo_stat = status;
>   
>   	if (state == EXIT_TRACE) {
> -		write_lock_irq(&tasklist_lock);
> +		write_lock_tasklist_lock();
>   		/* We dropped tasklist, ptracer could die and untrace */
>   		ptrace_unlink(p);
>   
> @@ -1181,7 +1181,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
>   		if (do_notify_parent(p, p->exit_signal))
>   			state = EXIT_DEAD;
>   		p->exit_state = state;
> -		write_unlock_irq(&tasklist_lock);
> +		write_unlock_tasklist_lock();
>   	}
>   	if (state == EXIT_DEAD)
>   		release_task(p);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 10917c3e1f03..06c4b4ab9102 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2623,7 +2623,7 @@ __latent_entropy struct task_struct *copy_process(
>   	 * Make it visible to the rest of the system, but dont wake it up yet.
>   	 * Need tasklist lock for parent etc handling!
>   	 */
> -	write_lock_irq(&tasklist_lock);
> +	write_lock_tasklist_lock();
>   
>   	/* CLONE_PARENT re-uses the old parent */
>   	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
> @@ -2714,7 +2714,7 @@ __latent_entropy struct task_struct *copy_process(
>   	hlist_del_init(&delayed.node);
>   	spin_unlock(&current->sighand->siglock);
>   	syscall_tracepoint_update(p);
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   
>   	if (pidfile)
>   		fd_install(pidfd, pidfile);
> @@ -2735,7 +2735,7 @@ __latent_entropy struct task_struct *copy_process(
>   bad_fork_cancel_cgroup:
>   	sched_core_free(p);
>   	spin_unlock(&current->sighand->siglock);
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   	cgroup_cancel_fork(p, args);
>   bad_fork_put_pidfd:
>   	if (clone_flags & CLONE_PIDFD) {
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index d8b5e13a2229..a8d7e2d06f3e 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -435,7 +435,7 @@ static int ptrace_attach(struct task_struct *task, long request,
>   	if (retval)
>   		goto unlock_creds;
>   
> -	write_lock_irq(&tasklist_lock);
> +	write_lock_tasklist_lock();
>   	retval = -EPERM;
>   	if (unlikely(task->exit_state))
>   		goto unlock_tasklist;
> @@ -479,7 +479,7 @@ static int ptrace_attach(struct task_struct *task, long request,
>   
>   	retval = 0;
>   unlock_tasklist:
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   unlock_creds:
>   	mutex_unlock(&task->signal->cred_guard_mutex);
>   out:
> @@ -508,7 +508,7 @@ static int ptrace_traceme(void)
>   {
>   	int ret = -EPERM;
>   
> -	write_lock_irq(&tasklist_lock);
> +	write_lock_tasklist_lock();
>   	/* Are we already being traced? */
>   	if (!current->ptrace) {
>   		ret = security_ptrace_traceme(current->parent);
> @@ -522,7 +522,7 @@ static int ptrace_traceme(void)
>   			ptrace_link(current, current->real_parent);
>   		}
>   	}
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   
>   	return ret;
>   }
> @@ -588,7 +588,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
>   	/* Architecture-specific hardware disable .. */
>   	ptrace_disable(child);
>   
> -	write_lock_irq(&tasklist_lock);
> +	write_lock_tasklist_lock();
>   	/*
>   	 * We rely on ptrace_freeze_traced(). It can't be killed and
>   	 * untraced by another thread, it can't be a zombie.
> @@ -600,7 +600,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
>   	 */
>   	child->exit_code = data;
>   	__ptrace_detach(current, child);
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   
>   	proc_ptrace_connector(child, PTRACE_DETACH);
>   
> diff --git a/kernel/sys.c b/kernel/sys.c
> index e219fcfa112d..0b1647d3ed32 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1088,7 +1088,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
>   	/* From this point forward we keep holding onto the tasklist lock
>   	 * so that our parent does not change from under us. -DaveM
>   	 */
> -	write_lock_irq(&tasklist_lock);
> +	write_lock_tasklist_lock();
>   
>   	err = -ESRCH;
>   	p = find_task_by_vpid(pid);
> @@ -1136,7 +1136,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
>   	err = 0;
>   out:
>   	/* All paths lead to here, thus we are safe. -DaveM */
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   	rcu_read_unlock();
>   	return err;
>   }
> @@ -1229,7 +1229,7 @@ int ksys_setsid(void)
>   	pid_t session = pid_vnr(sid);
>   	int err = -EPERM;
>   
> -	write_lock_irq(&tasklist_lock);
> +	write_lock_tasklist_lock();
>   	/* Fail if I am already a session leader */
>   	if (group_leader->signal->leader)
>   		goto out;
> @@ -1247,7 +1247,7 @@ int ksys_setsid(void)
>   
>   	err = session;
>   out:
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   	if (err > 0) {
>   		proc_sid_connector(group_leader);
>   		sched_autogroup_create_attach(group_leader);
> diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
> index 19be69fa4d05..dd8aed20486a 100644
> --- a/security/keys/keyctl.c
> +++ b/security/keys/keyctl.c
> @@ -1652,7 +1652,7 @@ long keyctl_session_to_parent(void)
>   
>   	me = current;
>   	rcu_read_lock();
> -	write_lock_irq(&tasklist_lock);
> +	write_lock_tasklist_lock();
>   
>   	ret = -EPERM;
>   	oldwork = NULL;
> @@ -1702,7 +1702,7 @@ long keyctl_session_to_parent(void)
>   	if (!ret)
>   		newwork = NULL;
>   unlock:
> -	write_unlock_irq(&tasklist_lock);
> +	write_unlock_tasklist_lock();
>   	rcu_read_unlock();
>   	if (oldwork)
>   		put_cred(container_of(oldwork, struct cred, rcu));
> 
> base-commit: 88035e5694a86a7167d490bb95e9df97a9bb162b

-- 
Thx and BRs,
Aiqun(Maria) Yu


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2023-12-25  8:19 [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock Maria Yu
  2023-12-25  8:26 ` Aiqun Yu (Maria)
@ 2024-01-03 14:04 ` Jarkko Sakkinen
  1 sibling, 0 replies; 13+ messages in thread
From: Jarkko Sakkinen @ 2024-01-03 14:04 UTC (permalink / raw)
  To: Maria Yu, ebiederm
  Cc: kernel, quic_pkondeti, keescook, viro, brauner, oleg, dhowells,
	paul, jmorris, serge, linux-mm, linux-fsdevel, linux-kernel,
	keyrings, linux-security-module, linux-arm-msm

On Mon Dec 25, 2023 at 10:19 AM EET, Maria Yu wrote:
> As a rwlock for tasklist_lock, there are multiple scenarios to acquire
> read lock which write lock needed to be waiting for.
> In freeze_process/thaw_processes it can take about 200+ms for holding read
> lock of tasklist_lock by walking and freezing/thawing tasks in commercial
> devices. And write_lock_irq will have preempt disabled and local irq
> disabled to spin until the tasklist_lock can be acquired. This leading to
> a bad responsive performance of current system.
> Take an example:
> 1. cpu0 is holding read lock of tasklist_lock to thaw_processes.
> 2. cpu1 is waiting write lock of tasklist_lock to exec a new thread with
>    preempt_disabled and local irq disabled.
> 3. cpu2 is waiting write lock of tasklist_lock to do_exit with
>    preempt_disabled and local irq disabled.
> 4. cpu3 is waiting write lock of tasklist_lock to do_exit with
>    preempt_disabled and local irq disabled.
> So introduce a write lock/unlock wrapper for tasklist_lock specificly.
> The current taskslist_lock writers all have write_lock_irq to hold
> tasklist_lock, and write_unlock_irq to release tasklist_lock, that means
> the writers are not suitable or workable to wait on tasklist_lock in irq
> disabled scenarios. So the write lock/unlock wrapper here only follow the
> current design of directly use local_irq_disable and local_irq_enable,
> and not take already irq disabled writer callers into account.
> Use write_trylock in the loop and enabled irq for cpu to repsond if lock
> cannot be taken.
>
> Signed-off-by: Maria Yu <quic_aiquny@quicinc.com>
> ---
>  fs/exec.c                  | 10 +++++-----
>  include/linux/sched/task.h | 29 +++++++++++++++++++++++++++++
>  kernel/exit.c              | 16 ++++++++--------
>  kernel/fork.c              |  6 +++---
>  kernel/ptrace.c            | 12 ++++++------
>  kernel/sys.c               |  8 ++++----
>  security/keys/keyctl.c     |  4 ++--
>  7 files changed, 57 insertions(+), 28 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 4aa19b24f281..030eef6852eb 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1086,7 +1086,7 @@ static int de_thread(struct task_struct *tsk)
>  
>  		for (;;) {
>  			cgroup_threadgroup_change_begin(tsk);
> -			write_lock_irq(&tasklist_lock);
> +			write_lock_tasklist_lock();
>  			/*
>  			 * Do this under tasklist_lock to ensure that
>  			 * exit_notify() can't miss ->group_exec_task
> @@ -1095,7 +1095,7 @@ static int de_thread(struct task_struct *tsk)
>  			if (likely(leader->exit_state))
>  				break;
>  			__set_current_state(TASK_KILLABLE);
> -			write_unlock_irq(&tasklist_lock);
> +			write_unlock_tasklist_lock();
>  			cgroup_threadgroup_change_end(tsk);
>  			schedule();
>  			if (__fatal_signal_pending(tsk))
> @@ -1150,7 +1150,7 @@ static int de_thread(struct task_struct *tsk)
>  		 */
>  		if (unlikely(leader->ptrace))
>  			__wake_up_parent(leader, leader->parent);
> -		write_unlock_irq(&tasklist_lock);
> +		write_unlock_tasklist_lock();
>  		cgroup_threadgroup_change_end(tsk);
>  
>  		release_task(leader);
> @@ -1198,13 +1198,13 @@ static int unshare_sighand(struct task_struct *me)
>  
>  		refcount_set(&newsighand->count, 1);
>  
> -		write_lock_irq(&tasklist_lock);
> +		write_lock_tasklist_lock();
>  		spin_lock(&oldsighand->siglock);
>  		memcpy(newsighand->action, oldsighand->action,
>  		       sizeof(newsighand->action));
>  		rcu_assign_pointer(me->sighand, newsighand);
>  		spin_unlock(&oldsighand->siglock);
> -		write_unlock_irq(&tasklist_lock);
> +		write_unlock_tasklist_lock();
>  
>  		__cleanup_sighand(oldsighand);
>  	}
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index a23af225c898..6f69d9a3c868 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -50,6 +50,35 @@ struct kernel_clone_args {
>   * a separate lock).
>   */
>  extern rwlock_t tasklist_lock;
> +
> +/*
> + * Tasklist_lock is a special lock, it takes a good amount of time of
> + * taskslist_lock readers to finish, and the pure write_irq_lock api
> + * will do local_irq_disable at the very first, and put the current cpu
> + * waiting for the lock while is non-responsive for interrupts.
> + *
> + * The current taskslist_lock writers all have write_lock_irq to hold
> + * tasklist_lock, and write_unlock_irq to release tasklist_lock, that
> + * means the writers are not suitable or workable to wait on
> + * tasklist_lock in irq disabled scenarios. So the write lock/unlock
> + * wrapper here only follow the current design of directly use
> + * local_irq_disable and local_irq_enable.
> + */
> +static inline void write_lock_tasklist_lock(void)
> +{
> +	while (1) {
> +		local_irq_disable();
> +		if (write_trylock(&tasklist_lock))
> +			break;
> +		local_irq_enable();
> +		cpu_relax();
> +	}

Maybe:

	local_irq_disable();
	while (!write_trylock(&tasklist_lock)) {
		local_irq_enable();
		cpu_relax();
		local_irq_disable();
	}

BR, Jarkko


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2024-01-03 18:18             ` Matthew Wilcox
@ 2024-01-04  0:46               ` Aiqun Yu (Maria)
  0 siblings, 0 replies; 13+ messages in thread
From: Aiqun Yu (Maria) @ 2024-01-04  0:46 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Eric W. Biederman, Hillf Danton, kernel, quic_pkondeti, keescook,
	viro, brauner, oleg, dhowells, jarkko, paul, jmorris, serge,
	linux-mm, linux-fsdevel, linux-kernel, keyrings,
	linux-security-module, linux-arm-msm



On 1/4/2024 2:18 AM, Matthew Wilcox wrote:
> On Wed, Jan 03, 2024 at 10:58:33AM +0800, Aiqun Yu (Maria) wrote:
>> On 1/2/2024 5:14 PM, Matthew Wilcox wrote:
>>>>> -void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
>>>>> +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock, bool irq)
>>>>>     {
>>>>>     	int cnts;
>>>>> @@ -82,7 +83,11 @@ void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
>>>> Also a new state showed up after the current design:
>>>> 1. locked flag with _QW_WAITING, while irq enabled.
>>>> 2. And this state will be only in interrupt context.
>>>> 3. lock->wait_lock is hold by the write waiter.
>>>> So per my understanding, a different behavior also needed to be done in
>>>> queued_write_lock_slowpath:
>>>>     when (unlikely(in_interrupt())) , get the lock directly.
>>>
>>> I don't think so.  Remember that write_lock_irq() can only be called in
>>> process context, and when interrupts are enabled.
>> In current kernel drivers, I can see same lock called with write_lock_irq
>> and write_lock_irqsave in different drivers.
>>
>> And this is the scenario I am talking about:
>> 1. cpu0 have task run and called write_lock_irq.(Not in interrupt context)
>> 2. cpu0 hold the lock->wait_lock and re-enabled the interrupt.
> 
> Oh, I missed that it was holding the wait_lock.  Yes, we also need to
> release the wait_lock before spinning with interrupts disabled.
> 
>> I was thinking to support both write_lock_irq and write_lock_irqsave with
>> interrupt enabled together in queued_write_lock_slowpath.
>>
>> That's why I am suggesting in write_lock_irqsave when (in_interrupt()),
>> instead spin for the lock->wait_lock, spin to get the lock->cnts directly.
> 
> Mmm, but the interrupt could come in on a different CPU and that would
> lead to it stealing the wait_lock from the CPU which is merely waiting
> for the readers to go away.
That's right.
The fairness(or queue mechanism) wouldn't be ensured (only in interrupt 
context) if we have the special design when (in_interrupt()) spin to get 
the lock->cnts directly. When in interrupt context, the later 
write_lock_irqsave may get the lock earlier than the write_lock_irq() 
which is not in interrupt context.

This is a side effect of the design, although the read lock already has a 
similar unfairness. I think it is reasonable to let in_interrupt() 
waiters get the lock earlier, from the point of view of whole-system performance.
> 

-- 
Thx and BRs,
Aiqun(Maria) Yu


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2024-01-03  2:58           ` Aiqun Yu (Maria)
@ 2024-01-03 18:18             ` Matthew Wilcox
  2024-01-04  0:46               ` Aiqun Yu (Maria)
  0 siblings, 1 reply; 13+ messages in thread
From: Matthew Wilcox @ 2024-01-03 18:18 UTC (permalink / raw)
  To: Aiqun Yu (Maria)
  Cc: Eric W. Biederman, Hillf Danton, kernel, quic_pkondeti, keescook,
	viro, brauner, oleg, dhowells, jarkko, paul, jmorris, serge,
	linux-mm, linux-fsdevel, linux-kernel, keyrings,
	linux-security-module, linux-arm-msm

On Wed, Jan 03, 2024 at 10:58:33AM +0800, Aiqun Yu (Maria) wrote:
> On 1/2/2024 5:14 PM, Matthew Wilcox wrote:
> > > > -void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
> > > > +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock, bool irq)
> > > >    {
> > > >    	int cnts;
> > > > @@ -82,7 +83,11 @@ void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
> > > Also a new state showed up after the current design:
> > > 1. locked flag with _QW_WAITING, while irq enabled.
> > > 2. And this state will be only in interrupt context.
> > > 3. lock->wait_lock is hold by the write waiter.
> > > So per my understanding, a different behavior also needed to be done in
> > > queued_write_lock_slowpath:
> > >    when (unlikely(in_interrupt())) , get the lock directly.
> > 
> > I don't think so.  Remember that write_lock_irq() can only be called in
> > process context, and when interrupts are enabled.
> In current kernel drivers, I can see same lock called with write_lock_irq
> and write_lock_irqsave in different drivers.
> 
> And this is the scenario I am talking about:
> 1. cpu0 have task run and called write_lock_irq.(Not in interrupt context)
> 2. cpu0 hold the lock->wait_lock and re-enabled the interrupt.

Oh, I missed that it was holding the wait_lock.  Yes, we also need to
release the wait_lock before spinning with interrupts disabled.

> I was thinking to support both write_lock_irq and write_lock_irqsave with
> interrupt enabled together in queued_write_lock_slowpath.
> 
> That's why I am suggesting in write_lock_irqsave when (in_interrupt()),
> instead spin for the lock->wait_lock, spin to get the lock->cnts directly.

Mmm, but the interrupt could come in on a different CPU and that would
lead to it stealing the wait_lock from the CPU which is merely waiting
for the readers to go away.



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2024-01-02  9:14         ` Matthew Wilcox
@ 2024-01-03  2:58           ` Aiqun Yu (Maria)
  2024-01-03 18:18             ` Matthew Wilcox
  0 siblings, 1 reply; 13+ messages in thread
From: Aiqun Yu (Maria) @ 2024-01-03  2:58 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Eric W. Biederman, Hillf Danton, kernel, quic_pkondeti, keescook,
	viro, brauner, oleg, dhowells, jarkko, paul, jmorris, serge,
	linux-mm, linux-fsdevel, linux-kernel, keyrings,
	linux-security-module, linux-arm-msm



On 1/2/2024 5:14 PM, Matthew Wilcox wrote:
> On Tue, Jan 02, 2024 at 10:19:47AM +0800, Aiqun Yu (Maria) wrote:
>> On 12/29/2023 6:20 AM, Matthew Wilcox wrote:
>>> On Wed, Dec 13, 2023 at 12:27:05PM -0600, Eric W. Biederman wrote:
>>>> Matthew Wilcox <willy@infradead.org> writes:
>>>>> I think the right way to fix this is to pass a boolean flag to
>>>>> queued_write_lock_slowpath() to let it know whether it can re-enable
>>>>> interrupts while checking whether _QW_WAITING is set.
>>>>
>>>> Yes.  It seems to make sense to distinguish between write_lock_irq and
>>>> write_lock_irqsave and fix this for all of write_lock_irq.
>>>
>>> I wasn't planning on doing anything here, but Hillf kind of pushed me into
>>> it.  I think it needs to be something like this.  Compile tested only.
>>> If it ends up getting used,
>> Happy new year!
> 
> Thank you!  I know your new year is a few weeks away still ;-)
Yeah, Chinese new year will come about 5 weeks later. :)
> 
>>> -void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
>>> +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock, bool irq)
>>>    {
>>>    	int cnts;
>>> @@ -82,7 +83,11 @@ void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
>> Also a new state showed up after the current design:
>> 1. locked flag with _QW_WAITING, while irq enabled.
>> 2. And this state will be only in interrupt context.
>> 3. lock->wait_lock is hold by the write waiter.
>> So per my understanding, a different behavior also needed to be done in
>> queued_write_lock_slowpath:
>>    when (unlikely(in_interrupt())) , get the lock directly.
> 
> I don't think so.  Remember that write_lock_irq() can only be called in
> process context, and when interrupts are enabled.
In current kernel drivers, I can see same lock called with 
write_lock_irq and write_lock_irqsave in different drivers.

And this is the scenario I am talking about:
1. cpu0 have task run and called write_lock_irq.(Not in interrupt context)
2. cpu0 hold the lock->wait_lock and re-enabled the interrupt.
* this is the new state with _QW_WAITING set, lock->wait_lock locked, 
interrupt enabled. *
3. cpu0 in-interrupt context and want to do write_lock_irqsave.
4. cpu0 tried to acquire lock->wait_lock again.

I was thinking to support both write_lock_irq and write_lock_irqsave 
with interrupt enabled together in queued_write_lock_slowpath.

That's why I am suggesting in write_lock_irqsave when (in_interrupt()), 
instead spin for the lock->wait_lock, spin to get the lock->cnts directly.
> 
>> So needed to be done in release path. This is to address Hillf's concern on
>> possibility of deadlock.
> 
> Hillf's concern is invalid.
> 
>>>    	/* When no more readers or writers, set the locked flag */
>>>    	do {
>>> +		if (irq)
>>> +			local_irq_enable();
>> I think write_lock_irqsave also needs to be taken into account. So
>> local_irq_save(flags) should be taken into account here.
> 
> If we did want to support the same kind of spinning with interrupts
> enabled for write_lock_irqsave(), we'd want to pass the flags in
> and do local_irq_restore(), but I don't know how we'd support
> write_lock_irq() if we did that -- can we rely on passing in 0 for flags
> meaning "reenable" on all architectures?  And ~0 meaning "don't
> reenable" on all architectures?
What about for all write_lock_irq, pass the real flags from 
local_irq_save(flags) into the queued_write_lock_slowpath?
Arch specific valid flags won't be !0 limited then.
> 
> That all seems complicated, so I didn't do that.
This is complicated. It also needs testing to verify it.
The more careful the design, the better.

Fixed previous wrong email address. ^-^!
> 

-- 
Thx and BRs,
Aiqun(Maria) Yu


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2024-01-02  2:19       ` Aiqun Yu (Maria)
@ 2024-01-02  9:14         ` Matthew Wilcox
  2024-01-03  2:58           ` Aiqun Yu (Maria)
  0 siblings, 1 reply; 13+ messages in thread
From: Matthew Wilcox @ 2024-01-02  9:14 UTC (permalink / raw)
  To: Aiqun Yu (Maria)
  Cc: Eric W. Biederman, Hillf Danton, kernel, quic_pkondeti, keescook,
	viro, brauner, oleg, dhowells, jarkko, paul, jmorris, serge,
	linux-mm, linux-fsdevel, linux-kernel, keyrings,
	linux-security-module, linux-arm-msm

On Tue, Jan 02, 2024 at 10:19:47AM +0800, Aiqun Yu (Maria) wrote:
> On 12/29/2023 6:20 AM, Matthew Wilcox wrote:
> > On Wed, Dec 13, 2023 at 12:27:05PM -0600, Eric W. Biederman wrote:
> > > Matthew Wilcox <willy@infradead.org> writes:
> > > > I think the right way to fix this is to pass a boolean flag to
> > > > queued_write_lock_slowpath() to let it know whether it can re-enable
> > > > interrupts while checking whether _QW_WAITING is set.
> > > 
> > > Yes.  It seems to make sense to distinguish between write_lock_irq and
> > > write_lock_irqsave and fix this for all of write_lock_irq.
> > 
> > I wasn't planning on doing anything here, but Hillf kind of pushed me into
> > it.  I think it needs to be something like this.  Compile tested only.
> > If it ends up getting used,
> Happy new year!

Thank you!  I know your new year is a few weeks away still ;-)

> > -void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
> > +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock, bool irq)
> >   {
> >   	int cnts;
> > @@ -82,7 +83,11 @@ void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
> Also a new state showed up after the current design:
> 1. locked flag with _QW_WAITING, while irq enabled.
> 2. And this state will be only in interrupt context.
> 3. lock->wait_lock is hold by the write waiter.
> So per my understanding, a different behavior also needed to be done in
> queued_write_lock_slowpath:
>   when (unlikely(in_interrupt())) , get the lock directly.

I don't think so.  Remember that write_lock_irq() can only be called in
process context, and when interrupts are enabled.

> So needed to be done in release path. This is to address Hillf's concern on
> possibility of deadlock.

Hillf's concern is invalid.

> >   	/* When no more readers or writers, set the locked flag */
> >   	do {
> > +		if (irq)
> > +			local_irq_enable();
> I think write_lock_irqsave also needs to be taken into account. So
> local_irq_save(flags) should be taken into account here.

If we did want to support the same kind of spinning with interrupts
enabled for write_lock_irqsave(), we'd want to pass the flags in
and do local_irq_restore(), but I don't know how we'd support
write_lock_irq() if we did that -- can we rely on passing in 0 for flags
meaning "reenable" on all architectures?  And ~0 meaning "don't
reenable" on all architectures?

That all seems complicated, so I didn't do that.



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2023-12-28 22:20     ` Matthew Wilcox
@ 2024-01-02  2:19       ` Aiqun Yu (Maria)
  2024-01-02  9:14         ` Matthew Wilcox
  0 siblings, 1 reply; 13+ messages in thread
From: Aiqun Yu (Maria) @ 2024-01-02  2:19 UTC (permalink / raw)
  To: Matthew Wilcox, Eric W. Biederman, Hillf Danton
  Cc: kernel, quic_pkondeti, keescook, viro, brauner, oleg, dhowells,
	jarkko, paul, jmorris, serge, linux-mm, linux-fsdevel,
	linux-kernel, keyrings, linux-security-module, linux-arm-msm



On 12/29/2023 6:20 AM, Matthew Wilcox wrote:
> On Wed, Dec 13, 2023 at 12:27:05PM -0600, Eric W. Biederman wrote:
>> Matthew Wilcox <willy@infradead.org> writes:
>>> I think the right way to fix this is to pass a boolean flag to
>>> queued_write_lock_slowpath() to let it know whether it can re-enable
>>> interrupts while checking whether _QW_WAITING is set.
>>
>> Yes.  It seems to make sense to distinguish between write_lock_irq and
>> write_lock_irqsave and fix this for all of write_lock_irq.
> 
> I wasn't planning on doing anything here, but Hillf kind of pushed me into
> it.  I think it needs to be something like this.  Compile tested only.
> If it ends up getting used,
Happy new year!
Thx Matthew for chiming in on this. I think more thoughts will lead to a 
better design.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> 
> diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
> index 75b8f4601b28..1152e080c719 100644
> --- a/include/asm-generic/qrwlock.h
> +++ b/include/asm-generic/qrwlock.h
> @@ -33,8 +33,8 @@
>   /*
>    * External function declarations
>    */
> -extern void queued_read_lock_slowpath(struct qrwlock *lock);
> -extern void queued_write_lock_slowpath(struct qrwlock *lock);
> +void queued_read_lock_slowpath(struct qrwlock *lock);
> +void queued_write_lock_slowpath(struct qrwlock *lock, bool irq);
>   
>   /**
>    * queued_read_trylock - try to acquire read lock of a queued rwlock
> @@ -98,7 +98,21 @@ static inline void queued_write_lock(struct qrwlock *lock)
>   	if (likely(atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)))
>   		return;
>   
> -	queued_write_lock_slowpath(lock);
> +	queued_write_lock_slowpath(lock, false);
> +}
> +
> +/**
> + * queued_write_lock_irq - acquire write lock of a queued rwlock
> + * @lock : Pointer to queued rwlock structure
> + */
> +static inline void queued_write_lock_irq(struct qrwlock *lock)
> +{
> +	int cnts = 0;
> +	/* Optimize for the unfair lock case where the fair flag is 0. */
> +	if (likely(atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)))
> +		return;
> +
> +	queued_write_lock_slowpath(lock, true);
>   }
>   
>   /**
> @@ -138,6 +152,7 @@ static inline int queued_rwlock_is_contended(struct qrwlock *lock)
>    */
>   #define arch_read_lock(l)		queued_read_lock(l)
>   #define arch_write_lock(l)		queued_write_lock(l)
> +#define arch_write_lock_irq(l)		queued_write_lock_irq(l)
>   #define arch_read_trylock(l)		queued_read_trylock(l)
>   #define arch_write_trylock(l)		queued_write_trylock(l)
>   #define arch_read_unlock(l)		queued_read_unlock(l)
> diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
> index c0ef596f340b..897010b6ba0a 100644
> --- a/include/linux/rwlock.h
> +++ b/include/linux/rwlock.h
> @@ -33,6 +33,7 @@ do {								\
>    extern int do_raw_read_trylock(rwlock_t *lock);
>    extern void do_raw_read_unlock(rwlock_t *lock) __releases(lock);
>    extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock);
> + extern void do_raw_write_lock_irq(rwlock_t *lock) __acquires(lock);
>    extern int do_raw_write_trylock(rwlock_t *lock);
>    extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock);
>   #else
> @@ -40,6 +41,7 @@ do {								\
>   # define do_raw_read_trylock(rwlock)	arch_read_trylock(&(rwlock)->raw_lock)
>   # define do_raw_read_unlock(rwlock)	do {arch_read_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
>   # define do_raw_write_lock(rwlock)	do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0)
> +# define do_raw_write_lock_irq(rwlock)	do {__acquire(lock); arch_write_lock_irq(&(rwlock)->raw_lock); } while (0)
>   # define do_raw_write_trylock(rwlock)	arch_write_trylock(&(rwlock)->raw_lock)
>   # define do_raw_write_unlock(rwlock)	do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
>   #endif
> diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
> index dceb0a59b692..6257976dfb72 100644
> --- a/include/linux/rwlock_api_smp.h
> +++ b/include/linux/rwlock_api_smp.h
> @@ -193,7 +193,7 @@ static inline void __raw_write_lock_irq(rwlock_t *lock)
>   	local_irq_disable();
>   	preempt_disable();
>   	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
> -	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
> +	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock_irq);
>   }
>   
>   static inline void __raw_write_lock_bh(rwlock_t *lock)
> diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
> index d2ef312a8611..6c644a71b01d 100644
> --- a/kernel/locking/qrwlock.c
> +++ b/kernel/locking/qrwlock.c
> @@ -61,9 +61,10 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
>   
>   /**
>    * queued_write_lock_slowpath - acquire write lock of a queued rwlock
> - * @lock : Pointer to queued rwlock structure
> + * @lock: Pointer to queued rwlock structure
> + * @irq: True if we can enable interrupts while spinning
>    */
> -void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
> +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock, bool irq)
>   {
>   	int cnts;
>   
> @@ -82,7 +83,11 @@ void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
>   
Also a new state showed up after the current design:
1. locked flag with _QW_WAITING, while irq enabled.
2. And this state will be only in interrupt context.
3. lock->wait_lock is held by the write waiter.
So per my understanding, a different behavior also needed to be done in 
queued_write_lock_slowpath:
   when (unlikely(in_interrupt())) , get the lock directly.
So needed to be done in release path. This is to address Hillf's concern 
on possibility of deadlock.

Add Hillf here to merge thread. I am going to have a tested patch V2 
accordingly.
Feel free to let me know your thoughts prior on that.
>   	/* When no more readers or writers, set the locked flag */
>   	do {
> +		if (irq)
> +			local_irq_enable();
I think write_lock_irqsave also needs to be taken into account. So 
local_irq_save(flags) should be taken into account here.
>   		cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
> +		if (irq)
> +			local_irq_disable();
ditto.
>   	} while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
>   unlock:
>   	arch_spin_unlock(&lock->wait_lock);
> diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
> index 87b03d2e41db..bf94551d7435 100644
> --- a/kernel/locking/spinlock_debug.c
> +++ b/kernel/locking/spinlock_debug.c
> @@ -212,6 +212,13 @@ void do_raw_write_lock(rwlock_t *lock)
>   	debug_write_lock_after(lock);
>   }
>   
> +void do_raw_write_lock_irq(rwlock_t *lock)
> +{
> +	debug_write_lock_before(lock);
> +	arch_write_lock_irq(&lock->raw_lock);
> +	debug_write_lock_after(lock);
> +}
> +
>   int do_raw_write_trylock(rwlock_t *lock)
>   {
>   	int ret = arch_write_trylock(&lock->raw_lock);

-- 
Thx and BRs,
Aiqun(Maria) Yu


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2023-12-13 18:27   ` Eric W. Biederman
  2023-12-15  5:52     ` Aiqun Yu (Maria)
@ 2023-12-28 22:20     ` Matthew Wilcox
  2024-01-02  2:19       ` Aiqun Yu (Maria)
  1 sibling, 1 reply; 13+ messages in thread
From: Matthew Wilcox @ 2023-12-28 22:20 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Maria Yu, kernel, quic_pkondeti, keescook, viro, brauner, oleg,
	dhowells, jarkko, paul, jmorris, serge, linux-mm, linux-fsdevel,
	linux-kernel, keyrings, linux-security-module, linux-arm-msm

On Wed, Dec 13, 2023 at 12:27:05PM -0600, Eric W. Biederman wrote:
> Matthew Wilcox <willy@infradead.org> writes:
> > I think the right way to fix this is to pass a boolean flag to
> > queued_write_lock_slowpath() to let it know whether it can re-enable
> > interrupts while checking whether _QW_WAITING is set.
> 
> Yes.  It seems to make sense to distinguish between write_lock_irq and
> write_lock_irqsave and fix this for all of write_lock_irq.

I wasn't planning on doing anything here, but Hillf kind of pushed me into
it.  I think it needs to be something like this.  Compile tested only.
If it ends up getting used,

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 75b8f4601b28..1152e080c719 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -33,8 +33,8 @@
 /*
  * External function declarations
  */
-extern void queued_read_lock_slowpath(struct qrwlock *lock);
-extern void queued_write_lock_slowpath(struct qrwlock *lock);
+void queued_read_lock_slowpath(struct qrwlock *lock);
+void queued_write_lock_slowpath(struct qrwlock *lock, bool irq);
 
 /**
  * queued_read_trylock - try to acquire read lock of a queued rwlock
@@ -98,7 +98,21 @@ static inline void queued_write_lock(struct qrwlock *lock)
 	if (likely(atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)))
 		return;
 
-	queued_write_lock_slowpath(lock);
+	queued_write_lock_slowpath(lock, false);
+}
+
+/**
+ * queued_write_lock_irq - acquire write lock of a queued rwlock
+ * @lock : Pointer to queued rwlock structure
+ */
+static inline void queued_write_lock_irq(struct qrwlock *lock)
+{
+	int cnts = 0;
+	/* Optimize for the unfair lock case where the fair flag is 0. */
+	if (likely(atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)))
+		return;
+
+	queued_write_lock_slowpath(lock, true);
 }
 
 /**
@@ -138,6 +152,7 @@ static inline int queued_rwlock_is_contended(struct qrwlock *lock)
  */
 #define arch_read_lock(l)		queued_read_lock(l)
 #define arch_write_lock(l)		queued_write_lock(l)
+#define arch_write_lock_irq(l)		queued_write_lock_irq(l)
 #define arch_read_trylock(l)		queued_read_trylock(l)
 #define arch_write_trylock(l)		queued_write_trylock(l)
 #define arch_read_unlock(l)		queued_read_unlock(l)
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index c0ef596f340b..897010b6ba0a 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -33,6 +33,7 @@ do {								\
  extern int do_raw_read_trylock(rwlock_t *lock);
  extern void do_raw_read_unlock(rwlock_t *lock) __releases(lock);
  extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock);
+ extern void do_raw_write_lock_irq(rwlock_t *lock) __acquires(lock);
  extern int do_raw_write_trylock(rwlock_t *lock);
  extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock);
 #else
@@ -40,6 +41,7 @@ do {								\
 # define do_raw_read_trylock(rwlock)	arch_read_trylock(&(rwlock)->raw_lock)
 # define do_raw_read_unlock(rwlock)	do {arch_read_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
 # define do_raw_write_lock(rwlock)	do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0)
+# define do_raw_write_lock_irq(rwlock)	do {__acquire(lock); arch_write_lock_irq(&(rwlock)->raw_lock); } while (0)
 # define do_raw_write_trylock(rwlock)	arch_write_trylock(&(rwlock)->raw_lock)
 # define do_raw_write_unlock(rwlock)	do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
 #endif
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index dceb0a59b692..6257976dfb72 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -193,7 +193,7 @@ static inline void __raw_write_lock_irq(rwlock_t *lock)
 	local_irq_disable();
 	preempt_disable();
 	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
+	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock_irq);
 }
 
 static inline void __raw_write_lock_bh(rwlock_t *lock)
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index d2ef312a8611..6c644a71b01d 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -61,9 +61,10 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
 
 /**
  * queued_write_lock_slowpath - acquire write lock of a queued rwlock
- * @lock : Pointer to queued rwlock structure
+ * @lock: Pointer to queued rwlock structure
+ * @irq: True if we can enable interrupts while spinning
  */
-void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock, bool irq)
 {
 	int cnts;
 
@@ -82,7 +83,11 @@ void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
 
 	/* When no more readers or writers, set the locked flag */
 	do {
+		if (irq)
+			local_irq_enable();
 		cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
+		if (irq)
+			local_irq_disable();
 	} while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
 unlock:
 	arch_spin_unlock(&lock->wait_lock);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 87b03d2e41db..bf94551d7435 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -212,6 +212,13 @@ void do_raw_write_lock(rwlock_t *lock)
 	debug_write_lock_after(lock);
 }
 
+void do_raw_write_lock_irq(rwlock_t *lock)
+{
+	debug_write_lock_before(lock);
+	arch_write_lock_irq(&lock->raw_lock);
+	debug_write_lock_after(lock);
+}
+
 int do_raw_write_trylock(rwlock_t *lock)
 {
 	int ret = arch_write_trylock(&lock->raw_lock);


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2023-12-13 18:27   ` Eric W. Biederman
@ 2023-12-15  5:52     ` Aiqun Yu (Maria)
  2023-12-28 22:20     ` Matthew Wilcox
  1 sibling, 0 replies; 13+ messages in thread
From: Aiqun Yu (Maria) @ 2023-12-15  5:52 UTC (permalink / raw)
  To: Eric W. Biederman, Matthew Wilcox
  Cc: kernel, quic_pkondeti, keescook, viro, brauner, oleg, dhowells,
	jarkko, paul, jmorris, serge, linux-mm, linux-fsdevel,
	linux-kernel, keyrings, linux-security-module, linux-arm-msm



On 12/14/2023 2:27 AM, Eric W. Biederman wrote:
> Matthew Wilcox <willy@infradead.org> writes:
> 
>> On Wed, Dec 13, 2023 at 06:17:45PM +0800, Maria Yu wrote:
>>> +static inline void write_lock_tasklist_lock(void)
>>> +{
>>> +	while (1) {
>>> +		local_irq_disable();
>>> +		if (write_trylock(&tasklist_lock))
>>> +			break;
>>> +		local_irq_enable();
>>> +		cpu_relax();
>>
>> This is a bad implementation though.  You don't set the _QW_WAITING flag
Any better ideas and suggestions are welcomed. :)
>> so readers don't know that there's a pending writer.  Also, I've seen
>> cpu_relax() pessimise CPU behaviour; putting it into a low-power mode
>> that takes a while to wake up from.
>>
>> I think the right way to fix this is to pass a boolean flag to
>> queued_write_lock_slowpath() to let it know whether it can re-enable
>> interrupts while checking whether _QW_WAITING is set.
> 
> Yes.  It seems to make sense to distinguish between write_lock_irq and
> write_lock_irqsave and fix this for all of write_lock_irq.
> 
Let me think about this.
It seems possible, because there is already a special behavior on the reader 
side: when in interrupt context, a reader directly gets the lock regardless of 
the pending writer.

> Either that or someone can put in the work to start making the
> tasklist_lock go away.
> 
> Eric
> 

-- 
Thx and BRs,
Aiqun(Maria) Yu


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2023-12-13 16:22 ` Matthew Wilcox
@ 2023-12-13 18:27   ` Eric W. Biederman
  2023-12-15  5:52     ` Aiqun Yu (Maria)
  2023-12-28 22:20     ` Matthew Wilcox
  0 siblings, 2 replies; 13+ messages in thread
From: Eric W. Biederman @ 2023-12-13 18:27 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Maria Yu, kernel, quic_pkondeti, keescook, viro, brauner, oleg,
	dhowells, jarkko, paul, jmorris, serge, linux-mm, linux-fsdevel,
	linux-kernel, keyrings, linux-security-module, linux-arm-msm

Matthew Wilcox <willy@infradead.org> writes:

> On Wed, Dec 13, 2023 at 06:17:45PM +0800, Maria Yu wrote:
>> +static inline void write_lock_tasklist_lock(void)
>> +{
>> +	while (1) {
>> +		local_irq_disable();
>> +		if (write_trylock(&tasklist_lock))
>> +			break;
>> +		local_irq_enable();
>> +		cpu_relax();
>
> This is a bad implementation though.  You don't set the _QW_WAITING flag
> so readers don't know that there's a pending writer.  Also, I've seen
> cpu_relax() pessimise CPU behaviour; putting it into a low-power mode
> that takes a while to wake up from.
>
> I think the right way to fix this is to pass a boolean flag to
> queued_write_lock_slowpath() to let it know whether it can re-enable
> interrupts while checking whether _QW_WAITING is set.

Yes.  It seems to make sense to distinguish between write_lock_irq and
write_lock_irqsave and fix this for all of write_lock_irq.

Either that or someone can put in the work to start making the
tasklist_lock go away.

Eric



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
  2023-12-13 10:17 Maria Yu
@ 2023-12-13 16:22 ` Matthew Wilcox
  2023-12-13 18:27   ` Eric W. Biederman
  0 siblings, 1 reply; 13+ messages in thread
From: Matthew Wilcox @ 2023-12-13 16:22 UTC (permalink / raw)
  To: Maria Yu
  Cc: ebiederm, kernel, quic_pkondeti, keescook, viro, brauner, oleg,
	dhowells, jarkko, paul, jmorris, serge, linux-mm, linux-fsdevel,
	linux-kernel, keyrings, linux-security-module, linux-arm-msm

On Wed, Dec 13, 2023 at 06:17:45PM +0800, Maria Yu wrote:
> +static inline void write_lock_tasklist_lock(void)
> +{
> +	while (1) {
> +		local_irq_disable();
> +		if (write_trylock(&tasklist_lock))
> +			break;
> +		local_irq_enable();
> +		cpu_relax();

This is a bad implementation though.  You don't set the _QW_WAITING flag
so readers don't know that there's a pending writer.  Also, I've seen
cpu_relax() pessimise CPU behaviour; putting it into a low-power mode
that takes a while to wake up from.

I think the right way to fix this is to pass a boolean flag to
queued_write_lock_slowpath() to let it know whether it can re-enable
interrupts while checking whether _QW_WAITING is set.



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock
@ 2023-12-13 10:17 Maria Yu
  2023-12-13 16:22 ` Matthew Wilcox
  0 siblings, 1 reply; 13+ messages in thread
From: Maria Yu @ 2023-12-13 10:17 UTC (permalink / raw)
  To: ebiederm
  Cc: Maria Yu, kernel, quic_pkondeti, keescook, viro, brauner, oleg,
	dhowells, jarkko, paul, jmorris, serge, linux-mm, linux-fsdevel,
	linux-kernel, keyrings, linux-security-module, linux-arm-msm

tasklist_lock is a rwlock, and there are multiple scenarios in which the
read lock is held and a writer has to wait for it.
In freeze_processes/thaw_processes the read lock of tasklist_lock can be
held for 200+ms while walking and freezing/thawing tasks on commercial
devices. Meanwhile write_lock_irq spins with preemption and local irqs
disabled until tasklist_lock can be acquired. This leads to poor
responsiveness of the system.
Take an example:
1. cpu0 is holding the read lock of tasklist_lock in thaw_processes.
2. cpu1 is waiting for the write lock of tasklist_lock to exec a new
   thread, with preemption and local irqs disabled.
3. cpu2 is waiting for the write lock of tasklist_lock in do_exit, with
   preemption and local irqs disabled.
4. cpu3 is waiting for the write lock of tasklist_lock in do_exit, with
   preemption and local irqs disabled.
So introduce a write lock/unlock wrapper specifically for tasklist_lock.
All current tasklist_lock writers use write_lock_irq to take
tasklist_lock and write_unlock_irq to release it, which means no writer
waits on tasklist_lock with irqs already disabled. So the write
lock/unlock wrapper here simply follows the current design and uses
local_irq_disable and local_irq_enable directly, and does not take
callers that already have irqs disabled into account.
Use write_trylock in a loop and re-enable irqs between attempts so that
the cpu can respond to interrupts while the lock cannot be taken.

Signed-off-by: Maria Yu <quic_aiquny@quicinc.com>
---
 fs/exec.c                  | 10 +++++-----
 include/linux/sched/task.h | 29 +++++++++++++++++++++++++++++
 kernel/exit.c              | 16 ++++++++--------
 kernel/fork.c              |  6 +++---
 kernel/ptrace.c            | 12 ++++++------
 kernel/sys.c               |  8 ++++----
 security/keys/keyctl.c     |  4 ++--
 7 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 4aa19b24f281..030eef6852eb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1086,7 +1086,7 @@ static int de_thread(struct task_struct *tsk)
 
 		for (;;) {
 			cgroup_threadgroup_change_begin(tsk);
-			write_lock_irq(&tasklist_lock);
+			write_lock_tasklist_lock();
 			/*
 			 * Do this under tasklist_lock to ensure that
 			 * exit_notify() can't miss ->group_exec_task
@@ -1095,7 +1095,7 @@ static int de_thread(struct task_struct *tsk)
 			if (likely(leader->exit_state))
 				break;
 			__set_current_state(TASK_KILLABLE);
-			write_unlock_irq(&tasklist_lock);
+			write_unlock_tasklist_lock();
 			cgroup_threadgroup_change_end(tsk);
 			schedule();
 			if (__fatal_signal_pending(tsk))
@@ -1150,7 +1150,7 @@ static int de_thread(struct task_struct *tsk)
 		 */
 		if (unlikely(leader->ptrace))
 			__wake_up_parent(leader, leader->parent);
-		write_unlock_irq(&tasklist_lock);
+		write_unlock_tasklist_lock();
 		cgroup_threadgroup_change_end(tsk);
 
 		release_task(leader);
@@ -1198,13 +1198,13 @@ static int unshare_sighand(struct task_struct *me)
 
 		refcount_set(&newsighand->count, 1);
 
-		write_lock_irq(&tasklist_lock);
+		write_lock_tasklist_lock();
 		spin_lock(&oldsighand->siglock);
 		memcpy(newsighand->action, oldsighand->action,
 		       sizeof(newsighand->action));
 		rcu_assign_pointer(me->sighand, newsighand);
 		spin_unlock(&oldsighand->siglock);
-		write_unlock_irq(&tasklist_lock);
+		write_unlock_tasklist_lock();
 
 		__cleanup_sighand(oldsighand);
 	}
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a23af225c898..6f69d9a3c868 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -50,6 +50,35 @@ struct kernel_clone_args {
  * a separate lock).
  */
 extern rwlock_t tasklist_lock;
+
+/*
+ * tasklist_lock is a special lock: its readers can take a long time
+ * to finish, and the plain write_lock_irq() API disables local irqs
+ * before it starts spinning, which leaves the current cpu waiting for
+ * the lock while it is unresponsive to interrupts.
+ *
+ * All current tasklist_lock writers use write_lock_irq to take
+ * tasklist_lock and write_unlock_irq to release it, which means
+ * no writer waits on tasklist_lock with irqs already
+ * disabled. So the write lock/unlock wrapper here simply
+ * follows the current design and uses local_irq_disable and
+ * local_irq_enable directly.
+ */
+static inline void write_lock_tasklist_lock(void)
+{
+	while (1) {
+		local_irq_disable();
+		if (write_trylock(&tasklist_lock))
+			break;
+		local_irq_enable();
+		cpu_relax();
+	}
+}
+static inline void write_unlock_tasklist_lock(void)
+{
+	write_unlock_irq(&tasklist_lock);
+}
+
 extern spinlock_t mmlist_lock;
 
 extern union thread_union init_thread_union;
diff --git a/kernel/exit.c b/kernel/exit.c
index ee9f43bed49a..18b00f477079 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -251,7 +251,7 @@ void release_task(struct task_struct *p)
 
 	cgroup_release(p);
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	ptrace_release_task(p);
 	thread_pid = get_pid(p->thread_pid);
 	__exit_signal(p);
@@ -275,7 +275,7 @@ void release_task(struct task_struct *p)
 			leader->exit_state = EXIT_DEAD;
 	}
 
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 	seccomp_filter_release(p);
 	proc_flush_pid(thread_pid);
 	put_pid(thread_pid);
@@ -598,7 +598,7 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
 		return reaper;
 	}
 
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 
 	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
 		list_del_init(&p->ptrace_entry);
@@ -606,7 +606,7 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
 	}
 
 	zap_pid_ns_processes(pid_ns);
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 
 	return father;
 }
@@ -730,7 +730,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	struct task_struct *p, *n;
 	LIST_HEAD(dead);
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	forget_original_parent(tsk, &dead);
 
 	if (group_dead)
@@ -758,7 +758,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	/* mt-exec, de_thread() is waiting for group leader */
 	if (unlikely(tsk->signal->notify_count < 0))
 		wake_up_process(tsk->signal->group_exec_task);
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 
 	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
 		list_del_init(&p->ptrace_entry);
@@ -1172,7 +1172,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	wo->wo_stat = status;
 
 	if (state == EXIT_TRACE) {
-		write_lock_irq(&tasklist_lock);
+		write_lock_tasklist_lock();
 		/* We dropped tasklist, ptracer could die and untrace */
 		ptrace_unlink(p);
 
@@ -1181,7 +1181,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		if (do_notify_parent(p, p->exit_signal))
 			state = EXIT_DEAD;
 		p->exit_state = state;
-		write_unlock_irq(&tasklist_lock);
+		write_unlock_tasklist_lock();
 	}
 	if (state == EXIT_DEAD)
 		release_task(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..06c4b4ab9102 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2623,7 +2623,7 @@ __latent_entropy struct task_struct *copy_process(
 	 * Make it visible to the rest of the system, but dont wake it up yet.
 	 * Need tasklist lock for parent etc handling!
 	 */
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
@@ -2714,7 +2714,7 @@ __latent_entropy struct task_struct *copy_process(
 	hlist_del_init(&delayed.node);
 	spin_unlock(&current->sighand->siglock);
 	syscall_tracepoint_update(p);
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 
 	if (pidfile)
 		fd_install(pidfd, pidfile);
@@ -2735,7 +2735,7 @@ __latent_entropy struct task_struct *copy_process(
 bad_fork_cancel_cgroup:
 	sched_core_free(p);
 	spin_unlock(&current->sighand->siglock);
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 	cgroup_cancel_fork(p, args);
 bad_fork_put_pidfd:
 	if (clone_flags & CLONE_PIDFD) {
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d8b5e13a2229..a8d7e2d06f3e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -435,7 +435,7 @@ static int ptrace_attach(struct task_struct *task, long request,
 	if (retval)
 		goto unlock_creds;
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	retval = -EPERM;
 	if (unlikely(task->exit_state))
 		goto unlock_tasklist;
@@ -479,7 +479,7 @@ static int ptrace_attach(struct task_struct *task, long request,
 
 	retval = 0;
 unlock_tasklist:
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 unlock_creds:
 	mutex_unlock(&task->signal->cred_guard_mutex);
 out:
@@ -508,7 +508,7 @@ static int ptrace_traceme(void)
 {
 	int ret = -EPERM;
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	/* Are we already being traced? */
 	if (!current->ptrace) {
 		ret = security_ptrace_traceme(current->parent);
@@ -522,7 +522,7 @@ static int ptrace_traceme(void)
 			ptrace_link(current, current->real_parent);
 		}
 	}
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 
 	return ret;
 }
@@ -588,7 +588,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
 	/* Architecture-specific hardware disable .. */
 	ptrace_disable(child);
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	/*
 	 * We rely on ptrace_freeze_traced(). It can't be killed and
 	 * untraced by another thread, it can't be a zombie.
@@ -600,7 +600,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
 	 */
 	child->exit_code = data;
 	__ptrace_detach(current, child);
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 
 	proc_ptrace_connector(child, PTRACE_DETACH);
 
diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d..0b1647d3ed32 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1088,7 +1088,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 	/* From this point forward we keep holding onto the tasklist lock
 	 * so that our parent does not change from under us. -DaveM
 	 */
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 
 	err = -ESRCH;
 	p = find_task_by_vpid(pid);
@@ -1136,7 +1136,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 	err = 0;
 out:
 	/* All paths lead to here, thus we are safe. -DaveM */
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 	rcu_read_unlock();
 	return err;
 }
@@ -1229,7 +1229,7 @@ int ksys_setsid(void)
 	pid_t session = pid_vnr(sid);
 	int err = -EPERM;
 
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 	/* Fail if I am already a session leader */
 	if (group_leader->signal->leader)
 		goto out;
@@ -1247,7 +1247,7 @@ int ksys_setsid(void)
 
 	err = session;
 out:
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 	if (err > 0) {
 		proc_sid_connector(group_leader);
 		sched_autogroup_create_attach(group_leader);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 19be69fa4d05..dd8aed20486a 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1652,7 +1652,7 @@ long keyctl_session_to_parent(void)
 
 	me = current;
 	rcu_read_lock();
-	write_lock_irq(&tasklist_lock);
+	write_lock_tasklist_lock();
 
 	ret = -EPERM;
 	oldwork = NULL;
@@ -1702,7 +1702,7 @@ long keyctl_session_to_parent(void)
 	if (!ret)
 		newwork = NULL;
 unlock:
-	write_unlock_irq(&tasklist_lock);
+	write_unlock_tasklist_lock();
 	rcu_read_unlock();
 	if (oldwork)
 		put_cred(container_of(oldwork, struct cred, rcu));

base-commit: 88035e5694a86a7167d490bb95e9df97a9bb162b
-- 
2.17.1



^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2024-01-04  0:46 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-25  8:19 [PATCH] kernel: Introduce a write lock/unlock wrapper for tasklist_lock Maria Yu
2023-12-25  8:26 ` Aiqun Yu (Maria)
2024-01-03 14:04 ` Jarkko Sakkinen
  -- strict thread matches above, loose matches on Subject: below --
2023-12-13 10:17 Maria Yu
2023-12-13 16:22 ` Matthew Wilcox
2023-12-13 18:27   ` Eric W. Biederman
2023-12-15  5:52     ` Aiqun Yu (Maria)
2023-12-28 22:20     ` Matthew Wilcox
2024-01-02  2:19       ` Aiqun Yu (Maria)
2024-01-02  9:14         ` Matthew Wilcox
2024-01-03  2:58           ` Aiqun Yu (Maria)
2024-01-03 18:18             ` Matthew Wilcox
2024-01-04  0:46               ` Aiqun Yu (Maria)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox