linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Bo Li <libo.gcs85@bytedance.com>
To: tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
	dave.hansen@linux.intel.com, x86@kernel.org, luto@kernel.org,
	kees@kernel.org, akpm@linux-foundation.org, david@redhat.com,
	juri.lelli@redhat.com, vincent.guittot@linaro.org,
	peterz@infradead.org
Cc: dietmar.eggemann@arm.com, hpa@zytor.com, acme@kernel.org,
	namhyung@kernel.org, mark.rutland@arm.com,
	alexander.shishkin@linux.intel.com, jolsa@kernel.org,
	irogers@google.com, adrian.hunter@intel.com,
	kan.liang@linux.intel.com, viro@zeniv.linux.org.uk,
	brauner@kernel.org, jack@suse.cz, lorenzo.stoakes@oracle.com,
	Liam.Howlett@oracle.com, vbabka@suse.cz, rppt@kernel.org,
	surenb@google.com, mhocko@suse.com, rostedt@goodmis.org,
	bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
	jannh@google.com, pfalcato@suse.de, riel@surriel.com,
	harry.yoo@oracle.com, linux-kernel@vger.kernel.org,
	linux-perf-users@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, duanxiongchun@bytedance.com,
	yinhongbo@bytedance.com, dengliang.1214@bytedance.com,
	xieyongji@bytedance.com, chaiwen.cc@bytedance.com,
	songmuchun@bytedance.com, yuanzhu@bytedance.com,
	chengguozhu@bytedance.com, sunjiadong.lff@bytedance.com,
	Bo Li <libo.gcs85@bytedance.com>
Subject: [RFC v2 20/35] RPAL: add rpal_ret_from_lazy_switch
Date: Fri, 30 May 2025 17:27:48 +0800	[thread overview]
Message-ID: <4cd58d0e989640f0c230196e81cec5cee0ceb476.1748594841.git.libo.gcs85@bytedance.com> (raw)
In-Reply-To: <cover.1748594840.git.libo.gcs85@bytedance.com>

After a lazy switch, the task that was running before the switch loses its
user mode context (which is passed to the task that runs after the switch).
Therefore, RPAL needs to handle the issue of the previous task losing its
user mode context.

After the lazy switch occurs, the sender can resume execution in two ways.
One way is to be scheduled by the scheduler. In this case, RPAL handles
this issue in a manner similar to ret_from_fork. The sender will enter
rpal_ret_from_lazy_switch through the stack frame constructed by the lazy
switch to execute the return logic and finally return to the pre-defined
user mode (referred to as "kernel return"). The other way is to be switched
back to by the receiver through another lazy switch. In this case, the
receiver will pass the user mode context to the sender, so there is no need
to construct a user mode context for the sender. And the receiver can
return to the user mode through the kernel return method.

rpal_ret_from_lazy_switch primarily handles scheduler cleanup work, similar
to schedule_tail(), but does not perform set_child_tid; otherwise, it might
cause set_child_tid to be executed repeatedly. It then calls
rpal_kernel_ret(), which is primarily used to set the states of the sender
and receiver and attempt to unlock the CPU. Finally, it performs syscall
cleanup work and returns to user mode.

Signed-off-by: Bo Li <libo.gcs85@bytedance.com>
---
 arch/x86/entry/entry_64.S | 23 ++++++++++++++++++++
 arch/x86/rpal/core.c      | 45 +++++++++++++++++++++++++++++++++++++--
 include/linux/rpal.h      |  5 ++++-
 kernel/sched/core.c       | 25 +++++++++++++++++++++-
 4 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ed04a968cc7d..13b4d0684575 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -169,6 +169,29 @@ SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
 	int3
 SYM_CODE_END(entry_SYSCALL_64)
 
+#ifdef CONFIG_RPAL
+SYM_CODE_START(rpal_ret_from_lazy_switch)
+	UNWIND_HINT_END_OF_STACK
+	ANNOTATE_NOENDBR
+	movq	%rax, %rdi
+	call	rpal_schedule_tail
+
+	movq	%rsp, %rdi
+	call	rpal_kernel_ret
+
+	movq	%rsp, %rdi
+	call	syscall_exit_to_user_mode	/* returns with IRQs disabled */
+
+	UNWIND_HINT_REGS
+#ifdef CONFIG_X86_FRED
+	ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \
+		    "jmp asm_fred_exit_user", X86_FEATURE_FRED
+#else
+	jmp	swapgs_restore_regs_and_return_to_usermode
+#endif
+SYM_CODE_END(rpal_ret_from_lazy_switch)
+#endif
+
 /*
  * %rdi: prev task
  * %rsi: next task
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 19c4ef38bca3..ed4c11e6838c 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -18,7 +18,7 @@ unsigned long rpal_cap;
 
 static inline void rpal_lock_cpu(struct task_struct *tsk)
 {
-	rpal_set_cpus_allowed_ptr(tsk, true);
+	rpal_set_cpus_allowed_ptr(tsk, true, false);
 	if (unlikely(!irqs_disabled())) {
 		local_irq_disable();
 		rpal_err("%s: irq is enabled\n", __func__);
@@ -27,13 +27,54 @@ static inline void rpal_lock_cpu(struct task_struct *tsk)
 
 static inline void rpal_unlock_cpu(struct task_struct *tsk)
 {
-	rpal_set_cpus_allowed_ptr(tsk, false);
+	rpal_set_cpus_allowed_ptr(tsk, false, false);
 	if (unlikely(!irqs_disabled())) {
 		local_irq_disable();
 		rpal_err("%s: irq is enabled\n", __func__);
 	}
 }
 
+static inline void rpal_unlock_cpu_kernel_ret(struct task_struct *tsk)
+{
+	rpal_set_cpus_allowed_ptr(tsk, false, true);
+}
+
+void rpal_kernel_ret(struct pt_regs *regs)
+{
+	struct task_struct *tsk;
+	struct rpal_receiver_call_context *rcc;
+	int state;
+
+	if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
+		rcc = current->rpal_rd->rcc;
+		atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
+	} else {
+		tsk = current->rpal_sd->receiver;
+		rcc = tsk->rpal_rd->rcc;
+		rpal_clear_task_thread_flag(tsk, RPAL_LAZY_SWITCHED_BIT);
+		state = atomic_xchg(&rcc->sender_state, RPAL_SENDER_STATE_KERNEL_RET);
+		WARN_ON_ONCE(state != RPAL_SENDER_STATE_CALL);
+		/* make sure kernel return is finished */
+		smp_mb();
+		WRITE_ONCE(tsk->rpal_rd->sender, NULL);
+		/*
+		 * We must unlock receiver first, otherwise we may unlock
+		 * receiver which is already locked by another sender.
+		 *
+		 *  Sender A			Receiver B      Sender C
+		 *	lazy switch (A->B)
+		 *  kernel return
+		 *      unlock cpu A
+		 *                      epoll_wait
+		 *                                         lazy switch(C->B)
+		 *                                         lock cpu B
+		 *		unlock cpu B
+		 *						BUG()			BUG()
+		 */
+		rpal_unlock_cpu_kernel_ret(tsk);
+		rpal_unlock_cpu_kernel_ret(current);
+	}
+}
 
 static inline struct task_struct *rpal_get_sender_task(void)
 {
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 0813db4552c0..01b582fa821e 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -480,14 +480,17 @@ int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
 					 unsigned long addr, int error_code);
 struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);
 struct task_struct *rpal_find_next_task(unsigned long fsbase);
+void rpal_kernel_ret(struct pt_regs *regs);
 
 extern void rpal_pick_mmap_base(struct mm_struct *mm,
 	struct rlimit *rlim_stack);
 int rpal_try_to_wake_up(struct task_struct *p);
 int rpal_init_thread_pending(struct rpal_common_data *rcd);
 void rpal_free_thread_pending(struct rpal_common_data *rcd);
-int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock);
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock,
+	bool is_kernel_ret);
 void rpal_schedule(struct task_struct *next);
 asmlinkage struct task_struct *
 __rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
+asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 760d88458b39..0f9343698198 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3181,7 +3181,8 @@ void rpal_free_thread_pending(struct rpal_common_data *rcd)
 /*
  * CPU lock is forced and all cpumask will be ignored by RPAL temporary.
  */
-int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock)
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock,
+							 bool is_kernel_ret)
 {
 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
 	struct set_affinity_pending *pending = p->rpal_cd->pending;
@@ -3210,6 +3211,9 @@ int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock)
 		rpal_clear_task_thread_flag(p, RPAL_CPU_LOCKED_BIT);
 	}
 
+	if (is_kernel_ret)
+		return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
+
 	update_rq_clock(rq);
 
 	if (cpumask_equal(&p->cpus_mask, ac.new_mask))
@@ -11011,6 +11015,25 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
 #endif	/* CONFIG_SCHED_CLASS_EXT */
 
 #ifdef CONFIG_RPAL
+asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev)
+	__releases(rq->lock)
+{
+	/*
+	 * New tasks start with FORK_PREEMPT_COUNT, see there and
+	 * finish_task_switch() for details.
+	 *
+	 * finish_task_switch() will drop rq->lock() and lower preempt_count
+	 * and the preempt_enable() will end up enabling preemption (on
+	 * PREEMPT_COUNT kernels).
+	 */
+
+	finish_task_switch(prev);
+	trace_sched_exit_tp(true, CALLER_ADDR0);
+	preempt_enable();
+
+	calculate_sigpending();
+}
+
 static struct rq *rpal_finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
-- 
2.20.1



  parent reply	other threads:[~2025-05-30  9:33 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-30  9:27 [RFC v2 00/35] optimize cost of inter-process communication Bo Li
2025-05-30  9:27 ` [RFC v2 01/35] Kbuild: rpal support Bo Li
2025-05-30  9:27 ` [RFC v2 02/35] RPAL: add struct rpal_service Bo Li
2025-05-30  9:27 ` [RFC v2 03/35] RPAL: add service registration interface Bo Li
2025-05-30  9:27 ` [RFC v2 04/35] RPAL: add member to task_struct and mm_struct Bo Li
2025-05-30  9:27 ` [RFC v2 05/35] RPAL: enable virtual address space partitions Bo Li
2025-05-30  9:27 ` [RFC v2 06/35] RPAL: add user interface Bo Li
2025-05-30  9:27 ` [RFC v2 07/35] RPAL: enable shared page mmap Bo Li
2025-05-30  9:27 ` [RFC v2 08/35] RPAL: enable sender/receiver registration Bo Li
2025-05-30  9:27 ` [RFC v2 09/35] RPAL: enable address space sharing Bo Li
2025-05-30  9:27 ` [RFC v2 10/35] RPAL: allow service enable/disable Bo Li
2025-05-30  9:27 ` [RFC v2 11/35] RPAL: add service request/release Bo Li
2025-05-30  9:27 ` [RFC v2 12/35] RPAL: enable service disable notification Bo Li
2025-05-30  9:27 ` [RFC v2 13/35] RPAL: add tlb flushing support Bo Li
2025-05-30  9:27 ` [RFC v2 14/35] RPAL: enable page fault handling Bo Li
2025-05-30 13:59   ` Dave Hansen
2025-05-30  9:27 ` [RFC v2 15/35] RPAL: add sender/receiver state Bo Li
2025-05-30  9:27 ` [RFC v2 16/35] RPAL: add cpu lock interface Bo Li
2025-05-30  9:27 ` [RFC v2 17/35] RPAL: add a mapping between fsbase and tasks Bo Li
2025-05-30  9:27 ` [RFC v2 18/35] sched: pick a specified task Bo Li
2025-05-30  9:27 ` [RFC v2 19/35] RPAL: add lazy switch main logic Bo Li
2025-05-30  9:27 ` Bo Li [this message]
2025-05-30  9:27 ` [RFC v2 21/35] RPAL: add kernel entry handling for lazy switch Bo Li
2025-05-30  9:27 ` [RFC v2 22/35] RPAL: rebuild receiver state Bo Li
2025-05-30  9:27 ` [RFC v2 23/35] RPAL: resume cpumask when fork Bo Li
2025-05-30  9:27 ` [RFC v2 24/35] RPAL: critical section optimization Bo Li
2025-05-30  9:27 ` [RFC v2 25/35] RPAL: add MPK initialization and interface Bo Li
2025-05-30  9:27 ` [RFC v2 26/35] RPAL: enable MPK support Bo Li
2025-05-30 17:03   ` Dave Hansen
2025-05-30  9:27 ` [RFC v2 27/35] RPAL: add epoll support Bo Li
2025-05-30  9:27 ` [RFC v2 28/35] RPAL: add rpal_uds_fdmap() support Bo Li
2025-05-30  9:27 ` [RFC v2 29/35] RPAL: fix race condition in pkru update Bo Li
2025-05-30  9:27 ` [RFC v2 30/35] RPAL: fix pkru setup when fork Bo Li
2025-05-30  9:27 ` [RFC v2 31/35] RPAL: add receiver waker Bo Li
2025-05-30  9:28 ` [RFC v2 32/35] RPAL: fix unknown nmi on AMD CPU Bo Li
2025-05-30  9:28 ` [RFC v2 33/35] RPAL: enable time slice correction Bo Li
2025-05-30  9:28 ` [RFC v2 34/35] RPAL: enable fast epoll wait Bo Li
2025-05-30  9:28 ` [RFC v2 35/35] samples/rpal: add RPAL samples Bo Li
2025-05-30  9:33 ` [RFC v2 00/35] optimize cost of inter-process communication Lorenzo Stoakes
2025-06-03  8:22   ` Bo Li
2025-06-03  9:22     ` Lorenzo Stoakes
2025-05-30  9:41 ` Pedro Falcato
2025-05-30  9:56 ` David Hildenbrand
2025-05-30 22:42 ` Andrew Morton
2025-05-31  7:16 ` Ingo Molnar
2025-06-03 17:49 ` H. Peter Anvin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4cd58d0e989640f0c230196e81cec5cee0ceb476.1748594841.git.libo.gcs85@bytedance.com \
    --to=libo.gcs85@bytedance.com \
    --cc=Liam.Howlett@oracle.com \
    --cc=acme@kernel.org \
    --cc=adrian.hunter@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=bp@alien8.de \
    --cc=brauner@kernel.org \
    --cc=bsegall@google.com \
    --cc=chaiwen.cc@bytedance.com \
    --cc=chengguozhu@bytedance.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@redhat.com \
    --cc=dengliang.1214@bytedance.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=duanxiongchun@bytedance.com \
    --cc=harry.yoo@oracle.com \
    --cc=hpa@zytor.com \
    --cc=irogers@google.com \
    --cc=jack@suse.cz \
    --cc=jannh@google.com \
    --cc=jolsa@kernel.org \
    --cc=juri.lelli@redhat.com \
    --cc=kan.liang@linux.intel.com \
    --cc=kees@kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=luto@kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mgorman@suse.de \
    --cc=mhocko@suse.com \
    --cc=mingo@redhat.com \
    --cc=namhyung@kernel.org \
    --cc=peterz@infradead.org \
    --cc=pfalcato@suse.de \
    --cc=riel@surriel.com \
    --cc=rostedt@goodmis.org \
    --cc=rppt@kernel.org \
    --cc=songmuchun@bytedance.com \
    --cc=sunjiadong.lff@bytedance.com \
    --cc=surenb@google.com \
    --cc=tglx@linutronix.de \
    --cc=vbabka@suse.cz \
    --cc=vincent.guittot@linaro.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=vschneid@redhat.com \
    --cc=x86@kernel.org \
    --cc=xieyongji@bytedance.com \
    --cc=yinhongbo@bytedance.com \
    --cc=yuanzhu@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox