linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Bo Li <libo.gcs85@bytedance.com>
To: tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
	dave.hansen@linux.intel.com, x86@kernel.org, luto@kernel.org,
	kees@kernel.org, akpm@linux-foundation.org, david@redhat.com,
	juri.lelli@redhat.com, vincent.guittot@linaro.org,
	peterz@infradead.org
Cc: dietmar.eggemann@arm.com, hpa@zytor.com, acme@kernel.org,
	namhyung@kernel.org, mark.rutland@arm.com,
	alexander.shishkin@linux.intel.com, jolsa@kernel.org,
	irogers@google.com, adrian.hunter@intel.com,
	kan.liang@linux.intel.com, viro@zeniv.linux.org.uk,
	brauner@kernel.org, jack@suse.cz, lorenzo.stoakes@oracle.com,
	Liam.Howlett@oracle.com, vbabka@suse.cz, rppt@kernel.org,
	surenb@google.com, mhocko@suse.com, rostedt@goodmis.org,
	bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
	jannh@google.com, pfalcato@suse.de, riel@surriel.com,
	harry.yoo@oracle.com, linux-kernel@vger.kernel.org,
	linux-perf-users@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, duanxiongchun@bytedance.com,
	yinhongbo@bytedance.com, dengliang.1214@bytedance.com,
	xieyongji@bytedance.com, chaiwen.cc@bytedance.com,
	songmuchun@bytedance.com, yuanzhu@bytedance.com,
	chengguozhu@bytedance.com, sunjiadong.lff@bytedance.com,
	Bo Li <libo.gcs85@bytedance.com>
Subject: [RFC v2 34/35] RPAL: enable fast epoll wait
Date: Fri, 30 May 2025 17:28:02 +0800	[thread overview]
Message-ID: <b13520ef51366f6c25c50f05de7210d37fcd9489.1748594841.git.libo.gcs85@bytedance.com> (raw)
In-Reply-To: <cover.1748594840.git.libo.gcs85@bytedance.com>

When a kernel event occurs during an RPAL call and triggers a lazy switch,
the kernel context switches from the sender to the receiver. When the
receiver later returns from user space to the sender, a second lazy switch
is required to switch the kernel context back to the sender. In the current
implementation, after the second lazy switch, the receiver returns to user
space via rpal_kernel_ret() and must then call epoll_wait() again to
re-enter the kernel. As a result, the receiver cannot process epoll events
for a long period, which degrades performance.

This patch introduces a fast epoll wait feature. During the second lazy
switch, the kernel configures epoll-related data structures so that the
receiver can directly enter the epoll wait state without first returning
to user space and then calling epoll_wait(). The patch adds a new state
RPAL_RECEIVER_STATE_READY_LS, which is used to mark that the receiver can
transition to RPAL_RECEIVER_STATE_WAIT during the second lazy switch. The
kernel then performs this state transition in rpal_lazy_switch_tail().

Signed-off-by: Bo Li <libo.gcs85@bytedance.com>
---
 arch/x86/rpal/core.c |  29 ++++++++++++-
 fs/eventpoll.c       | 101 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/rpal.h |   3 ++
 kernel/sched/core.c  |  13 +++++-
 4 files changed, 143 insertions(+), 3 deletions(-)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 2ac5d932f69c..7b6efde23e48 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -51,7 +51,25 @@ void rpal_lazy_switch_tail(struct task_struct *tsk)
 		atomic_cmpxchg(&rcc->receiver_state, rpal_build_call_state(tsk->rpal_sd),
 			       RPAL_RECEIVER_STATE_LAZY_SWITCH);
 	} else {
+		/* tsk is receiver */
+		int state;
+
+		rcc = tsk->rpal_rd->rcc;
+		state = atomic_read(&rcc->receiver_state);
+		/* receiver may be scheduled on another cpu after unlock. */
 		rpal_unlock_cpu(tsk);
+		/*
+		 * We must not use RPAL_RECEIVER_STATE_READY instead of
+		 * RPAL_RECEIVER_STATE_READY_LS. As receiver may at
+		 * TASK_RUNNING state and then call epoll_wait() again,
+		 * the state may become RPAL_RECEIVER_STATE_READY, we should
+		 * not changed its state to RPAL_RECEIVER_STATE_WAIT since
+		 * the state is set by another RPAL call.
+		 */
+		if (state == RPAL_RECEIVER_STATE_READY_LS)
+			atomic_cmpxchg(&rcc->receiver_state,
+				       RPAL_RECEIVER_STATE_READY_LS,
+				       RPAL_RECEIVER_STATE_WAIT);
 		rpal_unlock_cpu(current);
 	}
 }
@@ -63,8 +81,14 @@ void rpal_kernel_ret(struct pt_regs *regs)
 	int state;
 
 	if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
-		rcc = current->rpal_rd->rcc;
-		regs->ax = rpal_try_send_events(current->rpal_rd->ep, rcc);
+		struct rpal_receiver_data *rrd = current->rpal_rd;
+
+		rcc = rrd->rcc;
+		if (rcc->timeout > 0)
+			hrtimer_cancel(&rrd->ep_sleeper.timer);
+		rpal_remove_ep_wait_list(rrd);
+		regs->ax = rpal_try_send_events(rrd->ep, rcc);
+		fdput(rrd->f);
 		atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
 	} else {
 		tsk = current->rpal_sd->receiver;
@@ -173,6 +197,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
 		 * Otherwise, sender's user context will be corrupted.
 		 */
 		rebuild_receiver_stack(current->rpal_rd, regs);
+		rpal_fast_ep_poll(current->rpal_rd, regs);
 		rpal_schedule(next);
 		rpal_clear_task_thread_flag(prev, RPAL_LAZY_SWITCHED_BIT);
 		prev->rpal_rd->sender = NULL;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 791321639561..b70c1cd82335 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2143,6 +2143,107 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 }
 
 #ifdef CONFIG_RPAL
+static void *rpal_get_eventpoll(struct rpal_receiver_data *rrd, struct pt_regs *regs)
+{
+	struct rpal_receiver_call_context *rcc = rrd->rcc;
+	int epfd = rcc->epfd;
+	struct epoll_event __user *events = rcc->events;
+	int maxevents = rcc->maxevents;
+	struct file *file;
+
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) {
+		regs->ax = -EINVAL;
+		return NULL;
+	}
+
+	if (!access_ok(events, maxevents * sizeof(struct epoll_event))) {
+		regs->ax = -EFAULT;
+		return NULL;
+	}
+
+	rrd->f = fdget(epfd);
+	file = fd_file(rrd->f);
+	if (!file) {
+		regs->ax = -EBADF;
+		return NULL;
+	}
+
+	if (!is_file_epoll(file)) {
+		regs->ax = -EINVAL;
+		fdput(rrd->f);
+		return NULL;
+	}
+
+	rrd->ep = file->private_data;
+	return rrd->ep;
+}
+
+void rpal_fast_ep_poll(struct rpal_receiver_data *rrd, struct pt_regs *regs)
+{
+	struct eventpoll *ep;
+	struct rpal_receiver_call_context *rcc = rrd->rcc;
+	ktime_t ts = 0;
+	struct hrtimer *ht = &rrd->ep_sleeper.timer;
+	int state;
+	int avail;
+
+	regs->orig_ax = __NR_epoll_wait;
+	ep = rpal_get_eventpoll(rrd, regs);
+
+	if (!ep || signal_pending(current) ||
+	    unlikely(ep_events_available(ep)) ||
+	    atomic_read(&rcc->ep_pending) || unlikely(rcc->timeout == 0)) {
+		INIT_LIST_HEAD(&rrd->ep_wait.entry);
+	} else {
+		/*
+		 * Here we use RPAL_RECEIVER_STATE_READY_LS to avoid conflict with
+		 * RPAL_RECEIVER_STATE_READY. As the RPAL_RECEIVER_STATE_READY_LS
+		 * is convert to RPAL_RECEIVER_STATE_WAIT in rpal_lazy_switch_tail(),
+		 * it is possible the receiver is woken at that time. Thus,
+		 * rpal_lazy_switch_tail() should figure out whether the receiver
+		 * state is set by lazy switch or not. See rpal_lazy_switch_tail()
+		 * for details.
+		 */
+		state = atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_READY_LS);
+		if (unlikely(state != RPAL_RECEIVER_STATE_LAZY_SWITCH))
+			rpal_err("%s: unexpected state: %d\n", __func__, state);
+		init_waitqueue_func_entry(&rrd->ep_wait, rpal_ep_autoremove_wake_function);
+		rrd->ep_wait.private = rrd;
+		INIT_LIST_HEAD(&rrd->ep_wait.entry);
+		write_lock(&ep->lock);
+		set_current_state(TASK_INTERRUPTIBLE);
+		avail = ep_events_available(ep);
+		if (!avail)
+			__add_wait_queue_exclusive(&ep->wq, &rrd->ep_wait);
+		write_unlock(&ep->lock);
+		if (avail) {
+			/* keep state consistent when we enter rpal_kernel_ret() */
+			atomic_set(&rcc->receiver_state,
+				   RPAL_RECEIVER_STATE_LAZY_SWITCH);
+			set_current_state(TASK_RUNNING);
+			return;
+		}
+
+		if (rcc->timeout > 0) {
+			rrd->ep_sleeper.task = rrd->rcd.bp_task;
+			ts = ms_to_ktime(rcc->timeout);
+			hrtimer_start(ht, ts, HRTIMER_MODE_REL);
+		}
+	}
+}
+
+void rpal_remove_ep_wait_list(struct rpal_receiver_data *rrd)
+{
+	struct eventpoll *ep = (struct eventpoll *)rrd->ep;
+	wait_queue_entry_t *wait = &rrd->ep_wait;
+
+	if (!list_empty_careful(&wait->entry)) {
+		write_lock_irq(&ep->lock);
+		__remove_wait_queue(&ep->wq, wait);
+		write_unlock_irq(&ep->lock);
+	}
+}
+
 void *rpal_get_epitemep(wait_queue_entry_t *wait)
 {
 	struct epitem *epi = ep_item_from_wait(wait);
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index f5f4da63f28c..676113f0ba1f 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -126,6 +126,7 @@ enum rpal_receiver_state {
 	RPAL_RECEIVER_STATE_WAIT,
 	RPAL_RECEIVER_STATE_CALL,
 	RPAL_RECEIVER_STATE_LAZY_SWITCH,
+	RPAL_RECEIVER_STATE_READY_LS,
 	RPAL_RECEIVER_STATE_MAX,
 };
 
@@ -627,4 +628,6 @@ void rpal_resume_ep(struct task_struct *tsk);
 void *rpal_get_epitemep(wait_queue_entry_t *wait);
 int rpal_get_epitemfd(wait_queue_entry_t *wait);
 int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc);
+void rpal_remove_ep_wait_list(struct rpal_receiver_data *rrd);
+void rpal_fast_ep_poll(struct rpal_receiver_data *rrd, struct pt_regs *regs);
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d6f8e0d76fc0..1728b04d1387 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3965,6 +3965,11 @@ static bool rpal_check_state(struct task_struct *p)
 		case RPAL_RECEIVER_STATE_LAZY_SWITCH:
 		case RPAL_RECEIVER_STATE_RUNNING:
 			break;
+		/*
+		 * Allow RPAL_RECEIVER_STATE_READY_LS to be woken will cause irq
+		 * being enabled in rpal_unlock_cpu.
+		 */
+		case RPAL_RECEIVER_STATE_READY_LS:
 		case RPAL_RECEIVER_STATE_CALL:
 			rpal_set_task_thread_flag(p, RPAL_WAKE_BIT);
 			ret = false;
@@ -11403,7 +11408,13 @@ void __sched notrace rpal_schedule(struct task_struct *next)
 
 	prev_state = READ_ONCE(prev->__state);
 	if (prev_state) {
-		try_to_block_task(rq, prev, &prev_state);
+		if (!try_to_block_task(rq, prev, &prev_state)) {
+			/*
+			 * As the task enter TASK_RUNNING state, we should clean up
+			 * RPAL_RECEIVER_STATE_READY_LS status.
+			 */
+			rpal_check_ready_state(prev, RPAL_RECEIVER_STATE_READY_LS);
+		}
 		switch_count = &prev->nvcsw;
 	}
 
-- 
2.20.1



  parent reply	other threads:[~2025-05-30  9:37 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-30  9:27 [RFC v2 00/35] optimize cost of inter-process communication Bo Li
2025-05-30  9:27 ` [RFC v2 01/35] Kbuild: rpal support Bo Li
2025-05-30  9:27 ` [RFC v2 02/35] RPAL: add struct rpal_service Bo Li
2025-05-30  9:27 ` [RFC v2 03/35] RPAL: add service registration interface Bo Li
2025-05-30  9:27 ` [RFC v2 04/35] RPAL: add member to task_struct and mm_struct Bo Li
2025-05-30  9:27 ` [RFC v2 05/35] RPAL: enable virtual address space partitions Bo Li
2025-05-30  9:27 ` [RFC v2 06/35] RPAL: add user interface Bo Li
2025-05-30  9:27 ` [RFC v2 07/35] RPAL: enable shared page mmap Bo Li
2025-05-30  9:27 ` [RFC v2 08/35] RPAL: enable sender/receiver registration Bo Li
2025-05-30  9:27 ` [RFC v2 09/35] RPAL: enable address space sharing Bo Li
2025-05-30  9:27 ` [RFC v2 10/35] RPAL: allow service enable/disable Bo Li
2025-05-30  9:27 ` [RFC v2 11/35] RPAL: add service request/release Bo Li
2025-05-30  9:27 ` [RFC v2 12/35] RPAL: enable service disable notification Bo Li
2025-05-30  9:27 ` [RFC v2 13/35] RPAL: add tlb flushing support Bo Li
2025-05-30  9:27 ` [RFC v2 14/35] RPAL: enable page fault handling Bo Li
2025-05-30 13:59   ` Dave Hansen
2025-05-30  9:27 ` [RFC v2 15/35] RPAL: add sender/receiver state Bo Li
2025-05-30  9:27 ` [RFC v2 16/35] RPAL: add cpu lock interface Bo Li
2025-05-30  9:27 ` [RFC v2 17/35] RPAL: add a mapping between fsbase and tasks Bo Li
2025-05-30  9:27 ` [RFC v2 18/35] sched: pick a specified task Bo Li
2025-05-30  9:27 ` [RFC v2 19/35] RPAL: add lazy switch main logic Bo Li
2025-05-30  9:27 ` [RFC v2 20/35] RPAL: add rpal_ret_from_lazy_switch Bo Li
2025-05-30  9:27 ` [RFC v2 21/35] RPAL: add kernel entry handling for lazy switch Bo Li
2025-05-30  9:27 ` [RFC v2 22/35] RPAL: rebuild receiver state Bo Li
2025-05-30  9:27 ` [RFC v2 23/35] RPAL: resume cpumask when fork Bo Li
2025-05-30  9:27 ` [RFC v2 24/35] RPAL: critical section optimization Bo Li
2025-05-30  9:27 ` [RFC v2 25/35] RPAL: add MPK initialization and interface Bo Li
2025-05-30  9:27 ` [RFC v2 26/35] RPAL: enable MPK support Bo Li
2025-05-30 17:03   ` Dave Hansen
2025-05-30  9:27 ` [RFC v2 27/35] RPAL: add epoll support Bo Li
2025-05-30  9:27 ` [RFC v2 28/35] RPAL: add rpal_uds_fdmap() support Bo Li
2025-05-30  9:27 ` [RFC v2 29/35] RPAL: fix race condition in pkru update Bo Li
2025-05-30  9:27 ` [RFC v2 30/35] RPAL: fix pkru setup when fork Bo Li
2025-05-30  9:27 ` [RFC v2 31/35] RPAL: add receiver waker Bo Li
2025-05-30  9:28 ` [RFC v2 32/35] RPAL: fix unknown nmi on AMD CPU Bo Li
2025-05-30  9:28 ` [RFC v2 33/35] RPAL: enable time slice correction Bo Li
2025-05-30  9:28 ` Bo Li [this message]
2025-05-30  9:28 ` [RFC v2 35/35] samples/rpal: add RPAL samples Bo Li
2025-05-30  9:33 ` [RFC v2 00/35] optimize cost of inter-process communication Lorenzo Stoakes
2025-06-03  8:22   ` Bo Li
2025-06-03  9:22     ` Lorenzo Stoakes
2025-05-30  9:41 ` Pedro Falcato
2025-05-30  9:56 ` David Hildenbrand
2025-05-30 22:42 ` Andrew Morton
2025-05-31  7:16 ` Ingo Molnar
2025-06-03 17:49 ` H. Peter Anvin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b13520ef51366f6c25c50f05de7210d37fcd9489.1748594841.git.libo.gcs85@bytedance.com \
    --to=libo.gcs85@bytedance.com \
    --cc=Liam.Howlett@oracle.com \
    --cc=acme@kernel.org \
    --cc=adrian.hunter@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=bp@alien8.de \
    --cc=brauner@kernel.org \
    --cc=bsegall@google.com \
    --cc=chaiwen.cc@bytedance.com \
    --cc=chengguozhu@bytedance.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@redhat.com \
    --cc=dengliang.1214@bytedance.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=duanxiongchun@bytedance.com \
    --cc=harry.yoo@oracle.com \
    --cc=hpa@zytor.com \
    --cc=irogers@google.com \
    --cc=jack@suse.cz \
    --cc=jannh@google.com \
    --cc=jolsa@kernel.org \
    --cc=juri.lelli@redhat.com \
    --cc=kan.liang@linux.intel.com \
    --cc=kees@kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=luto@kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mgorman@suse.de \
    --cc=mhocko@suse.com \
    --cc=mingo@redhat.com \
    --cc=namhyung@kernel.org \
    --cc=peterz@infradead.org \
    --cc=pfalcato@suse.de \
    --cc=riel@surriel.com \
    --cc=rostedt@goodmis.org \
    --cc=rppt@kernel.org \
    --cc=songmuchun@bytedance.com \
    --cc=sunjiadong.lff@bytedance.com \
    --cc=surenb@google.com \
    --cc=tglx@linutronix.de \
    --cc=vbabka@suse.cz \
    --cc=vincent.guittot@linaro.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=vschneid@redhat.com \
    --cc=x86@kernel.org \
    --cc=xieyongji@bytedance.com \
    --cc=yinhongbo@bytedance.com \
    --cc=yuanzhu@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox