From: Bo Li <libo.gcs85@bytedance.com>
To: tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
	dave.hansen@linux.intel.com, x86@kernel.org, luto@kernel.org,
	kees@kernel.org, akpm@linux-foundation.org, david@redhat.com,
	juri.lelli@redhat.com, vincent.guittot@linaro.org,
	peterz@infradead.org
Cc: dietmar.eggemann@arm.com, hpa@zytor.com, acme@kernel.org,
	namhyung@kernel.org, mark.rutland@arm.com,
	alexander.shishkin@linux.intel.com, jolsa@kernel.org,
	irogers@google.com, adrian.hunter@intel.com,
	kan.liang@linux.intel.com, viro@zeniv.linux.org.uk,
	brauner@kernel.org, jack@suse.cz, lorenzo.stoakes@oracle.com,
	Liam.Howlett@oracle.com, vbabka@suse.cz, rppt@kernel.org,
	surenb@google.com, mhocko@suse.com, rostedt@goodmis.org,
	bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
	jannh@google.com, pfalcato@suse.de, riel@surriel.com,
	harry.yoo@oracle.com, linux-kernel@vger.kernel.org,
	linux-perf-users@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, duanxiongchun@bytedance.com,
	yinhongbo@bytedance.com, dengliang.1214@bytedance.com,
	xieyongji@bytedance.com, chaiwen.cc@bytedance.com,
	songmuchun@bytedance.com, yuanzhu@bytedance.com,
	chengguozhu@bytedance.com, sunjiadong.lff@bytedance.com,
	Bo Li <libo.gcs85@bytedance.com>
Subject: [RFC v2 27/35] RPAL: add epoll support
Date: Fri, 30 May 2025 17:27:55 +0800
Message-ID: <7eb30a577e2c6a4f582515357aea25260105eb18.1748594841.git.libo.gcs85@bytedance.com>
In-Reply-To: <cover.1748594840.git.libo.gcs85@bytedance.com>

To support the epoll family of system calls, RPAL adds RPAL-service
handling to the existing epoll code, so that user space can drive RPAL
services through the same epoll interfaces it already uses.

When the receiver thread calls epoll_wait(), it can set RPAL_EP_POLL_MAGIC
in the shared call context to tell the kernel to take the RPAL path. The
kernel then sets the receiver's state to RPAL_RECEIVER_STATE_READY, and
transitions it to RPAL_RECEIVER_STATE_WAIT once the receiver has actually
been removed from the runqueue; from that point on, a sender can perform
RPAL calls on the receiver thread.
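
For illustration, a receiver might arm the RPAL epoll path roughly as
follows. This is a hypothetical userspace sketch, not part of this
patch; it assumes the shared rpal_receiver_call_context is already
mapped and reachable through "rcc", and that the receiver (or its
library) fills in the shared event buffer fields consumed by
rpal_try_send_events():

	struct epoll_event events[64];
	int n;

	/* do_epoll_wait() checks this field to select rpal_ep_poll(). */
	rcc->rpal_ep_poll_magic = RPAL_EP_POLL_MAGIC;
	/* Shared buffer that rpal_try_send_events() copies events into. */
	rcc->events = events;
	rcc->maxevents = 64;

	n = epoll_wait(epfd, events, 64, -1);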

Signed-off-by: Bo Li <libo.gcs85@bytedance.com>
---
 arch/x86/rpal/core.c |   4 +
 fs/eventpoll.c       | 200 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/rpal.h |  21 +++++
 kernel/sched/core.c  |  17 ++++
 4 files changed, 242 insertions(+)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 47c9e551344e..6a22b9faa100 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -9,6 +9,7 @@
 #include <linux/rpal.h>
 #include <linux/sched/task_stack.h>
 #include <linux/pkeys.h>
+#include <linux/file.h>
 #include <asm/fsgsbase.h>
 
 #include "internal.h"
@@ -63,6 +64,7 @@ void rpal_kernel_ret(struct pt_regs *regs)
 
 	if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
 		rcc = current->rpal_rd->rcc;
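+		/*
+		 * Deliver any ready epoll events into the shared buffer
+		 * before returning to the receiver; the count becomes the
+		 * receiver's epoll_wait() return value.
+		 */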
+		regs->ax = rpal_try_send_events(current->rpal_rd->ep, rcc);
 		atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
 	} else {
 		tsk = current->rpal_sd->receiver;
@@ -142,6 +144,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
 	struct task_struct *prev = current;
 
 	if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+		rpal_resume_ep(next);
 		current->rpal_sd->receiver = next;
 		rpal_lock_cpu(current);
 		rpal_lock_cpu(next);
@@ -154,6 +157,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
 		 */
 		rebuild_sender_stack(current->rpal_sd, regs);
 		rpal_schedule(next);
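+		/* Drop the epoll file reference saved by do_epoll_wait(). */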
+		fdput(next->rpal_rd->f);
 	} else {
 		update_dst_stack(next, regs);
 		/*
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d4dbffdedd08..437cd5764c03 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
 #include <linux/compat.h>
 #include <linux/rculist.h>
 #include <linux/capability.h>
+#include <linux/rpal.h>
 #include <net/busy_poll.h>
 
 /*
@@ -2141,6 +2142,187 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	}
 }
 
+#ifdef CONFIG_RPAL
+
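+/*
+ * Undo a receiver's epoll sleep when a sender lazy-switches to it: cancel
+ * the epoll timeout (if one was armed) and take the receiver off the
+ * eventpoll wait queue.
+ */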
+void rpal_resume_ep(struct task_struct *tsk)
+{
+	struct rpal_receiver_data *rrd = tsk->rpal_rd;
+	struct eventpoll *ep = (struct eventpoll *)rrd->ep;
+	struct rpal_receiver_call_context *rcc = rrd->rcc;
+
+	if (rcc->timeout > 0) {
+		hrtimer_cancel(&rrd->ep_sleeper.timer);
+		destroy_hrtimer_on_stack(&rrd->ep_sleeper.timer);
+	}
+	if (!list_empty_careful(&rrd->ep_wait.entry)) {
+		write_lock(&ep->lock);
+		__remove_wait_queue(&ep->wq, &rrd->ep_wait);
+		write_unlock(&ep->lock);
+	}
+}
+
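+/*
+ * Copy ready events into the receiver's shared event buffer and keep
+ * RPAL_KERNEL_PENDING consistent with ep_events_available(): clear the
+ * flag once the ready list drains, then re-set it if new events arrived
+ * in the meantime, so a concurrent wakeup is not lost.
+ */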
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc)
+{
+	int eavail;
+	int res = 0;
+
+	res = ep_send_events(ep, rcc->events, rcc->maxevents);
+	if (res > 0)
+		ep_suspend_napi_irqs(ep);
+
+	eavail = ep_events_available(ep);
+	if (!eavail) {
+		atomic_and(~RPAL_KERNEL_PENDING, &rcc->ep_pending);
+		/* check again to avoid data race on RPAL_KERNEL_PENDING */
+		eavail = ep_events_available(ep);
+		if (eavail)
+			atomic_or(RPAL_KERNEL_PENDING, &rcc->ep_pending);
+	}
+	return res;
+}
+
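+/*
+ * Variant of schedule_hrtimeout_range_clock() that uses the sleeper
+ * embedded in rpal_receiver_data rather than a stack-local one, so that
+ * a lazy switch (rpal_resume_ep()) can cancel the timer on the sleeping
+ * receiver's behalf.
+ */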
+static int rpal_schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
+					       const enum hrtimer_mode mode,
+					       clockid_t clock_id)
+{
+	struct hrtimer_sleeper *t = &current->rpal_rd->ep_sleeper;
+
+	/*
+	 * Optimize when a zero timeout value is given. It does not
+	 * matter whether this is an absolute or a relative time.
+	 */
+	if (expires && *expires == 0) {
+		__set_current_state(TASK_RUNNING);
+		return 0;
+	}
+
+	/*
+	 * A NULL parameter means "infinite"
+	 */
+	if (!expires) {
+		schedule();
+		return -EINTR;
+	}
+
+	hrtimer_setup_sleeper_on_stack(t, clock_id, mode);
+	hrtimer_set_expires_range_ns(&t->timer, *expires, delta);
+	hrtimer_sleeper_start_expires(t, mode);
+
+	if (likely(t->task))
+		schedule();
+
+	hrtimer_cancel(&t->timer);
+	destroy_hrtimer_on_stack(&t->timer);
+
+	__set_current_state(TASK_RUNNING);
+
+	return !t->task ? 0 : -EINTR;
+}
+
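+/*
+ * RPAL variant of ep_poll(). Besides harvesting events, it publishes the
+ * receiver's state in the shared call context: READY before sleeping and
+ * RUNNING on every return path, so that senders know when this receiver
+ * can be the target of an RPAL call.
+ */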
+static int rpal_ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
+			int maxevents, struct timespec64 *timeout)
+{
+	int res = 0, eavail, timed_out = 0;
+	u64 slack = 0;
+	struct rpal_receiver_data *rrd = current->rpal_rd;
+	wait_queue_entry_t *wait = &rrd->ep_wait;
+	ktime_t expires, *to = NULL;
+
+	rrd->ep = ep;
+
+	lockdep_assert_irqs_enabled();
+
+	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+		slack = select_estimate_accuracy(timeout);
+		to = &expires;
+		*to = timespec64_to_ktime(*timeout);
+	} else if (timeout) {
+		timed_out = 1;
+	}
+
+	eavail = ep_events_available(ep);
+
+	while (1) {
+		if (eavail) {
+			res = rpal_try_send_events(ep, rrd->rcc);
+			if (res) {
+				atomic_xchg(&rrd->rcc->receiver_state,
+					    RPAL_RECEIVER_STATE_RUNNING);
+				return res;
+			}
+		}
+
+		if (timed_out) {
+			atomic_xchg(&rrd->rcc->receiver_state,
+				    RPAL_RECEIVER_STATE_RUNNING);
+			return 0;
+		}
+
+		eavail = ep_busy_loop(ep);
+		if (eavail)
+			continue;
+
+		if (signal_pending(current)) {
+			atomic_xchg(&rrd->rcc->receiver_state,
+				    RPAL_RECEIVER_STATE_RUNNING);
+			return -EINTR;
+		}
+
+		init_wait(wait);
+		wait->func = rpal_ep_autoremove_wake_function;
+		wait->private = rrd;
+		write_lock_irq(&ep->lock);
+
+		atomic_xchg(&rrd->rcc->receiver_state,
+			    RPAL_RECEIVER_STATE_READY);
+		__set_current_state(TASK_INTERRUPTIBLE);
+
+		eavail = ep_events_available(ep);
+		if (!eavail)
+			__add_wait_queue_exclusive(&ep->wq, wait);
+
+		write_unlock_irq(&ep->lock);
+
+		if (!eavail && ep_schedule_timeout(to)) {
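+			/*
+			 * User space has flagged a pending RPAL call; skip
+			 * the sleep and treat it like a timeout.
+			 */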
+			if (RPAL_USER_PENDING & atomic_read(&rrd->rcc->ep_pending)) {
+				timed_out = 1;
+			} else {
+				timed_out =
+					!rpal_schedule_hrtimeout_range_clock(
+						to, slack, HRTIMER_MODE_ABS,
+						CLOCK_MONOTONIC);
+			}
+		}
+		atomic_cmpxchg(&rrd->rcc->receiver_state,
+			       RPAL_RECEIVER_STATE_READY,
+			       RPAL_RECEIVER_STATE_RUNNING);
+		__set_current_state(TASK_RUNNING);
+
+		/*
+		 * We were woken up, thus go and try to harvest some events.
+		 * If timed out and still on the wait queue, recheck eavail
+		 * carefully under lock, below.
+		 */
+		eavail = 1;
+
+		if (!list_empty_careful(&wait->entry)) {
+			write_lock_irq(&ep->lock);
+			/*
+			 * If the thread timed out and is not on the wait queue,
+			 * it means that the thread was woken up after its
+			 * timeout expired before it could reacquire the lock.
+			 * Thus, when wait.entry is empty, it needs to harvest
+			 * events.
+			 */
+			if (timed_out)
+				eavail = list_empty(&wait->entry);
+			__remove_wait_queue(&ep->wq, wait);
+			write_unlock_irq(&ep->lock);
+		}
+	}
+}
+#endif
+
 /**
  * ep_loop_check_proc - verify that adding an epoll file inside another
  *                      epoll structure does not violate the constraints, in
@@ -2529,7 +2711,25 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
 	ep = fd_file(f)->private_data;
 
 	/* Time to fish for events ... */
+#ifdef CONFIG_RPAL
+	/*
+	 * For an RPAL task that is a receiver and has set the magic value in
+	 * shared memory, we assume it is prepared for RPAL calls and handle
+	 * it through the RPAL path.
+	 *
+	 * In all other cases, an RPAL task behaves like a normal task.
+	 */
+	if (rpal_current_service() &&
+	    rpal_test_current_thread_flag(RPAL_RECEIVER_BIT) &&
+	    current->rpal_rd->rcc->rpal_ep_poll_magic == RPAL_EP_POLL_MAGIC) {
+		current->rpal_rd->f = f;
+		return rpal_ep_poll(ep, events, maxevents, to);
+	}
+#endif
 	return ep_poll(ep, events, maxevents, to);
 }
 
 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index f2474cb53abe..5912ffec6e28 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -16,6 +16,8 @@
 #include <linux/hashtable.h>
 #include <linux/atomic.h>
 #include <linux/sizes.h>
+#include <linux/file.h>
+#include <linux/hrtimer.h>
 
 #define RPAL_ERROR_MSG "rpal error: "
 #define rpal_err(x...) pr_err(RPAL_ERROR_MSG x)
@@ -89,6 +91,7 @@ enum {
 };
 
 #define RPAL_ERROR_MAGIC 0x98CC98CC
+#define RPAL_EP_POLL_MAGIC 0xCC98CC98
 
 #define RPAL_SID_SHIFT 24
 #define RPAL_ID_SHIFT 8
@@ -103,6 +106,9 @@ enum {
 #define RPAL_PKRU_UNION 1
 #define RPAL_PKRU_INTERSECT 2
 
+#define RPAL_KERNEL_PENDING 0x1
+#define RPAL_USER_PENDING 0x2
+
 extern unsigned long rpal_cap;
 
 enum rpal_task_flag_bits {
@@ -282,6 +288,12 @@ struct rpal_receiver_call_context {
 	int receiver_id;
 	atomic_t receiver_state;
 	atomic_t sender_state;
+	atomic_t ep_pending;
+	int rpal_ep_poll_magic;
+	int epfd;
+	void __user *events;
+	int maxevents;
+	int timeout;
 };
 
 /* recovery point for sender */
@@ -325,6 +337,10 @@ struct rpal_receiver_data {
 	struct rpal_shared_page *rsp;
 	struct rpal_receiver_call_context *rcc;
 	struct task_struct *sender;
+	void *ep;
+	struct fd f;
+	struct hrtimer_sleeper ep_sleeper;
+	wait_queue_entry_t ep_wait;
 };
 
 struct rpal_sender_data {
@@ -574,4 +590,9 @@ __rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
 asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
 int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey);
 void rpal_set_pku_schedule_tail(struct task_struct *prev);
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+	unsigned int mode, int wake_flags,
+	void *key);
+void rpal_resume_ep(struct task_struct *tsk);
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc);
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb5d5bd51597..486d59bdd3fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6794,6 +6794,23 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 #define SM_RTLOCK_WAIT		2
 
 #ifdef CONFIG_RPAL
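+/*
+ * Wake callback for the receiver's eventpoll wait entry. The entry's
+ * private pointer carries rpal_receiver_data rather than a task_struct;
+ * if try_to_wake_up() reports that the task was not woken (e.g. it is
+ * already running), record the event in ep_pending so it is not lost.
+ */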
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+				     unsigned int mode, int wake_flags,
+				     void *key)
+{
+	struct rpal_receiver_data *rrd = curr->private;
+	struct task_struct *tsk = rrd->rcd.bp_task;
+	int ret;
+
+	ret = try_to_wake_up(tsk, mode, wake_flags);
+
+	list_del_init_careful(&curr->entry);
+	if (!ret)
+		atomic_or(RPAL_KERNEL_PENDING, &rrd->rcc->ep_pending);
+
+	return 1;
+}
+
 static inline void rpal_check_ready_state(struct task_struct *tsk, int state)
 {
 	if (rpal_test_task_thread_flag(tsk, RPAL_RECEIVER_BIT)) {
-- 
2.20.1