From: Bo Li <libo.gcs85@bytedance.com>
To: tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
dave.hansen@linux.intel.com, x86@kernel.org, luto@kernel.org,
kees@kernel.org, akpm@linux-foundation.org, david@redhat.com,
juri.lelli@redhat.com, vincent.guittot@linaro.org,
peterz@infradead.org
Cc: dietmar.eggemann@arm.com, hpa@zytor.com, acme@kernel.org,
namhyung@kernel.org, mark.rutland@arm.com,
alexander.shishkin@linux.intel.com, jolsa@kernel.org,
irogers@google.com, adrian.hunter@intel.com,
kan.liang@linux.intel.com, viro@zeniv.linux.org.uk,
brauner@kernel.org, jack@suse.cz, lorenzo.stoakes@oracle.com,
Liam.Howlett@oracle.com, vbabka@suse.cz, rppt@kernel.org,
surenb@google.com, mhocko@suse.com, rostedt@goodmis.org,
bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
jannh@google.com, pfalcato@suse.de, riel@surriel.com,
harry.yoo@oracle.com, linux-kernel@vger.kernel.org,
linux-perf-users@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-mm@kvack.org, duanxiongchun@bytedance.com,
yinhongbo@bytedance.com, dengliang.1214@bytedance.com,
xieyongji@bytedance.com, chaiwen.cc@bytedance.com,
songmuchun@bytedance.com, yuanzhu@bytedance.com,
chengguozhu@bytedance.com, sunjiadong.lff@bytedance.com,
Bo Li <libo.gcs85@bytedance.com>
Subject: [RFC v2 27/35] RPAL: add epoll support
Date: Fri, 30 May 2025 17:27:55 +0800 [thread overview]
Message-ID: <7eb30a577e2c6a4f582515357aea25260105eb18.1748594841.git.libo.gcs85@bytedance.com> (raw)
In-Reply-To: <cover.1748594840.git.libo.gcs85@bytedance.com>
To support the epoll family of syscalls, RPAL adds new handling for RPAL
services on top of the existing epoll code, ensuring that user mode can
invoke RPAL service behavior through the same interfaces as before.
When the receiver thread calls epoll_wait(), it can set RPAL_EP_POLL_MAGIC
to notify the kernel to invoke RPAL-related logic. The kernel then sets the
receiver's state to RPAL_RECEIVER_STATE_READY and transitions it to
RPAL_RECEIVER_STATE_WAIT when the receiver is actually removed from the
runqueue, allowing the sender to perform RPAL calls on the receiver thread.
Signed-off-by: Bo Li <libo.gcs85@bytedance.com>
---
arch/x86/rpal/core.c | 4 +
fs/eventpoll.c | 200 +++++++++++++++++++++++++++++++++++++++++++
include/linux/rpal.h | 21 +++++
kernel/sched/core.c | 17 ++++
4 files changed, 242 insertions(+)
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 47c9e551344e..6a22b9faa100 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -9,6 +9,7 @@
#include <linux/rpal.h>
#include <linux/sched/task_stack.h>
#include <linux/pkeys.h>
+#include <linux/file.h>
#include <asm/fsgsbase.h>
#include "internal.h"
@@ -63,6 +64,7 @@ void rpal_kernel_ret(struct pt_regs *regs)
if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
rcc = current->rpal_rd->rcc;
+ regs->ax = rpal_try_send_events(current->rpal_rd->ep, rcc);
atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
} else {
tsk = current->rpal_sd->receiver;
@@ -142,6 +144,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
struct task_struct *prev = current;
if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+ rpal_resume_ep(next);
current->rpal_sd->receiver = next;
rpal_lock_cpu(current);
rpal_lock_cpu(next);
@@ -154,6 +157,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
*/
rebuild_sender_stack(current->rpal_sd, regs);
rpal_schedule(next);
+ fdput(next->rpal_rd->f);
} else {
update_dst_stack(next, regs);
/*
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d4dbffdedd08..437cd5764c03 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
+#include <linux/rpal.h>
#include <net/busy_poll.h>
/*
@@ -2141,6 +2142,187 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}
}
+#ifdef CONFIG_RPAL
+
+void rpal_resume_ep(struct task_struct *tsk)
+{
+ struct rpal_receiver_data *rrd = tsk->rpal_rd;
+ struct eventpoll *ep = (struct eventpoll *)rrd->ep;
+ struct rpal_receiver_call_context *rcc = rrd->rcc;
+
+ if (rcc->timeout > 0) {
+ hrtimer_cancel(&rrd->ep_sleeper.timer);
+ destroy_hrtimer_on_stack(&rrd->ep_sleeper.timer);
+ }
+ if (!list_empty_careful(&rrd->ep_wait.entry)) {
+ write_lock(&ep->lock);
+ __remove_wait_queue(&ep->wq, &rrd->ep_wait);
+ write_unlock(&ep->lock);
+ }
+}
+
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc)
+{
+ int eavail;
+ int res = 0;
+
+ res = ep_send_events(ep, rcc->events, rcc->maxevents);
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
+
+ eavail = ep_events_available(ep);
+ if (!eavail) {
+ atomic_and(~RPAL_KERNEL_PENDING, &rcc->ep_pending);
+ /* check again to avoid data race on RPAL_KERNEL_PENDING */
+ eavail = ep_events_available(ep);
+ if (eavail)
+ atomic_or(RPAL_KERNEL_PENDING, &rcc->ep_pending);
+ }
+ return res;
+}
+
+static int rpal_schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
+ const enum hrtimer_mode mode,
+ clockid_t clock_id)
+{
+ struct hrtimer_sleeper *t = ¤t->rpal_rd->ep_sleeper;
+
+ /*
+ * Optimize when a zero timeout value is given. It does not
+ * matter whether this is an absolute or a relative time.
+ */
+ if (expires && *expires == 0) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ /*
+ * A NULL parameter means "infinite"
+ */
+ if (!expires) {
+ schedule();
+ return -EINTR;
+ }
+
+ hrtimer_setup_sleeper_on_stack(t, clock_id, mode);
+ hrtimer_set_expires_range_ns(&t->timer, *expires, delta);
+ hrtimer_sleeper_start_expires(t, mode);
+
+ if (likely(t->task))
+ schedule();
+
+ hrtimer_cancel(&t->timer);
+ destroy_hrtimer_on_stack(&t->timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ return !t->task ? 0 : -EINTR;
+}
+
+static int rpal_ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
+ int maxevents, struct timespec64 *timeout)
+{
+ int res = 0, eavail, timed_out = 0;
+ u64 slack = 0;
+ struct rpal_receiver_data *rrd = current->rpal_rd;
+ wait_queue_entry_t *wait = &rrd->ep_wait;
+ ktime_t expires, *to = NULL;
+
+ rrd->ep = ep;
+
+ lockdep_assert_irqs_enabled();
+
+ if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+ slack = select_estimate_accuracy(timeout);
+ to = &expires;
+ *to = timespec64_to_ktime(*timeout);
+ } else if (timeout) {
+ timed_out = 1;
+ }
+
+ eavail = ep_events_available(ep);
+
+ while (1) {
+ if (eavail) {
+ res = rpal_try_send_events(ep, rrd->rcc);
+ if (res) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return res;
+ }
+ }
+
+ if (timed_out) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return 0;
+ }
+
+ eavail = ep_busy_loop(ep);
+ if (eavail)
+ continue;
+
+ if (signal_pending(current)) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return -EINTR;
+ }
+
+ init_wait(wait);
+ wait->func = rpal_ep_autoremove_wake_function;
+ wait->private = rrd;
+ write_lock_irq(&ep->lock);
+
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ eavail = ep_events_available(ep);
+ if (!eavail)
+ __add_wait_queue_exclusive(&ep->wq, wait);
+
+ write_unlock_irq(&ep->lock);
+
+ if (!eavail && ep_schedule_timeout(to)) {
+ if (RPAL_USER_PENDING & atomic_read(&rrd->rcc->ep_pending)) {
+ timed_out = 1;
+ } else {
+ timed_out =
+ !rpal_schedule_hrtimeout_range_clock(
+ to, slack, HRTIMER_MODE_ABS,
+ CLOCK_MONOTONIC);
+ }
+ }
+ atomic_cmpxchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY,
+ RPAL_RECEIVER_STATE_RUNNING);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * We were woken up, thus go and try to harvest some events.
+ * If timed out and still on the wait queue, recheck eavail
+ * carefully under lock, below.
+ */
+ eavail = 1;
+
+ if (!list_empty_careful(&wait->entry)) {
+ write_lock_irq(&ep->lock);
+ /*
+		 * If the thread timed out but its entry has already been
+		 * removed from the wait queue, a wakeup raced with the
+		 * timeout expiry before the lock could be reacquired.
+		 * Thus, when wait.entry is empty, events may be pending
+		 * and must be harvested.
+ */
+ if (timed_out)
+ eavail = list_empty(&wait->entry);
+ __remove_wait_queue(&ep->wq, wait);
+ write_unlock_irq(&ep->lock);
+ }
+ }
+}
+#endif
+
/**
* ep_loop_check_proc - verify that adding an epoll file inside another
* epoll structure does not violate the constraints, in
@@ -2529,7 +2711,25 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
ep = fd_file(f)->private_data;
/* Time to fish for events ... */
+#ifdef CONFIG_RPAL
+ /*
+	 * For an RPAL task, if it is a receiver and has set the MAGIC value
+	 * in shared memory, we consider it prepared for RPAL calls and
+	 * therefore handle it differently.
+	 *
+	 * In all other cases, an RPAL task behaves like a normal task.
+ */
+ if (rpal_current_service() &&
+ rpal_test_current_thread_flag(RPAL_RECEIVER_BIT) &&
+ current->rpal_rd->rcc->rpal_ep_poll_magic == RPAL_EP_POLL_MAGIC) {
+ current->rpal_rd->f = f;
+ return rpal_ep_poll(ep, events, maxevents, to);
+ } else {
+ return ep_poll(ep, events, maxevents, to);
+ }
+#else
return ep_poll(ep, events, maxevents, to);
+#endif
}
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index f2474cb53abe..5912ffec6e28 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -16,6 +16,8 @@
#include <linux/hashtable.h>
#include <linux/atomic.h>
#include <linux/sizes.h>
+#include <linux/file.h>
+#include <linux/hrtimer.h>
#define RPAL_ERROR_MSG "rpal error: "
#define rpal_err(x...) pr_err(RPAL_ERROR_MSG x)
@@ -89,6 +91,7 @@ enum {
};
#define RPAL_ERROR_MAGIC 0x98CC98CC
+#define RPAL_EP_POLL_MAGIC 0xCC98CC98
#define RPAL_SID_SHIFT 24
#define RPAL_ID_SHIFT 8
@@ -103,6 +106,9 @@ enum {
#define RPAL_PKRU_UNION 1
#define RPAL_PKRU_INTERSECT 2
+#define RPAL_KERNEL_PENDING 0x1
+#define RPAL_USER_PENDING 0x2
+
extern unsigned long rpal_cap;
enum rpal_task_flag_bits {
@@ -282,6 +288,12 @@ struct rpal_receiver_call_context {
int receiver_id;
atomic_t receiver_state;
atomic_t sender_state;
+ atomic_t ep_pending;
+ int rpal_ep_poll_magic;
+ int epfd;
+ void __user *events;
+ int maxevents;
+ int timeout;
};
/* recovery point for sender */
@@ -325,6 +337,10 @@ struct rpal_receiver_data {
struct rpal_shared_page *rsp;
struct rpal_receiver_call_context *rcc;
struct task_struct *sender;
+ void *ep;
+ struct fd f;
+ struct hrtimer_sleeper ep_sleeper;
+ wait_queue_entry_t ep_wait;
};
struct rpal_sender_data {
@@ -574,4 +590,9 @@ __rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey);
void rpal_set_pku_schedule_tail(struct task_struct *prev);
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+ unsigned int mode, int wake_flags,
+ void *key);
+void rpal_resume_ep(struct task_struct *tsk);
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb5d5bd51597..486d59bdd3fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6794,6 +6794,23 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#define SM_RTLOCK_WAIT 2
#ifdef CONFIG_RPAL
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+ unsigned int mode, int wake_flags,
+ void *key)
+{
+ struct rpal_receiver_data *rrd = curr->private;
+ struct task_struct *tsk = rrd->rcd.bp_task;
+ int ret;
+
+ ret = try_to_wake_up(tsk, mode, wake_flags);
+
+ list_del_init_careful(&curr->entry);
+ if (!ret)
+ atomic_or(RPAL_KERNEL_PENDING, &rrd->rcc->ep_pending);
+
+ return 1;
+}
+
static inline void rpal_check_ready_state(struct task_struct *tsk, int state)
{
if (rpal_test_task_thread_flag(tsk, RPAL_RECEIVER_BIT)) {
--
2.20.1
next prev parent reply other threads:[~2025-05-30 9:35 UTC|newest]
Thread overview: 46+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-05-30 9:27 [RFC v2 00/35] optimize cost of inter-process communication Bo Li
2025-05-30 9:27 ` [RFC v2 01/35] Kbuild: rpal support Bo Li
2025-05-30 9:27 ` [RFC v2 02/35] RPAL: add struct rpal_service Bo Li
2025-05-30 9:27 ` [RFC v2 03/35] RPAL: add service registration interface Bo Li
2025-05-30 9:27 ` [RFC v2 04/35] RPAL: add member to task_struct and mm_struct Bo Li
2025-05-30 9:27 ` [RFC v2 05/35] RPAL: enable virtual address space partitions Bo Li
2025-05-30 9:27 ` [RFC v2 06/35] RPAL: add user interface Bo Li
2025-05-30 9:27 ` [RFC v2 07/35] RPAL: enable shared page mmap Bo Li
2025-05-30 9:27 ` [RFC v2 08/35] RPAL: enable sender/receiver registration Bo Li
2025-05-30 9:27 ` [RFC v2 09/35] RPAL: enable address space sharing Bo Li
2025-05-30 9:27 ` [RFC v2 10/35] RPAL: allow service enable/disable Bo Li
2025-05-30 9:27 ` [RFC v2 11/35] RPAL: add service request/release Bo Li
2025-05-30 9:27 ` [RFC v2 12/35] RPAL: enable service disable notification Bo Li
2025-05-30 9:27 ` [RFC v2 13/35] RPAL: add tlb flushing support Bo Li
2025-05-30 9:27 ` [RFC v2 14/35] RPAL: enable page fault handling Bo Li
2025-05-30 13:59 ` Dave Hansen
2025-05-30 9:27 ` [RFC v2 15/35] RPAL: add sender/receiver state Bo Li
2025-05-30 9:27 ` [RFC v2 16/35] RPAL: add cpu lock interface Bo Li
2025-05-30 9:27 ` [RFC v2 17/35] RPAL: add a mapping between fsbase and tasks Bo Li
2025-05-30 9:27 ` [RFC v2 18/35] sched: pick a specified task Bo Li
2025-05-30 9:27 ` [RFC v2 19/35] RPAL: add lazy switch main logic Bo Li
2025-05-30 9:27 ` [RFC v2 20/35] RPAL: add rpal_ret_from_lazy_switch Bo Li
2025-05-30 9:27 ` [RFC v2 21/35] RPAL: add kernel entry handling for lazy switch Bo Li
2025-05-30 9:27 ` [RFC v2 22/35] RPAL: rebuild receiver state Bo Li
2025-05-30 9:27 ` [RFC v2 23/35] RPAL: resume cpumask when fork Bo Li
2025-05-30 9:27 ` [RFC v2 24/35] RPAL: critical section optimization Bo Li
2025-05-30 9:27 ` [RFC v2 25/35] RPAL: add MPK initialization and interface Bo Li
2025-05-30 9:27 ` [RFC v2 26/35] RPAL: enable MPK support Bo Li
2025-05-30 17:03 ` Dave Hansen
2025-05-30 9:27 ` Bo Li [this message]
2025-05-30 9:27 ` [RFC v2 28/35] RPAL: add rpal_uds_fdmap() support Bo Li
2025-05-30 9:27 ` [RFC v2 29/35] RPAL: fix race condition in pkru update Bo Li
2025-05-30 9:27 ` [RFC v2 30/35] RPAL: fix pkru setup when fork Bo Li
2025-05-30 9:27 ` [RFC v2 31/35] RPAL: add receiver waker Bo Li
2025-05-30 9:28 ` [RFC v2 32/35] RPAL: fix unknown nmi on AMD CPU Bo Li
2025-05-30 9:28 ` [RFC v2 33/35] RPAL: enable time slice correction Bo Li
2025-05-30 9:28 ` [RFC v2 34/35] RPAL: enable fast epoll wait Bo Li
2025-05-30 9:28 ` [RFC v2 35/35] samples/rpal: add RPAL samples Bo Li
2025-05-30 9:33 ` [RFC v2 00/35] optimize cost of inter-process communication Lorenzo Stoakes
2025-06-03 8:22 ` Bo Li
2025-06-03 9:22 ` Lorenzo Stoakes
2025-05-30 9:41 ` Pedro Falcato
2025-05-30 9:56 ` David Hildenbrand
2025-05-30 22:42 ` Andrew Morton
2025-05-31 7:16 ` Ingo Molnar
2025-06-03 17:49 ` H. Peter Anvin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=7eb30a577e2c6a4f582515357aea25260105eb18.1748594841.git.libo.gcs85@bytedance.com \
--to=libo.gcs85@bytedance.com \
--cc=Liam.Howlett@oracle.com \
--cc=acme@kernel.org \
--cc=adrian.hunter@intel.com \
--cc=akpm@linux-foundation.org \
--cc=alexander.shishkin@linux.intel.com \
--cc=bp@alien8.de \
--cc=brauner@kernel.org \
--cc=bsegall@google.com \
--cc=chaiwen.cc@bytedance.com \
--cc=chengguozhu@bytedance.com \
--cc=dave.hansen@linux.intel.com \
--cc=david@redhat.com \
--cc=dengliang.1214@bytedance.com \
--cc=dietmar.eggemann@arm.com \
--cc=duanxiongchun@bytedance.com \
--cc=harry.yoo@oracle.com \
--cc=hpa@zytor.com \
--cc=irogers@google.com \
--cc=jack@suse.cz \
--cc=jannh@google.com \
--cc=jolsa@kernel.org \
--cc=juri.lelli@redhat.com \
--cc=kan.liang@linux.intel.com \
--cc=kees@kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-perf-users@vger.kernel.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=luto@kernel.org \
--cc=mark.rutland@arm.com \
--cc=mgorman@suse.de \
--cc=mhocko@suse.com \
--cc=mingo@redhat.com \
--cc=namhyung@kernel.org \
--cc=peterz@infradead.org \
--cc=pfalcato@suse.de \
--cc=riel@surriel.com \
--cc=rostedt@goodmis.org \
--cc=rppt@kernel.org \
--cc=songmuchun@bytedance.com \
--cc=sunjiadong.lff@bytedance.com \
--cc=surenb@google.com \
--cc=tglx@linutronix.de \
--cc=vbabka@suse.cz \
--cc=vincent.guittot@linaro.org \
--cc=viro@zeniv.linux.org.uk \
--cc=vschneid@redhat.com \
--cc=x86@kernel.org \
--cc=xieyongji@bytedance.com \
--cc=yinhongbo@bytedance.com \
--cc=yuanzhu@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox