From: Bo Li <libo.gcs85@bytedance.com>
To: tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
dave.hansen@linux.intel.com, x86@kernel.org, luto@kernel.org,
kees@kernel.org, akpm@linux-foundation.org, david@redhat.com,
juri.lelli@redhat.com, vincent.guittot@linaro.org,
peterz@infradead.org
Cc: dietmar.eggemann@arm.com, hpa@zytor.com, acme@kernel.org,
namhyung@kernel.org, mark.rutland@arm.com,
alexander.shishkin@linux.intel.com, jolsa@kernel.org,
irogers@google.com, adrian.hunter@intel.com,
kan.liang@linux.intel.com, viro@zeniv.linux.org.uk,
brauner@kernel.org, jack@suse.cz, lorenzo.stoakes@oracle.com,
Liam.Howlett@oracle.com, vbabka@suse.cz, rppt@kernel.org,
surenb@google.com, mhocko@suse.com, rostedt@goodmis.org,
bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
jannh@google.com, pfalcato@suse.de, riel@surriel.com,
harry.yoo@oracle.com, linux-kernel@vger.kernel.org,
linux-perf-users@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-mm@kvack.org, duanxiongchun@bytedance.com,
yinhongbo@bytedance.com, dengliang.1214@bytedance.com,
xieyongji@bytedance.com, chaiwen.cc@bytedance.com,
songmuchun@bytedance.com, yuanzhu@bytedance.com,
chengguozhu@bytedance.com, sunjiadong.lff@bytedance.com,
Bo Li <libo.gcs85@bytedance.com>
Subject: [RFC v2 16/35] RPAL: add cpu lock interface
Date: Fri, 30 May 2025 17:27:44 +0800
Message-ID: <8ff6cea94a6438a0856c86a11d56be462314b1f8.1748594841.git.libo.gcs85@bytedance.com>
In-Reply-To: <cover.1748594840.git.libo.gcs85@bytedance.com>

Lazy switch lets the kernel switch from one task to another so that the
kernel context stays matched with the user context. From the scheduler's
point of view, both tasks involved in such a context switch must reside
on the same run queue (rq). Therefore, before a lazy switch can take
place, the kernel must first bind both tasks to the same CPU.
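
In scheduler terms, the invariant can be sketched as follows (an
illustration only, using the existing task_rq() helper from
kernel/sched/sched.h; prev/next are placeholders for the two tasks of a
lazy switch):

	/* Both tasks must sit on one CPU's run queue before switching: */
	WARN_ON_ONCE(task_rq(prev) != task_rq(next));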
This patch introduces the rpal_lock_cpu() interface, which binds a task
to the caller's CPU while bypassing cpumask restrictions, so that both
tasks of a lazy switch end up on the same CPU. rpal_unlock_cpu() is the
inverse operation and releases the binding. To keep this consistent, the
kernel must prevent other threads from changing the CPU affinity of a
task locked by rpal_lock_cpu(): a thread that calls
set_cpus_allowed_ptr() on such a task now waits until the
rpal_lock_cpu() binding is released before its modification proceeds.
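
A minimal usage sketch (a hypothetical call site; the actual lazy switch
path that uses these helpers is introduced in a later patch of this
series):

	/* Caller runs with IRQs disabled throughout the lazy switch. */
	rpal_lock_cpu(next);	/* pin @next to this CPU, saving its cpumask */
	/* ... lazy switch between current and @next ... */
	rpal_unlock_cpu(next);	/* restore @next's original cpumask */
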
Signed-off-by: Bo Li <libo.gcs85@bytedance.com>
---
arch/x86/rpal/core.c | 18 +++++++
arch/x86/rpal/thread.c | 14 ++++++
include/linux/rpal.h | 8 +++
kernel/sched/core.c | 109 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 149 insertions(+)
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 61f5d40b0157..c185a453c1b2 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -15,6 +15,24 @@ int __init rpal_init(void);
bool rpal_inited;
unsigned long rpal_cap;
+static inline void rpal_lock_cpu(struct task_struct *tsk)
+{
+ rpal_set_cpus_allowed_ptr(tsk, true);
+ if (unlikely(!irqs_disabled())) {
+ local_irq_disable();
+ rpal_err("%s: irq is enabled\n", __func__);
+ }
+}
+
+static inline void rpal_unlock_cpu(struct task_struct *tsk)
+{
+ rpal_set_cpus_allowed_ptr(tsk, false);
+ if (unlikely(!irqs_disabled())) {
+ local_irq_disable();
+ rpal_err("%s: irq is enabled\n", __func__);
+ }
+}
+
int __init rpal_init(void)
{
int ret = 0;
diff --git a/arch/x86/rpal/thread.c b/arch/x86/rpal/thread.c
index e50a4c865ff8..bc203e9c6e5e 100644
--- a/arch/x86/rpal/thread.c
+++ b/arch/x86/rpal/thread.c
@@ -47,6 +47,10 @@ int rpal_register_sender(unsigned long addr)
}
rpal_common_data_init(&rsd->rcd);
+ if (rpal_init_thread_pending(&rsd->rcd)) {
+ ret = -ENOMEM;
+ goto free_rsd;
+ }
rsd->rsp = rsp;
rsd->scc = (struct rpal_sender_call_context *)(addr - rsp->user_start +
rsp->kernel_start);
@@ -58,6 +62,8 @@ int rpal_register_sender(unsigned long addr)
return 0;
+free_rsd:
+ kfree(rsd);
put_shared_page:
rpal_put_shared_page(rsp);
out:
@@ -77,6 +83,7 @@ int rpal_unregister_sender(void)
rpal_put_shared_page(rsd->rsp);
rpal_clear_current_thread_flag(RPAL_SENDER_BIT);
+ rpal_free_thread_pending(&rsd->rcd);
kfree(rsd);
atomic_dec(&cur->thread_cnt);
@@ -116,6 +123,10 @@ int rpal_register_receiver(unsigned long addr)
}
rpal_common_data_init(&rrd->rcd);
+ if (rpal_init_thread_pending(&rrd->rcd)) {
+ ret = -ENOMEM;
+ goto free_rrd;
+ }
rrd->rsp = rsp;
rrd->rcc =
(struct rpal_receiver_call_context *)(addr - rsp->user_start +
@@ -128,6 +139,8 @@ int rpal_register_receiver(unsigned long addr)
return 0;
+free_rrd:
+ kfree(rrd);
put_shared_page:
rpal_put_shared_page(rsp);
out:
@@ -147,6 +160,7 @@ int rpal_unregister_receiver(void)
rpal_put_shared_page(rrd->rsp);
rpal_clear_current_thread_flag(RPAL_RECEIVER_BIT);
+ rpal_free_thread_pending(&rrd->rcd);
kfree(rrd);
atomic_dec(&cur->thread_cnt);
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 4f4719bb7eae..5b115be14a55 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -99,6 +99,7 @@ extern unsigned long rpal_cap;
enum rpal_task_flag_bits {
RPAL_SENDER_BIT,
RPAL_RECEIVER_BIT,
+ RPAL_CPU_LOCKED_BIT,
};
enum rpal_receiver_state {
@@ -270,8 +271,12 @@ struct rpal_shared_page {
struct rpal_common_data {
/* back pointer to task_struct */
struct task_struct *bp_task;
+ /* pending struct for cpu locking */
+ void *pending;
/* service id of rpal_service */
int service_id;
+ /* cpumask before locked */
+ cpumask_t old_mask;
};
struct rpal_receiver_data {
@@ -464,4 +469,7 @@ struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);
extern void rpal_pick_mmap_base(struct mm_struct *mm,
struct rlimit *rlim_stack);
int rpal_try_to_wake_up(struct task_struct *p);
+int rpal_init_thread_pending(struct rpal_common_data *rcd);
+void rpal_free_thread_pending(struct rpal_common_data *rcd);
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 045e92ee2e3b..a862bf4a0161 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3155,6 +3155,104 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
return ret;
}
+#ifdef CONFIG_RPAL
+int rpal_init_thread_pending(struct rpal_common_data *rcd)
+{
+ struct set_affinity_pending *pending;
+
+ pending = kzalloc(sizeof(*pending), GFP_KERNEL);
+ if (!pending)
+ return -ENOMEM;
+ pending->stop_pending = 0;
+ pending->arg = (struct migration_arg){
+ .task = current,
+ .pending = NULL,
+ };
+ rcd->pending = pending;
+ return 0;
+}
+
+void rpal_free_thread_pending(struct rpal_common_data *rcd)
+{
+ if (rcd->pending != NULL)
+ kfree(rcd->pending);
+}
+
+/*
+ * The CPU lock is forced: RPAL temporarily ignores the task's cpumask.
+ */
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock)
+{
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ struct set_affinity_pending *pending = p->rpal_cd->pending;
+ struct cpumask mask;
+ unsigned int dest_cpu;
+ struct rq_flags rf;
+ struct rq *rq;
+ int ret = 0;
+ struct affinity_context ac = {
+ .new_mask = &mask,
+ .flags = 0,
+ };
+
+ if (unlikely(p->flags & PF_KTHREAD))
+ rpal_err("p: %d, p->flags & PF_KTHREAD\n", p->pid);
+
+ rq = task_rq_lock(p, &rf);
+
+ if (is_lock) {
+ cpumask_copy(&p->rpal_cd->old_mask, &p->cpus_mask);
+ cpumask_clear(&mask);
+ cpumask_set_cpu(smp_processor_id(), &mask);
+ rpal_set_task_thread_flag(p, RPAL_CPU_LOCKED_BIT);
+ } else {
+ cpumask_copy(&mask, &p->rpal_cd->old_mask);
+ rpal_clear_task_thread_flag(p, RPAL_CPU_LOCKED_BIT);
+ }
+
+ update_rq_clock(rq);
+
+ if (cpumask_equal(&p->cpus_mask, ac.new_mask))
+ goto out;
+ /*
+ * Picking a ~random cpu helps in cases where we are changing affinity
+ * for groups of tasks (ie. cpuset), so that load balancing is not
+ * immediately required to distribute the tasks within their new mask.
+ */
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ac.new_mask);
+ if (dest_cpu >= nr_cpu_ids) {
+ ret = -EINVAL;
+ goto out;
+ }
+ __do_set_cpus_allowed(p, &ac);
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ preempt_disable();
+ task_rq_unlock(rq, p, &rf);
+ preempt_enable();
+ } else {
+ pending->arg.dest_cpu = dest_cpu;
+
+ if (task_on_cpu(rq, p) ||
+ READ_ONCE(p->__state) == TASK_WAKING) {
+ preempt_disable();
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ } else {
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(rq, &rf, p, dest_cpu);
+ task_rq_unlock(rq, p, &rf);
+ }
+ }
+
+ return 0;
+
+out:
+ task_rq_unlock(rq, p, &rf);
+ return ret;
+}
+#endif
+
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
@@ -3169,7 +3267,18 @@ int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
struct rq_flags rf;
struct rq *rq;
+#ifdef CONFIG_RPAL
+retry:
+ rq = task_rq_lock(p, &rf);
+ if (rpal_test_task_thread_flag(p, RPAL_CPU_LOCKED_BIT)) {
+ update_rq_clock(rq);
+ task_rq_unlock(rq, p, &rf);
+ schedule();
+ goto retry;
+ }
+#else
rq = task_rq_lock(p, &rf);
+#endif
/*
* Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
* flags are set.
--
2.20.1