linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Bo Li <libo.gcs85@bytedance.com>
To: tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
	dave.hansen@linux.intel.com, x86@kernel.org, luto@kernel.org,
	kees@kernel.org, akpm@linux-foundation.org, david@redhat.com,
	juri.lelli@redhat.com, vincent.guittot@linaro.org,
	peterz@infradead.org
Cc: dietmar.eggemann@arm.com, hpa@zytor.com, acme@kernel.org,
	namhyung@kernel.org, mark.rutland@arm.com,
	alexander.shishkin@linux.intel.com, jolsa@kernel.org,
	irogers@google.com, adrian.hunter@intel.com,
	kan.liang@linux.intel.com, viro@zeniv.linux.org.uk,
	brauner@kernel.org, jack@suse.cz, lorenzo.stoakes@oracle.com,
	Liam.Howlett@oracle.com, vbabka@suse.cz, rppt@kernel.org,
	surenb@google.com, mhocko@suse.com, rostedt@goodmis.org,
	bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
	jannh@google.com, pfalcato@suse.de, riel@surriel.com,
	harry.yoo@oracle.com, linux-kernel@vger.kernel.org,
	linux-perf-users@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, duanxiongchun@bytedance.com,
	yinhongbo@bytedance.com, dengliang.1214@bytedance.com,
	xieyongji@bytedance.com, chaiwen.cc@bytedance.com,
	songmuchun@bytedance.com, yuanzhu@bytedance.com,
	chengguozhu@bytedance.com, sunjiadong.lff@bytedance.com,
	Bo Li <libo.gcs85@bytedance.com>
Subject: [RFC v2 33/35] RPAL: enable time slice correction
Date: Fri, 30 May 2025 17:28:01 +0800	[thread overview]
Message-ID: <8941a17e12edce00c1cc1c78f4dd3e1bf28e47c0.1748594841.git.libo.gcs85@bytedance.com> (raw)
In-Reply-To: <cover.1748594840.git.libo.gcs85@bytedance.com>

After an RPAL call, the receiver's user mode code executes. However, the
kernel incorrectly attributes this CPU time to the sender due to the
unchanged kernel context. This results in incorrect runtime statistics.

This patch adds a new member, total_time, to both
rpal_sender_call_context and rpal_receiver_call_context. This member
tracks how much runtime (measured in TSC cycles via rdtsc_ordered()) has
been incorrectly accounted for. The kernel reads total_time at the entry
of __schedule() and corrects the delta in the update_rq_clock_task()
function.

Additionally, since RPAL calls occur in user space, runtime statistics are
typically calculated by user space. However, when a lazy switch happens,
the kernel takes over. To address this, the patch introduces a start_time
member to record when an RPAL call is initiated, enabling the kernel to
accurately calculate the runtime that needs correction.

Signed-off-by: Bo Li <libo.gcs85@bytedance.com>
---
 arch/x86/rpal/core.c   |  8 ++++++++
 arch/x86/rpal/thread.c |  6 ++++++
 include/linux/rpal.h   |  3 +++
 include/linux/sched.h  |  1 +
 init/init_task.c       |  1 +
 kernel/fork.c          |  1 +
 kernel/sched/core.c    | 42 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 62 insertions(+)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 92281b557a6c..2ac5d932f69c 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -144,6 +144,13 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
 	struct task_struct *prev = current;
 
 	if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+		struct rpal_receiver_call_context *rcc = next->rpal_rd->rcc;
+		struct rpal_sender_call_context *scc = current->rpal_sd->scc;
+		u64 slice = rdtsc_ordered() - scc->start_time;
+
+		rcc->total_time += slice;
+		scc->total_time += slice;
+
 		rpal_resume_ep(next);
 		current->rpal_sd->receiver = next;
 		rpal_lock_cpu(current);
@@ -169,6 +176,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
 		rpal_schedule(next);
 		rpal_clear_task_thread_flag(prev, RPAL_LAZY_SWITCHED_BIT);
 		prev->rpal_rd->sender = NULL;
+		next->rpal_sd->scc->start_time = rdtsc_ordered();
 	}
 	if (unlikely(!irqs_disabled())) {
 		local_irq_disable();
diff --git a/arch/x86/rpal/thread.c b/arch/x86/rpal/thread.c
index 51c9eec639cb..5cd0be631521 100644
--- a/arch/x86/rpal/thread.c
+++ b/arch/x86/rpal/thread.c
@@ -99,6 +99,8 @@ int rpal_register_sender(unsigned long addr)
 	rsd->scc = (struct rpal_sender_call_context *)(addr - rsp->user_start +
 						       rsp->kernel_start);
 	rsd->receiver = NULL;
+	rsd->scc->start_time = 0;
+	rsd->scc->total_time = 0;
 
 	current->rpal_sd = rsd;
 	rpal_set_current_thread_flag(RPAL_SENDER_BIT);
@@ -182,6 +184,7 @@ int rpal_register_receiver(unsigned long addr)
 		(struct rpal_receiver_call_context *)(addr - rsp->user_start +
 						      rsp->kernel_start);
 	rrd->sender = NULL;
+	rrd->rcc->total_time = 0;
 
 	current->rpal_rd = rrd;
 	rpal_set_current_thread_flag(RPAL_RECEIVER_BIT);
@@ -289,6 +292,9 @@ int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
 				rpal_pkey_to_pkru(rpal_current_service()->pkey),
 				RPAL_PKRU_SET);
 #endif
+			if (!rpal_is_correct_address(rpal_current_service(), regs->ip))
+				/* receiver has crashed */
+				scc->total_time += rdtsc_ordered() - scc->start_time;
 			return 0;
 		}
 	}
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 1d8c1bdc90f2..f5f4da63f28c 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -310,6 +310,7 @@ struct rpal_receiver_call_context {
 	void __user *events;
 	int maxevents;
 	int timeout;
+	int64_t total_time;
 };
 
 /* recovery point for sender */
@@ -325,6 +326,8 @@ struct rpal_sender_call_context {
 	struct rpal_task_context rtc;
 	struct rpal_error_context ec;
 	int sender_id;
+	s64 start_time;
+	s64 total_time;
 };
 
 /* End */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5f25cc09fb71..a03113fecdc5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1663,6 +1663,7 @@ struct task_struct {
 		struct rpal_sender_data *rpal_sd;
 		struct rpal_receiver_data *rpal_rd;
 	};
+	s64 rpal_steal_time;
 #endif
 
 	/* CPU-specific state of this task: */
diff --git a/init/init_task.c b/init/init_task.c
index 2eb08b96e66b..3606cf701dfe 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -224,6 +224,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 	.rpal_rs = NULL,
 	.rpal_flag = 0,
 	.rpal_cd = NULL,
+	.rpal_steal_time = 0,
 #endif
 };
 EXPORT_SYMBOL(init_task);
diff --git a/kernel/fork.c b/kernel/fork.c
index 11cba74d07c8..ff6331a28987 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1222,6 +1222,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->rpal_rs = NULL;
 	tsk->rpal_flag = 0;
 	tsk->rpal_cd = NULL;
+	tsk->rpal_steal_time = 0;
 #endif
 	return tsk;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c219ada29d34..d6f8e0d76fc0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -789,6 +789,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 		delta -= steal;
 	}
 #endif
+#ifdef CONFIG_RPAL
+	if (unlikely(current->rpal_steal_time != 0)) {
+		delta += current->rpal_steal_time;
+		if (unlikely(delta < 0))
+			delta = 0;
+		current->rpal_steal_time = 0;
+	}
+#endif
 
 	rq->clock_task += delta;
 
@@ -6872,6 +6880,36 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
 	return true;
 }
 
+#ifdef CONFIG_RPAL
+static void rpal_acct_runtime(void)
+{
+	if (rpal_current_service()) {
+		if (rpal_test_task_thread_flag(current, RPAL_SENDER_BIT) &&
+		    current->rpal_sd->scc->total_time != 0) {
+			struct rpal_sender_call_context *scc =
+				current->rpal_sd->scc;
+
+			u64 slice =
+				native_sched_clock_from_tsc(scc->total_time) -
+				native_sched_clock_from_tsc(0);
+			current->rpal_steal_time -= slice;
+			scc->total_time = 0;
+		} else if (rpal_test_task_thread_flag(current,
+						      RPAL_RECEIVER_BIT) &&
+			   current->rpal_rd->rcc->total_time != 0) {
+			struct rpal_receiver_call_context *rcc =
+				current->rpal_rd->rcc;
+
+			u64 slice =
+				native_sched_clock_from_tsc(rcc->total_time) -
+				native_sched_clock_from_tsc(0);
+			current->rpal_steal_time += slice;
+			rcc->total_time = 0;
+		}
+	}
+}
+#endif
+
 /*
  * __schedule() is the main scheduler function.
  *
@@ -6926,6 +6964,10 @@ static void __sched notrace __schedule(int sched_mode)
 	struct rq *rq;
 	int cpu;
 
+#ifdef CONFIG_RPAL
+	rpal_acct_runtime();
+#endif
+
 	trace_sched_entry_tp(preempt, CALLER_ADDR0);
 
 	cpu = smp_processor_id();
-- 
2.20.1



  parent reply	other threads:[~2025-05-30  9:36 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-30  9:27 [RFC v2 00/35] optimize cost of inter-process communication Bo Li
2025-05-30  9:27 ` [RFC v2 01/35] Kbuild: rpal support Bo Li
2025-05-30  9:27 ` [RFC v2 02/35] RPAL: add struct rpal_service Bo Li
2025-05-30  9:27 ` [RFC v2 03/35] RPAL: add service registration interface Bo Li
2025-05-30  9:27 ` [RFC v2 04/35] RPAL: add member to task_struct and mm_struct Bo Li
2025-05-30  9:27 ` [RFC v2 05/35] RPAL: enable virtual address space partitions Bo Li
2025-05-30  9:27 ` [RFC v2 06/35] RPAL: add user interface Bo Li
2025-05-30  9:27 ` [RFC v2 07/35] RPAL: enable shared page mmap Bo Li
2025-05-30  9:27 ` [RFC v2 08/35] RPAL: enable sender/receiver registration Bo Li
2025-05-30  9:27 ` [RFC v2 09/35] RPAL: enable address space sharing Bo Li
2025-05-30  9:27 ` [RFC v2 10/35] RPAL: allow service enable/disable Bo Li
2025-05-30  9:27 ` [RFC v2 11/35] RPAL: add service request/release Bo Li
2025-05-30  9:27 ` [RFC v2 12/35] RPAL: enable service disable notification Bo Li
2025-05-30  9:27 ` [RFC v2 13/35] RPAL: add tlb flushing support Bo Li
2025-05-30  9:27 ` [RFC v2 14/35] RPAL: enable page fault handling Bo Li
2025-05-30 13:59   ` Dave Hansen
2025-05-30  9:27 ` [RFC v2 15/35] RPAL: add sender/receiver state Bo Li
2025-05-30  9:27 ` [RFC v2 16/35] RPAL: add cpu lock interface Bo Li
2025-05-30  9:27 ` [RFC v2 17/35] RPAL: add a mapping between fsbase and tasks Bo Li
2025-05-30  9:27 ` [RFC v2 18/35] sched: pick a specified task Bo Li
2025-05-30  9:27 ` [RFC v2 19/35] RPAL: add lazy switch main logic Bo Li
2025-05-30  9:27 ` [RFC v2 20/35] RPAL: add rpal_ret_from_lazy_switch Bo Li
2025-05-30  9:27 ` [RFC v2 21/35] RPAL: add kernel entry handling for lazy switch Bo Li
2025-05-30  9:27 ` [RFC v2 22/35] RPAL: rebuild receiver state Bo Li
2025-05-30  9:27 ` [RFC v2 23/35] RPAL: resume cpumask when fork Bo Li
2025-05-30  9:27 ` [RFC v2 24/35] RPAL: critical section optimization Bo Li
2025-05-30  9:27 ` [RFC v2 25/35] RPAL: add MPK initialization and interface Bo Li
2025-05-30  9:27 ` [RFC v2 26/35] RPAL: enable MPK support Bo Li
2025-05-30 17:03   ` Dave Hansen
2025-05-30  9:27 ` [RFC v2 27/35] RPAL: add epoll support Bo Li
2025-05-30  9:27 ` [RFC v2 28/35] RPAL: add rpal_uds_fdmap() support Bo Li
2025-05-30  9:27 ` [RFC v2 29/35] RPAL: fix race condition in pkru update Bo Li
2025-05-30  9:27 ` [RFC v2 30/35] RPAL: fix pkru setup when fork Bo Li
2025-05-30  9:27 ` [RFC v2 31/35] RPAL: add receiver waker Bo Li
2025-05-30  9:28 ` [RFC v2 32/35] RPAL: fix unknown nmi on AMD CPU Bo Li
2025-05-30  9:28 ` Bo Li [this message]
2025-05-30  9:28 ` [RFC v2 34/35] RPAL: enable fast epoll wait Bo Li
2025-05-30  9:28 ` [RFC v2 35/35] samples/rpal: add RPAL samples Bo Li
2025-05-30  9:33 ` [RFC v2 00/35] optimize cost of inter-process communication Lorenzo Stoakes
2025-06-03  8:22   ` Bo Li
2025-06-03  9:22     ` Lorenzo Stoakes
2025-05-30  9:41 ` Pedro Falcato
2025-05-30  9:56 ` David Hildenbrand
2025-05-30 22:42 ` Andrew Morton
2025-05-31  7:16 ` Ingo Molnar
2025-06-03 17:49 ` H. Peter Anvin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8941a17e12edce00c1cc1c78f4dd3e1bf28e47c0.1748594841.git.libo.gcs85@bytedance.com \
    --to=libo.gcs85@bytedance.com \
    --cc=Liam.Howlett@oracle.com \
    --cc=acme@kernel.org \
    --cc=adrian.hunter@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=bp@alien8.de \
    --cc=brauner@kernel.org \
    --cc=bsegall@google.com \
    --cc=chaiwen.cc@bytedance.com \
    --cc=chengguozhu@bytedance.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@redhat.com \
    --cc=dengliang.1214@bytedance.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=duanxiongchun@bytedance.com \
    --cc=harry.yoo@oracle.com \
    --cc=hpa@zytor.com \
    --cc=irogers@google.com \
    --cc=jack@suse.cz \
    --cc=jannh@google.com \
    --cc=jolsa@kernel.org \
    --cc=juri.lelli@redhat.com \
    --cc=kan.liang@linux.intel.com \
    --cc=kees@kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=luto@kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mgorman@suse.de \
    --cc=mhocko@suse.com \
    --cc=mingo@redhat.com \
    --cc=namhyung@kernel.org \
    --cc=peterz@infradead.org \
    --cc=pfalcato@suse.de \
    --cc=riel@surriel.com \
    --cc=rostedt@goodmis.org \
    --cc=rppt@kernel.org \
    --cc=songmuchun@bytedance.com \
    --cc=sunjiadong.lff@bytedance.com \
    --cc=surenb@google.com \
    --cc=tglx@linutronix.de \
    --cc=vbabka@suse.cz \
    --cc=vincent.guittot@linaro.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=vschneid@redhat.com \
    --cc=x86@kernel.org \
    --cc=xieyongji@bytedance.com \
    --cc=yinhongbo@bytedance.com \
    --cc=yuanzhu@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox