* [PATCH v3 2/4] rseq: Schedule the mm_cid_compaction from rseq_sched_switch_event()
       [not found] <20250929114225.36172-1-gmonaco@redhat.com>
@ 2025-09-29 11:42 ` Gabriele Monaco
  2025-09-29 11:42 ` [PATCH v3 3/4] sched: Compact RSEQ concurrency IDs in batches Gabriele Monaco
  1 sibling, 0 replies; 2+ messages in thread
From: Gabriele Monaco @ 2025-09-29 11:42 UTC (permalink / raw)
  To: linux-kernel, Mathieu Desnoyers, Peter Zijlstra, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, David Hildenbrand, linux-mm
  Cc: Gabriele Monaco

Currently the mm_cid_compaction is triggered by the scheduler tick and
runs in a task_work. This behaviour is unpredictable for periodic tasks
with a short runtime, which may rarely be running when the tick fires.

Schedule the mm_cid_compaction from the rseq_sched_switch_event() call
only if a scan is required, that is, when the pseudo-period of 100ms has
elapsed.

Keep a tick handler for long-running tasks that are never preempted
(i.e. that never call rseq_sched_switch_event()); it triggers a
compaction and mm_cid update only in that case.
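
The resulting flow, condensed from the hunks below (the existing rseq
event handling and the checks for kernel threads and exiting tasks are
omitted; the mm_cid refresh path of the tick handler is only hinted at
in a comment):

  static __always_inline void rseq_sched_switch_event(struct task_struct *t)
  {
          /* ... existing rseq event handling ... */

          /* Queue the compaction only when the 100ms pseudo-period elapsed. */
          if (mm_cid_needs_scan(t->mm))
                  task_add_mm_cid(t);
  }

  void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
  {
          u64 rtime = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;

          /* Fallback for tasks running unpreempted past the threshold. */
          if (rtime < RSEQ_UNPREEMPTED_THRESHOLD)
                  return;
          if (mm_cid_needs_scan(curr->mm))
                  task_add_mm_cid(curr);  /* A) trigger the recompaction */
          /* else: B) refresh curr->mm_cid and raise an rseq event (see hunk) */
  }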

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/mm_types.h | 11 +++++++++
 include/linux/rseq.h     |  3 +++
 include/linux/sched.h    |  3 +++
 kernel/sched/core.c      | 48 ++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h     |  2 ++
 5 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 08bc2442db93..5dab88707014 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1424,6 +1424,13 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
 	WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
 	raw_spin_unlock(&mm->cpus_allowed_lock);
 }
+
+static inline bool mm_cid_needs_scan(struct mm_struct *mm)
+{
+	if (!mm)
+		return false;
+	return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
+}
 #else /* CONFIG_SCHED_MM_CID */
 static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
 static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
@@ -1434,6 +1441,10 @@ static inline unsigned int mm_cid_size(void)
 	return 0;
 }
 static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
+static inline bool mm_cid_needs_scan(struct mm_struct *mm)
+{
+	return false;
+}
 #endif /* CONFIG_SCHED_MM_CID */
 
 struct mmu_gather;
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index b8ea95011ec3..12eecde46ff5 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -4,6 +4,7 @@
 
 #ifdef CONFIG_RSEQ
 #include <linux/sched.h>
+#include <linux/mm_types.h>
 
 void __rseq_handle_slowpath(struct pt_regs *regs);
 
@@ -68,6 +69,8 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t)
 			rseq_raise_notify_resume(t);
 		}
 	}
+	if (mm_cid_needs_scan(t->mm))
+		task_add_mm_cid(t);
 }
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 857ed17d443b..80c1afb2087d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1407,6 +1407,7 @@ struct task_struct {
 	int				last_mm_cid;	/* Most recent cid in mm */
 	int				migrate_from_cpu;
 	int				mm_cid_active;	/* Whether cid bitmap is active */
+	unsigned long			last_cid_reset;	/* Time of last reset in jiffies */
 	struct callback_head		cid_work;
 #endif
 
@@ -2300,6 +2301,7 @@ void sched_mm_cid_before_execve(struct task_struct *t);
 void sched_mm_cid_after_execve(struct task_struct *t);
 void sched_mm_cid_fork(struct task_struct *t);
 void sched_mm_cid_exit_signals(struct task_struct *t);
+void task_add_mm_cid(struct task_struct *t);
 static inline int task_mm_cid(struct task_struct *t)
 {
 	return t->mm_cid;
@@ -2309,6 +2311,7 @@ static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
 static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
 static inline void sched_mm_cid_fork(struct task_struct *t) { }
 static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline void task_add_mm_cid(struct task_struct *t) { }
 static inline int task_mm_cid(struct task_struct *t)
 {
 	/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e742a655c9a8..30652bb4a223 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10840,19 +10840,53 @@ void init_sched_mm_cid(struct task_struct *t)
 	init_task_work(&t->cid_work, task_mm_cid_work);
 }
 
+void task_add_mm_cid(struct task_struct *t)
+{
+	struct callback_head *work = &t->cid_work;
+
+	if (work->next != work)
+		return;
+	/* No page allocation under rq lock */
+	task_work_add(t, work, TWA_RESUME);
+}
+
 void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
 {
-	struct callback_head *work = &curr->cid_work;
-	unsigned long now = jiffies;
+	u64 rtime = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
 
+	/*
+	 * If a task is running unpreempted for a long time, it won't get its
+	 * mm_cid compacted and won't update its mm_cid value after a
+	 * compaction occurs.
+	 * For such a task, this function does two things:
+	 * A) trigger the mm_cid recompaction,
+	 * B) trigger an update of the task's rseq->mm_cid field at some point
+	 * after recompaction, so it can get a mm_cid value closer to 0.
+	 * A change in the mm_cid triggers an rseq_preempt.
+	 *
+	 * B occurs once after the compaction work completes; neither A nor B
+	 * runs as long as the compaction work is pending, the task is exiting
+	 * or is not a userspace task.
+	 */
 	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
-	    work->next != work)
+	    test_tsk_thread_flag(curr, TIF_NOTIFY_RESUME))
 		return;
-	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
+	if (rtime < RSEQ_UNPREEMPTED_THRESHOLD)
 		return;
-
-	/* No page allocation under rq lock */
-	task_work_add(curr, work, TWA_RESUME);
+	if (mm_cid_needs_scan(curr->mm)) {
+		/* Trigger mm_cid recompaction */
+		task_add_mm_cid(curr);
+	} else if (time_after(jiffies, curr->last_cid_reset +
+			      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
+		/* Update mm_cid field */
+		if (!curr->mm_cid_active)
+			return;
+		mm_cid_snapshot_time(rq, curr->mm);
+		mm_cid_put_lazy(curr);
+		curr->last_mm_cid = curr->mm_cid = mm_cid_get(rq, curr, curr->mm);
+		rseq_sched_set_task_mm_cid(curr, curr->mm_cid);
+		rseq_sched_switch_event(curr);
+	}
 }
 
 void sched_mm_cid_exit_signals(struct task_struct *t)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8f14d231e7a7..8c0fb3b0fb35 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3512,6 +3512,7 @@ extern const char *preempt_modes[];
 
 #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
 #define MM_CID_SCAN_DELAY	100			/* 100ms */
+#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
 
 extern raw_spinlock_t cid_lock;
 extern int use_cid_lock;
@@ -3715,6 +3716,7 @@ static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
 	int cid;
 
 	lockdep_assert_rq_held(rq);
+	t->last_cid_reset = jiffies;
 	cpumask = mm_cidmask(mm);
 	cid = __this_cpu_read(pcpu_cid->cid);
 	if (mm_cid_is_valid(cid)) {
-- 
2.51.0




* [PATCH v3 3/4] sched: Compact RSEQ concurrency IDs in batches
       [not found] <20250929114225.36172-1-gmonaco@redhat.com>
  2025-09-29 11:42 ` [PATCH v3 2/4] rseq: Schedule the mm_cid_compaction from rseq_sched_switch_event() Gabriele Monaco
@ 2025-09-29 11:42 ` Gabriele Monaco
  1 sibling, 0 replies; 2+ messages in thread
From: Gabriele Monaco @ 2025-09-29 11:42 UTC (permalink / raw)
  To: linux-kernel, Mathieu Desnoyers, Peter Zijlstra, Thomas Gleixner,
	Ingo Molnar, Andrew Morton, David Hildenbrand, linux-mm
  Cc: Gabriele Monaco

Currently, task_mm_cid_work() is called from resume_user_mode_work().
This can delay the execution of the corresponding thread for the entire
duration of the function, negatively affecting the response time of
real-time tasks.
In practice, we observe task_mm_cid_work() increasing the latency by
30-35us on a 128-core system; this order of magnitude is meaningful
under PREEMPT_RT.

Run task_mm_cid_work() in batches of up to CONFIG_RSEQ_CID_SCAN_BATCH
CPUs; this reduces the duration of the delay introduced by each scan.

task_mm_cid_work() contains a mechanism to avoid running more frequently
than every 100ms. Keep this pseudo-periodicity only for complete scans:
each call to task_mm_cid_work() returns early if the period has not
elapsed and no scan is ongoing (i.e. the next batch to scan is the first
one).
This way full scans are not excessively delayed, while each run, and the
latency it introduces, stays short.
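
The core of the batching, condensed from the task_mm_cid_work() changes
below (the mm_cid_next_scan handling is simplified to a bare time check
and the second clearing loop is omitted):

  this_batch = READ_ONCE(mm->mm_cid_scan_batch);
  next_batch = this_batch + 1;
  from_cpu = cpumask_nth(this_batch * CONFIG_RSEQ_CID_SCAN_BATCH,
                         cpu_possible_mask);
  if (from_cpu >= nr_cpu_ids) {   /* past the last CPU: wrap around */
          from_cpu = 0;
          next_batch = 1;
  }
  /* The 100ms pseudo-period only applies when a full scan starts. */
  if (from_cpu == 0 && !time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan)))
          return;
  if (!try_cmpxchg(&mm->mm_cid_scan_batch, &this_batch, next_batch))
          return;                 /* another thread claimed this batch */

  idx = 0;
  cpu = from_cpu;
  for_each_cpu_from(cpu, cpu_possible_mask) {
          if (idx++ == CONFIG_RSEQ_CID_SCAN_BATCH)
                  break;
          sched_mm_cid_remote_clear_old(mm, cpu);
  }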

Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/mm_types.h | 15 +++++++++++++++
 init/Kconfig             | 12 ++++++++++++
 kernel/sched/core.c      | 31 ++++++++++++++++++++++++++++---
 3 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5dab88707014..83f6dc06b15f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -994,6 +994,13 @@ struct mm_struct {
 		 * When the next mm_cid scan is due (in jiffies).
 		 */
 		unsigned long mm_cid_next_scan;
+		/*
+		 * @mm_cid_scan_batch: Counter for batch used in the next scan.
+		 *
+		 * Scan in batches of CONFIG_RSEQ_CID_SCAN_BATCH. This field
+		 * increments at each scan and is reset when all batches are done.
+		 */
+		unsigned int mm_cid_scan_batch;
 		/**
 		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
 		 *
@@ -1389,6 +1396,7 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 	raw_spin_lock_init(&mm->cpus_allowed_lock);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	cpumask_clear(mm_cidmask(mm));
+	mm->mm_cid_scan_batch = 0;
 }
 
 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
@@ -1427,8 +1435,15 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
 
 static inline bool mm_cid_needs_scan(struct mm_struct *mm)
 {
+	unsigned int next_batch;
+
 	if (!mm)
 		return false;
+	next_batch = READ_ONCE(mm->mm_cid_scan_batch);
+	/* Always needs scan unless it's the first batch. */
+	if (CONFIG_RSEQ_CID_SCAN_BATCH * next_batch < num_possible_cpus() &&
+	    next_batch)
+		return true;
 	return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
 }
 #else /* CONFIG_SCHED_MM_CID */
diff --git a/init/Kconfig b/init/Kconfig
index 854b35e33318..8905d64c2598 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1918,6 +1918,18 @@ config DEBUG_RSEQ
 
 	  If unsure, say N.
 
+config RSEQ_CID_SCAN_BATCH
+	int "Number of CPUs to scan at every mm_cid compaction attempt"
+	range 1 NR_CPUS
+	default 16
+	depends on SCHED_MM_CID
+	help
+	  CPUs are scanned pseudo-periodically to compact the CID of each task;
+	  this operation can take a long time on systems with many CPUs,
+	  resulting in higher scheduling latency for the current task.
+	  A higher value means the CID is compacted faster, but results in
+	  higher scheduling latency.
+
 config CACHESTAT_SYSCALL
 	bool "Enable cachestat() system call" if EXPERT
 	default y
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 30652bb4a223..14b79c143d26 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10784,11 +10784,11 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
 
 static void task_mm_cid_work(struct callback_head *work)
 {
+	int weight, cpu, from_cpu, this_batch, next_batch, idx;
 	unsigned long now = jiffies, old_scan, next_scan;
 	struct task_struct *t = current;
 	struct cpumask *cidmask;
 	struct mm_struct *mm;
-	int weight, cpu;
 
 	WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
 
@@ -10798,6 +10798,17 @@ static void task_mm_cid_work(struct callback_head *work)
 	mm = t->mm;
 	if (!mm)
 		return;
+	this_batch = READ_ONCE(mm->mm_cid_scan_batch);
+	next_batch = this_batch + 1;
+	from_cpu = cpumask_nth(this_batch * CONFIG_RSEQ_CID_SCAN_BATCH,
+			       cpu_possible_mask);
+	if (from_cpu >= nr_cpu_ids) {
+		from_cpu = 0;
+		next_batch = 1;
+	}
+	/* Delay scan only if we are done with all cpus. */
+	if (from_cpu != 0)
+		goto cid_compact;
 	old_scan = READ_ONCE(mm->mm_cid_next_scan);
 	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
 	if (!old_scan) {
@@ -10813,17 +10824,31 @@ static void task_mm_cid_work(struct callback_head *work)
 		return;
 	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
 		return;
+
+cid_compact:
+	if (!try_cmpxchg(&mm->mm_cid_scan_batch, &this_batch, next_batch))
+		return;
 	cidmask = mm_cidmask(mm);
 	/* Clear cids that were not recently used. */
-	for_each_possible_cpu(cpu)
+	idx = 0;
+	cpu = from_cpu;
+	for_each_cpu_from(cpu, cpu_possible_mask) {
+		if (idx++ == CONFIG_RSEQ_CID_SCAN_BATCH)
+			break;
 		sched_mm_cid_remote_clear_old(mm, cpu);
+	}
 	weight = cpumask_weight(cidmask);
 	/*
 	 * Clear cids that are greater or equal to the cidmask weight to
 	 * recompact it.
 	 */
-	for_each_possible_cpu(cpu)
+	idx = 0;
+	cpu = from_cpu;
+	for_each_cpu_from(cpu, cpu_possible_mask) {
+		if (idx++ == CONFIG_RSEQ_CID_SCAN_BATCH)
+			break;
 		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
+	}
 }
 
 void init_sched_mm_cid(struct task_struct *t)
-- 
2.51.0


