* [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct
[not found] <20250311062849.72083-1-gmonaco@redhat.com>
@ 2025-03-11 6:28 ` Gabriele Monaco
2025-04-09 14:03 ` Peter Zijlstra
0 siblings, 1 reply; 9+ messages in thread
From: Gabriele Monaco @ 2025-03-11 6:28 UTC (permalink / raw)
To: linux-kernel, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Mathieu Desnoyers, Paul E. McKenney, linux-mm
Cc: Gabriele Monaco, Ingo Molnar, Shuah Khan
Currently, the task_mm_cid_work function is called in a task work
triggered by a scheduler tick to periodically compact the mm_cids of each
process. This can delay the execution of the corresponding thread for
the entire duration of the function, negatively affecting response
times for real-time tasks. In practice, we observe task_mm_cid_work
increasing latency by 30-35us on a 128-core system; this order of
magnitude is significant under PREEMPT_RT.
Run the task_mm_cid_work in a new work_struct connected to the
mm_struct rather than in the task context before returning to
userspace.
This work_struct is initialised together with the mm and disabled before
the mm is freed. The work is queued while returning to userspace in
__rseq_handle_notify_resume, keeping the checks that avoid running
more frequently than MM_CID_SCAN_DELAY.
To make sure this also happens predictably for long-running tasks, we
trigger a call to __rseq_handle_notify_resume from the scheduler
tick when the runtime exceeds a 100ms threshold.
The main advantage of this change is that the function can be offloaded
to a different CPU and even preempted by RT tasks.
Moreover, this new behaviour is more predictable for periodic tasks
with short runtimes, which may rarely be running when the scheduler tick
fires. Now, the work is always scheduled when the task returns to
userspace.
The work is disabled during mmdrop: since the function cannot sleep in
all kernel configurations, we cannot wait for possibly running work
items to terminate. We make sure the mm is valid in case the task is
terminating by reserving it with mmgrab/mmdrop, and return prematurely
from the work if we are really the last user by the time it gets to run.
This situation is unlikely since we don't schedule the work for exiting
tasks, but we cannot rule it out.
Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
include/linux/mm_types.h | 17 ++++++++++++++++
include/linux/rseq.h | 13 ++++++++++++
include/linux/sched.h | 7 ++++++-
kernel/rseq.c | 2 ++
kernel/sched/core.c | 43 ++++++++++++++--------------------------
kernel/sched/sched.h | 2 --
6 files changed, 53 insertions(+), 31 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0234f14f2aa6b..c79f468337fc0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -889,6 +889,10 @@ struct mm_struct {
* mm nr_cpus_allowed updates.
*/
raw_spinlock_t cpus_allowed_lock;
+ /*
+ * @cid_work: Work item to run the mm_cid scan.
+ */
+ struct work_struct cid_work;
#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
@@ -1185,6 +1189,8 @@ enum mm_cid_state {
MM_CID_LAZY_PUT = (1U << 31),
};
+extern void task_mm_cid_work(struct work_struct *work);
+
static inline bool mm_cid_is_unset(int cid)
{
return cid == MM_CID_UNSET;
@@ -1257,12 +1263,14 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *
if (!mm->pcpu_cid)
return -ENOMEM;
mm_init_cid(mm, p);
+ INIT_WORK(&mm->cid_work, task_mm_cid_work);
return 0;
}
#define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
static inline void mm_destroy_cid(struct mm_struct *mm)
{
+ disable_work(&mm->cid_work);
free_percpu(mm->pcpu_cid);
mm->pcpu_cid = NULL;
}
@@ -1284,6 +1292,11 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
raw_spin_unlock(&mm->cpus_allowed_lock);
}
+
+static inline bool mm_cid_needs_scan(struct mm_struct *mm)
+{
+ return mm && !time_before(jiffies, READ_ONCE(mm->mm_cid_next_scan));
+}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
@@ -1294,6 +1307,10 @@ static inline unsigned int mm_cid_size(void)
return 0;
}
static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
+static inline bool mm_cid_needs_scan(struct mm_struct *mm)
+{
+ return false;
+}
#endif /* CONFIG_SCHED_MM_CID */
struct mmu_gather;
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index bc8af3eb55987..d20fd72f4c80d 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -7,6 +7,8 @@
#include <linux/preempt.h>
#include <linux/sched.h>
+#define RSEQ_UNPREEMPTED_THRESHOLD (100ULL * 1000000) /* 100ms */
+
/*
* Map the event mask on the user-space ABI enum rseq_cs_flags
* for direct mask checks.
@@ -54,6 +56,14 @@ static inline void rseq_preempt(struct task_struct *t)
rseq_set_notify_resume(t);
}
+static inline void rseq_preempt_from_tick(struct task_struct *t)
+{
+ u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
+
+ if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
+ rseq_preempt(t);
+}
+
/* rseq_migrate() requires preemption to be disabled. */
static inline void rseq_migrate(struct task_struct *t)
{
@@ -104,6 +114,9 @@ static inline void rseq_signal_deliver(struct ksignal *ksig,
static inline void rseq_preempt(struct task_struct *t)
{
}
+static inline void rseq_preempt_from_tick(struct task_struct *t)
+{
+}
static inline void rseq_migrate(struct task_struct *t)
{
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c15365a30c08..a40bb0b38d2e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1397,7 +1397,6 @@ struct task_struct {
int last_mm_cid; /* Most recent cid in mm */
int migrate_from_cpu;
int mm_cid_active; /* Whether cid bitmap is active */
- struct callback_head cid_work;
#endif
struct tlbflush_unmap_batch tlb_ubc;
@@ -2254,4 +2253,10 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
+#ifdef CONFIG_SCHED_MM_CID
+extern void task_queue_mm_cid(struct task_struct *curr);
+#else
+static inline void task_queue_mm_cid(struct task_struct *curr) { }
+#endif
+
#endif
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2cb16091ec0ae..909547ec52fd6 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -419,6 +419,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
}
if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
+ if (mm_cid_needs_scan(t->mm))
+ task_queue_mm_cid(t);
return;
error:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67189907214d3..f42b6f2d06b95 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5663,7 +5663,7 @@ void sched_tick(void)
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
- task_tick_mm_cid(rq, donor);
+ rseq_preempt_from_tick(donor);
scx_tick(rq);
rq_unlock(rq, &rf);
@@ -10530,22 +10530,16 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
}
-static void task_mm_cid_work(struct callback_head *work)
+void task_mm_cid_work(struct work_struct *work)
{
unsigned long now = jiffies, old_scan, next_scan;
- struct task_struct *t = current;
struct cpumask *cidmask;
- struct mm_struct *mm;
+ struct mm_struct *mm = container_of(work, struct mm_struct, cid_work);
int weight, cpu;
- SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
-
- work->next = work; /* Prevent double-add */
- if (t->flags & PF_EXITING)
- return;
- mm = t->mm;
- if (!mm)
- return;
+ /* We are the last user, process already terminated. */
+ if (atomic_read(&mm->mm_count) == 1)
+ goto out_drop;
old_scan = READ_ONCE(mm->mm_cid_next_scan);
next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
if (!old_scan) {
@@ -10558,9 +10552,9 @@ static void task_mm_cid_work(struct callback_head *work)
old_scan = next_scan;
}
if (time_before(now, old_scan))
- return;
+ goto out_drop;
if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
- return;
+ goto out_drop;
cidmask = mm_cidmask(mm);
/* Clear cids that were not recently used. */
for_each_possible_cpu(cpu)
@@ -10572,6 +10566,8 @@ static void task_mm_cid_work(struct callback_head *work)
*/
for_each_possible_cpu(cpu)
sched_mm_cid_remote_clear_weight(mm, cpu, weight);
+out_drop:
+ mmdrop(mm);
}
void init_sched_mm_cid(struct task_struct *t)
@@ -10584,23 +10580,14 @@ void init_sched_mm_cid(struct task_struct *t)
if (mm_users == 1)
mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
}
- t->cid_work.next = &t->cid_work; /* Protect against double add */
- init_task_work(&t->cid_work, task_mm_cid_work);
}
-void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+/* Call only when curr is a user thread. */
+void task_queue_mm_cid(struct task_struct *curr)
{
- struct callback_head *work = &curr->cid_work;
- unsigned long now = jiffies;
-
- if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
- work->next != work)
- return;
- if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
- return;
-
- /* No page allocation under rq lock */
- task_work_add(curr, work, TWA_RESUME);
+ /* Ensure the mm exists when we run. */
+ mmgrab(curr->mm);
+ queue_work(system_unbound_wq, &curr->mm->cid_work);
}
void sched_mm_cid_exit_signals(struct task_struct *t)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8512a9fb0229..37a2e2328283e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3630,7 +3630,6 @@ extern int use_cid_lock;
extern void sched_mm_cid_migrate_from(struct task_struct *t);
extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
-extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
extern void init_sched_mm_cid(struct task_struct *t);
static inline void __mm_cid_put(struct mm_struct *mm, int cid)
@@ -3899,7 +3898,6 @@ static inline void switch_mm_cid(struct rq *rq,
static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
-static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif /* !CONFIG_SCHED_MM_CID */
--
2.48.1
* Re: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct
2025-03-11 6:28 ` [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct Gabriele Monaco
@ 2025-04-09 14:03 ` Peter Zijlstra
2025-04-09 14:15 ` Mathieu Desnoyers
0 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2025-04-09 14:03 UTC (permalink / raw)
To: Gabriele Monaco
Cc: linux-kernel, Andrew Morton, Ingo Molnar, Mathieu Desnoyers,
Paul E. McKenney, linux-mm, Ingo Molnar, Shuah Khan
On Tue, Mar 11, 2025 at 07:28:45AM +0100, Gabriele Monaco wrote:
> +static inline void rseq_preempt_from_tick(struct task_struct *t)
> +{
> + u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
> +
> + if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
> + rseq_preempt(t);
> +}
This confused me.
The goal seems to be to tickle __rseq_handle_notify_resume() so it'll
end up queueing that work thing. But why do we want to set PREEMPT_BIT
here?
* Re: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct
2025-04-09 14:03 ` Peter Zijlstra
@ 2025-04-09 14:15 ` Mathieu Desnoyers
2025-04-09 15:20 ` Peter Zijlstra
0 siblings, 1 reply; 9+ messages in thread
From: Mathieu Desnoyers @ 2025-04-09 14:15 UTC (permalink / raw)
To: Peter Zijlstra, Gabriele Monaco
Cc: linux-kernel, Andrew Morton, Ingo Molnar, Paul E. McKenney,
linux-mm, Ingo Molnar, Shuah Khan
On 2025-04-09 10:03, Peter Zijlstra wrote:
> On Tue, Mar 11, 2025 at 07:28:45AM +0100, Gabriele Monaco wrote:
>> +static inline void rseq_preempt_from_tick(struct task_struct *t)
>> +{
>> + u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
>> +
>> + if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
>> + rseq_preempt(t);
>> +}
>
> This confused me.
>
> The goal seems to be to tickle __rseq_handle_notify_resume() so it'll
> end up queueing that work thing. But why do we want to set PREEMPT_BIT
> here?
In that scenario, we trigger (from tick) the fact that we may recompact the
mm_cid, and thus need to update the rseq mm_cid field before returning to
userspace.
Changing the value of the mm_cid field while userspace is within a rseq
critical section should abort the critical section, because the rseq
critical section should be able to expect the mm_cid to be invariant
for the whole c.s..
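To make this concrete, here is a minimal userspace sketch (illustrative
only, not real librseq code; rseq_area and per_cid_counter are assumed
to be set up elsewhere) of how a critical section typically consumes
mm_cid:

#include <stdint.h>
#include <linux/rseq.h>         /* UAPI struct rseq, which carries the mm_cid field */

extern __thread struct rseq *rseq_area; /* assumed registered via rseq(2) */
extern uint64_t per_cid_counter[];      /* one slot per concurrency id */

static void add_to_counter(uint64_t v)
{
        int cid = rseq_area->mm_cid;    /* read once at c.s. entry */

        /*
         * A real implementation wraps the access below in the rseq asm
         * critical section with an abort handler. If the kernel changed
         * mm_cid here without aborting the c.s., the store could land in
         * a slot now owned by another thread.
         */
        per_cid_counter[cid] += v;
}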
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
* Re: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct
2025-04-09 14:15 ` Mathieu Desnoyers
@ 2025-04-09 15:20 ` Peter Zijlstra
2025-04-09 15:53 ` Mathieu Desnoyers
0 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2025-04-09 15:20 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Gabriele Monaco, linux-kernel, Andrew Morton, Ingo Molnar,
Paul E. McKenney, linux-mm, Ingo Molnar, Shuah Khan
On Wed, Apr 09, 2025 at 10:15:42AM -0400, Mathieu Desnoyers wrote:
> On 2025-04-09 10:03, Peter Zijlstra wrote:
> > On Tue, Mar 11, 2025 at 07:28:45AM +0100, Gabriele Monaco wrote:
> > > +static inline void rseq_preempt_from_tick(struct task_struct *t)
> > > +{
> > > + u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
> > > +
> > > + if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
> > > + rseq_preempt(t);
> > > +}
> >
> > This confused me.
> >
> > The goal seems to be to tickle __rseq_handle_notify_resume() so it'll
> > end up queueing that work thing. But why do we want to set PREEMPT_BIT
> > here?
>
> In that scenario, we trigger (from tick) the fact that we may recompact the
> mm_cid, and thus need to update the rseq mm_cid field before returning to
> userspace.
>
> Changing the value of the mm_cid field while userspace is within a rseq
> critical section should abort the critical section, because the rseq
> critical section should be able to expect the mm_cid to be invariant
> for the whole c.s..
But, if we run that compaction in a worker, what guarantees the
compaction is done and mm_cid is stable by the time this task returns
to userspace again?
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct
2025-04-09 15:20 ` Peter Zijlstra
@ 2025-04-09 15:53 ` Mathieu Desnoyers
2025-04-09 19:08 ` Peter Zijlstra
2025-04-10 12:50 ` [PATCH] fixup: " Gabriele Monaco
0 siblings, 2 replies; 9+ messages in thread
From: Mathieu Desnoyers @ 2025-04-09 15:53 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Gabriele Monaco, linux-kernel, Andrew Morton, Ingo Molnar,
Paul E. McKenney, linux-mm, Ingo Molnar, Shuah Khan
On 2025-04-09 11:20, Peter Zijlstra wrote:
> On Wed, Apr 09, 2025 at 10:15:42AM -0400, Mathieu Desnoyers wrote:
>> On 2025-04-09 10:03, Peter Zijlstra wrote:
>>> On Tue, Mar 11, 2025 at 07:28:45AM +0100, Gabriele Monaco wrote:
>>>> +static inline void rseq_preempt_from_tick(struct task_struct *t)
>>>> +{
>>>> + u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
>>>> +
>>>> + if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
>>>> + rseq_preempt(t);
>>>> +}
>>>
>>> This confused me.
>>>
>>> The goal seems to be to tickle __rseq_handle_notify_resume() so it'll
>>> end up queueing that work thing. But why do we want to set PREEMPT_BIT
>>> here?
>>
>> In that scenario, we trigger (from tick) the fact that we may recompact the
>> mm_cid, and thus need to update the rseq mm_cid field before returning to
>> userspace.
>>
>> Changing the value of the mm_cid field while userspace is within a rseq
>> critical section should abort the critical section, because the rseq
>> critical section should be able to expect the mm_cid to be invariant
>> for the whole c.s..
>
> But, if we run that compaction in a worker, what guarantees the
> compaction is done and mm_cid is stable by the time this task returns
> to userspace again?
So let's say we have a task which is running and not preempted by any
other task on a cpu for a long time.
The idea is to have the tick do two things:
A) trigger the mm_cid recompaction,
B) trigger an update of the task's rseq->mm_cid field at some point
after recompaction, so it can get a mm_cid value closer to 0.
So in its current form this patch will indeed trigger rseq_preempt()
for *every tick* after the task has run for more than 100ms, which
I don't think is intended. This should be fixed.
Also, doing just an rseq_preempt() is not the correct approach, as
AFAIU it won't force the long running task to release the currently
held mm_cid value.
I think we need something that looks like the following based on the
current patch:
- rename rseq_preempt_from_tick() to rseq_tick(),
- modify rseq_tick() to ensure it calls rseq_set_notify_resume(t)
rather than rseq_preempt().
- modify rseq_tick() to ensure it only calls it once every
RSEQ_UNPREEMPTED_THRESHOLD, rather than every tick after
RSEQ_UNPREEMPTED_THRESHOLD.
- modify rseq_tick() so at some point after the work has
compacted mm_cids, we do the same things as switch_mm_cid()
does, namely to release the currently held cid and get a likely
smaller one (closer to 0). If the value changes, then we should
trigger rseq_preempt() so the task updates the mm_cid field before
returning to userspace and restarts ongoing rseq critical section.
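Very roughly, and completely untested, something along the lines of the
sketch below, called from the tick with the rq lock held like
task_tick_mm_cid() used to be. The per-task last_cid_refresh timestamp
used for the gating is made up here (name and placement TBD); the cid
release/reacquire mirrors what switch_mm_cid() already does:

void rseq_tick(struct rq *rq, struct task_struct *t)
{
        u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;

        if (!t->mm || rtime < RSEQ_UNPREEMPTED_THRESHOLD)
                return;
        if (mm_cid_needs_scan(t->mm)) {
                /* A) have the return-to-userspace path queue the compaction work. */
                rseq_set_notify_resume(t);
        } else if (t->mm_cid_active &&
                   time_after(jiffies, t->last_cid_refresh +
                                       msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
                /* B) drop the held cid and take a (likely smaller) one. */
                int old_cid = t->mm_cid;

                mm_cid_snapshot_time(rq, t->mm);
                mm_cid_put_lazy(t);
                t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
                t->last_cid_refresh = jiffies;
                if (t->mm_cid != old_cid)
                        rseq_preempt(t);        /* restarts any ongoing rseq c.s. */
        }
}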
Thoughts ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
* Re: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct
2025-04-09 15:53 ` Mathieu Desnoyers
@ 2025-04-09 19:08 ` Peter Zijlstra
2025-04-10 12:50 ` [PATCH] fixup: " Gabriele Monaco
1 sibling, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2025-04-09 19:08 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: Gabriele Monaco, linux-kernel, Andrew Morton, Ingo Molnar,
Paul E. McKenney, linux-mm, Ingo Molnar, Shuah Khan
On Wed, Apr 09, 2025 at 11:53:05AM -0400, Mathieu Desnoyers wrote:
> On 2025-04-09 11:20, Peter Zijlstra wrote:
> > On Wed, Apr 09, 2025 at 10:15:42AM -0400, Mathieu Desnoyers wrote:
> > > On 2025-04-09 10:03, Peter Zijlstra wrote:
> > > > On Tue, Mar 11, 2025 at 07:28:45AM +0100, Gabriele Monaco wrote:
> > > > > +static inline void rseq_preempt_from_tick(struct task_struct *t)
> > > > > +{
> > > > > + u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
> > > > > +
> > > > > + if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
> > > > > + rseq_preempt(t);
> > > > > +}
> > > >
> > > > This confused me.
> > > >
> > > > The goal seems to be to tickle __rseq_handle_notify_resume() so it'll
> > > > end up queueing that work thing. But why do we want to set PREEMPT_BIT
> > > > here?
> > >
> > > In that scenario, we trigger (from tick) the fact that we may recompact the
> > > mm_cid, and thus need to update the rseq mm_cid field before returning to
> > > userspace.
> > >
> > > Changing the value of the mm_cid field while userspace is within a rseq
> > > critical section should abort the critical section, because the rseq
> > > critical section should be able to expect the mm_cid to be invariant
> > > for the whole c.s..
> >
> > But, if we run that compaction in a worker, what guarantees the
> > compaction is done and mm_cid is stable by the time this task returns
> > to userspace again?
>
> So let's say we have a task which is running and not preempted by any
> other task on a cpu for a long time.
>
> The idea is to have the tick do two things:
>
> A) trigger the mm_cid recompaction,
>
> B) trigger an update of the task's rseq->mm_cid field at some point
> after recompaction, so it can get a mm_cid value closer to 0.
>
> So in its current form this patch will indeed trigger rseq_preempt()
> for *every tick* after the task has run for more than 100ms, which
> I don't think is intended. This should be fixed.
>
> Also, doing just an rseq_preempt() is not the correct approach, as
> AFAIU it won't force the long running task to release the currently
> held mm_cid value.
>
> I think we need something that looks like the following based on the
> current patch:
>
> - rename rseq_preempt_from_tick() to rseq_tick(),
>
> - modify rseq_tick() to ensure it calls rseq_set_notify_resume(t)
> rather than rseq_preempt().
>
> - modify rseq_tick() to ensure it only calls it once every
> RSEQ_UNPREEMPTED_THRESHOLD, rather than every tick after
> RSEQ_UNPREEMPTED_THRESHOLD.
>
> - modify rseq_tick() so at some point after the work has
> compacted mm_cids, we do the same things as switch_mm_cid()
> does, namely to release the currently held cid and get a likely
> smaller one (closer to 0). If the value changes, then we should
> trigger rseq_preempt() so the task updates the mm_cid field before
> returning to userspace and restarts ongoing rseq critical section.
>
> Thoughts ?
Yes, that seems better. Also be sure there's a comment around there
somewhere that explains this, because I'm sure I'll have forgotten all
about this in a few months' time :-)
* [PATCH] fixup: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct
2025-04-09 15:53 ` Mathieu Desnoyers
2025-04-09 19:08 ` Peter Zijlstra
@ 2025-04-10 12:50 ` Gabriele Monaco
2025-04-10 14:04 ` Mathieu Desnoyers
1 sibling, 1 reply; 9+ messages in thread
From: Gabriele Monaco @ 2025-04-10 12:50 UTC (permalink / raw)
To: mathieu.desnoyers, peterz, Ingo Molnar, linux-kernel
Cc: akpm, gmonaco, linux-mm, paulmck, shuah
Thanks both for the comments, I tried to implement what Mathieu
suggested. This patch applies directly on 2/3 but I'm sending it here
first to get feedback.
Essentially, I refactored a bit to avoid the need to add more
dependencies to rseq: the rseq_tick is now called task_tick_mm_cid (as
before the series) and it does the two things you mentioned:
* A) trigger the mm_cid recompaction
* B) trigger an update of the task's rseq->mm_cid field at some point
after recompaction, so it can get a mm_cid value closer to 0.
Now, A occurs only after the scan time has elapsed, which means it could
potentially run multiple times in case the work is not scheduled before
the next tick; I'm not sure adding more checks to make sure it
happens once and only once really makes sense here.
B occurs after the work updates the last scan time, so we are in a
condition where the runtime is above the threshold but the (next) scan
time has not expired yet.
I tried to account for multiple threads updating the mm_cid (not
necessarily the long running one, or in case more are long running); for
this I'm tracking the last time we updated the mm_cid: if that occurred
before the last mm_cid scan, we need to update (and preempt).
Does this make sense to you?
Thanks,
Gabriele
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
include/linux/rseq.h | 14 +-------------
include/linux/sched.h | 1 +
kernel/sched/core.c | 42 +++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 3 +++
4 files changed, 46 insertions(+), 14 deletions(-)
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index d20fd72f4c80d..7e3fa2ae9e7a4 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -7,8 +7,6 @@
#include <linux/preempt.h>
#include <linux/sched.h>
-#define RSEQ_UNPREEMPTED_THRESHOLD (100ULL * 1000000) /* 100ms */
-
/*
* Map the event mask on the user-space ABI enum rseq_cs_flags
* for direct mask checks.
@@ -54,14 +52,7 @@ static inline void rseq_preempt(struct task_struct *t)
{
__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
rseq_set_notify_resume(t);
-}
-
-static inline void rseq_preempt_from_tick(struct task_struct *t)
-{
- u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
-
- if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
- rseq_preempt(t);
+ t->last_rseq_preempt = jiffies;
}
/* rseq_migrate() requires preemption to be disabled. */
@@ -114,9 +105,6 @@ static inline void rseq_signal_deliver(struct ksignal *ksig,
static inline void rseq_preempt(struct task_struct *t)
{
}
-static inline void rseq_preempt_from_tick(struct task_struct *t)
-{
-}
static inline void rseq_migrate(struct task_struct *t)
{
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 851933e62bed3..5b057095d5dc0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1424,6 +1424,7 @@ struct task_struct {
int last_mm_cid; /* Most recent cid in mm */
int migrate_from_cpu;
int mm_cid_active; /* Whether cid bitmap is active */
+ unsigned long last_rseq_preempt; /* Time of last preempt in jiffies */
#endif
struct tlbflush_unmap_batch tlb_ubc;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 52ad709094167..9f0c9cc284804 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5663,7 +5663,7 @@ void sched_tick(void)
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
- rseq_preempt_from_tick(donor);
+ task_tick_mm_cid(rq, donor);
scx_tick(rq);
rq_unlock(rq, &rf);
@@ -10618,6 +10618,46 @@ void init_sched_mm_cid(struct task_struct *t)
}
}
+void task_tick_mm_cid(struct rq *rq, struct task_struct *t)
+{
+ u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
+
+ /*
+ * If a task is running unpreempted for a long time, it won't get its
+ * mm_cid compacted and won't update its mm_cid value after a
+ * compaction occurs.
+ * For such a task, this function does two things:
+ * A) trigger the mm_cid recompaction,
+ * B) trigger an update of the task's rseq->mm_cid field at some point
+ * after recompaction, so it can get a mm_cid value closer to 0.
+ * A change in the mm_cid triggers an rseq_preempt.
+ *
+ * A occurs only after the next scan time elapsed but before the
+ * compaction work is actually scheduled.
+ * B occurs once after the compaction work completes, that is when scan
+ * is no longer needed (it occurred for this mm) but the last rseq
+ * preempt was done before the last mm_cid scan.
+ */
+ if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
+ if (mm_cid_needs_scan(t->mm))
+ rseq_set_notify_resume(t);
+ else if (time_after(jiffies, t->last_rseq_preempt +
+ msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
+ int old_cid = t->mm_cid;
+
+ if (!t->mm_cid_active)
+ return;
+ mm_cid_snapshot_time(rq, t->mm);
+ mm_cid_put_lazy(t);
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
+ if (old_cid == t->mm_cid)
+ t->last_rseq_preempt = jiffies;
+ else
+ rseq_preempt(t);
+ }
+ }
+}
+
/* Call only when curr is a user thread. */
void task_queue_mm_cid(struct task_struct *curr)
{
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1703cd16d5433..7d104d12ed974 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3582,12 +3582,14 @@ extern const char *preempt_modes[];
#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
#define MM_CID_SCAN_DELAY 100 /* 100ms */
+#define RSEQ_UNPREEMPTED_THRESHOLD SCHED_MM_CID_PERIOD_NS
extern raw_spinlock_t cid_lock;
extern int use_cid_lock;
extern void sched_mm_cid_migrate_from(struct task_struct *t);
extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
+extern void task_tick_mm_cid(struct rq *rq, struct task_struct *t);
extern void init_sched_mm_cid(struct task_struct *t);
static inline void __mm_cid_put(struct mm_struct *mm, int cid)
@@ -3856,6 +3858,7 @@ static inline void switch_mm_cid(struct rq *rq,
static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
+static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *t) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif /* !CONFIG_SCHED_MM_CID */
base-commit: c59c19fcfad857c96effa3b2e9eb6d934d2380d8
--
2.49.0
* Re: [PATCH] fixup: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct
2025-04-10 12:50 ` [PATCH] fixup: " Gabriele Monaco
@ 2025-04-10 14:04 ` Mathieu Desnoyers
2025-04-10 14:36 ` Gabriele Monaco
0 siblings, 1 reply; 9+ messages in thread
From: Mathieu Desnoyers @ 2025-04-10 14:04 UTC (permalink / raw)
To: Gabriele Monaco, peterz, Ingo Molnar, linux-kernel
Cc: akpm, linux-mm, paulmck, shuah
On 2025-04-10 08:50, Gabriele Monaco wrote:
> Thanks both for the comments, I tried to implement what Mathieu
> suggested. This patch applies directly on 2/3 but I'm sending it here
> first to get feedback.
>
> Essentially, I refactored a bit to avoid the need to add more
> dependencies to rseq, the rseq_tick is now called task_tick_mm_cid (as
> before the series) and it does the two things you mentioned:
> * A) trigger the mm_cid recompaction
> * B) trigger an update of the task's rseq->mm_cid field at some point
> after recompaction, so it can get a mm_cid value closer to 0.
>
> Now, A occurs only after the scan time elapsed, which means it could
> potentially run multiple times in case the work is not scheduled before
> the next tick, I'm not sure adding more checks to make sure it
> happens once and only once really makes sense here.
The scan is gated by two checks now:
> + if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> + if (mm_cid_needs_scan(t->mm))
And likewise for the periodic check for preemption:
> + if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
[...]
> + else if (time_after(jiffies, t->last_rseq_preempt +
> + msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
Those second levels of time checks would prevent adding significant
overhead on every tick after the threshold is reached.
>
> B is occurring after the work updates the last scan time, so we are in a
> condition where the runtime is above threshold but the (next) scan time
> did not expire yet.
> I tried to account for multiple threads updating the mm_cid (not
> necessarily the long running one, or in case more are long running), for
> this I'm tracking the last time we updated the mm_cid, if that occurred
> before the last mm_cid scan, we need to update (and preempt).
>
> Does this make sense to you?
It makes sense. Note that it adds overhead to rseq_preempt() (a store
to t->last_rseq_preempt), which is a fast path. I don't know if we
should care.
Also part of task_tick_mm_cid could be moved to a helper, e.g.:
static
void task_reset_mm_cid(struct rq *rq, struct task_struct *t)
{
        int old_cid = t->mm_cid;

        if (!t->mm_cid_active)
                return;
        mm_cid_snapshot_time(rq, t->mm);
        mm_cid_put_lazy(t);
        t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
        if (old_cid == t->mm_cid)
                return;
        rseq_preempt(t);
}
Thanks,
Mathieu
>
> Thanks,
> Gabriele
>
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> ---
> include/linux/rseq.h | 14 +-------------
> include/linux/sched.h | 1 +
> kernel/sched/core.c | 42 +++++++++++++++++++++++++++++++++++++++++-
> kernel/sched/sched.h | 3 +++
> 4 files changed, 46 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/rseq.h b/include/linux/rseq.h
> index d20fd72f4c80d..7e3fa2ae9e7a4 100644
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -7,8 +7,6 @@
> #include <linux/preempt.h>
> #include <linux/sched.h>
>
> -#define RSEQ_UNPREEMPTED_THRESHOLD (100ULL * 1000000) /* 100ms */
> -
> /*
> * Map the event mask on the user-space ABI enum rseq_cs_flags
> * for direct mask checks.
> @@ -54,14 +52,7 @@ static inline void rseq_preempt(struct task_struct *t)
> {
> __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
> rseq_set_notify_resume(t);
> -}
> -
> -static inline void rseq_preempt_from_tick(struct task_struct *t)
> -{
> - u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
> -
> - if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
> - rseq_preempt(t);
> + t->last_rseq_preempt = jiffies;
> }
>
> /* rseq_migrate() requires preemption to be disabled. */
> @@ -114,9 +105,6 @@ static inline void rseq_signal_deliver(struct ksignal *ksig,
> static inline void rseq_preempt(struct task_struct *t)
> {
> }
> -static inline void rseq_preempt_from_tick(struct task_struct *t)
> -{
> -}
> static inline void rseq_migrate(struct task_struct *t)
> {
> }
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 851933e62bed3..5b057095d5dc0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1424,6 +1424,7 @@ struct task_struct {
> int last_mm_cid; /* Most recent cid in mm */
> int migrate_from_cpu;
> int mm_cid_active; /* Whether cid bitmap is active */
> + unsigned long last_rseq_preempt; /* Time of last preempt in jiffies */
> #endif
>
> struct tlbflush_unmap_batch tlb_ubc;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 52ad709094167..9f0c9cc284804 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5663,7 +5663,7 @@ void sched_tick(void)
> resched_latency = cpu_resched_latency(rq);
> calc_global_load_tick(rq);
> sched_core_tick(rq);
> - rseq_preempt_from_tick(donor);
> + task_tick_mm_cid(rq, donor);
> scx_tick(rq);
>
> rq_unlock(rq, &rf);
> @@ -10618,6 +10618,46 @@ void init_sched_mm_cid(struct task_struct *t)
> }
> }
>
> +void task_tick_mm_cid(struct rq *rq, struct task_struct *t)
> +{
> + u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
> +
> + /*
> + * If a task is running unpreempted for a long time, it won't get its
> + * mm_cid compacted and won't update its mm_cid value after a
> + * compaction occurs.
> + * For such a task, this function does two things:
> + * A) trigger the mm_cid recompaction,
> + * B) trigger an update of the task's rseq->mm_cid field at some point
> + * after recompaction, so it can get a mm_cid value closer to 0.
> + * A change in the mm_cid triggers an rseq_preempt.
> + *
> + * A occurs only after the next scan time elapsed but before the
> + * compaction work is actually scheduled.
> + * B occurs once after the compaction work completes, that is when scan
> + * is no longer needed (it occurred for this mm) but the last rseq
> + * preempt was done before the last mm_cid scan.
> + */
> + if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> + if (mm_cid_needs_scan(t->mm))
> + rseq_set_notify_resume(t);
> + else if (time_after(jiffies, t->last_rseq_preempt +
> + msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
> + int old_cid = t->mm_cid;
> +
> + if (!t->mm_cid_active)
> + return;
> + mm_cid_snapshot_time(rq, t->mm);
> + mm_cid_put_lazy(t);
> + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
> + if (old_cid == t->mm_cid)
> + t->last_rseq_preempt = jiffies;
> + else
> + rseq_preempt(t);
> + }
> + }
> +}
> +
> /* Call only when curr is a user thread. */
> void task_queue_mm_cid(struct task_struct *curr)
> {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 1703cd16d5433..7d104d12ed974 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3582,12 +3582,14 @@ extern const char *preempt_modes[];
>
> #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
> #define MM_CID_SCAN_DELAY 100 /* 100ms */
> +#define RSEQ_UNPREEMPTED_THRESHOLD SCHED_MM_CID_PERIOD_NS
>
> extern raw_spinlock_t cid_lock;
> extern int use_cid_lock;
>
> extern void sched_mm_cid_migrate_from(struct task_struct *t);
> extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
> +extern void task_tick_mm_cid(struct rq *rq, struct task_struct *t);
> extern void init_sched_mm_cid(struct task_struct *t);
>
> static inline void __mm_cid_put(struct mm_struct *mm, int cid)
> @@ -3856,6 +3858,7 @@ static inline void switch_mm_cid(struct rq *rq,
> static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
> static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
> static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
> +static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *t) { }
> static inline void init_sched_mm_cid(struct task_struct *t) { }
> #endif /* !CONFIG_SCHED_MM_CID */
>
>
> base-commit: c59c19fcfad857c96effa3b2e9eb6d934d2380d8
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
* Re: [PATCH] fixup: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm work_struct
2025-04-10 14:04 ` Mathieu Desnoyers
@ 2025-04-10 14:36 ` Gabriele Monaco
0 siblings, 0 replies; 9+ messages in thread
From: Gabriele Monaco @ 2025-04-10 14:36 UTC (permalink / raw)
To: Mathieu Desnoyers, peterz, Ingo Molnar, linux-kernel
Cc: akpm, linux-mm, paulmck, shuah
On Thu, 2025-04-10 at 10:04 -0400, Mathieu Desnoyers wrote:
> On 2025-04-10 08:50, Gabriele Monaco wrote:
> > Thanks both for the comments, I tried to implement what Mathieu
> > suggested. This patch applies directly on 2/3 but I'm sending it
> > here
> > first to get feedback.
> >
> > Essentially, I refactored a bit to avoid the need to add more
> > dependencies to rseq, the rseq_tick is now called task_tick_mm_cid
> > (as
> > before the series) and it does the two things you mentioned:
> > * A) trigger the mm_cid recompaction
> > * B) trigger an update of the task's rseq->mm_cid field at some
> > point
> > after recompaction, so it can get a mm_cid value closer to 0.
> >
> > Now, A occurs only after the scan time elapsed, which means it
> > could
> > potentially run multiple times in case the work is not scheduled
> > before
> > the next tick, I'm not sure adding more checks to make sure it
> > happens once and only once really makes sense here.
>
> The scan is gated by two checks now:
>
> > + if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> > + if (mm_cid_needs_scan(t->mm))
>
> And likewise for the periodic check for preemption:
>
Alright, I could add another flag here to prevent re-scheduling.
> > + if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> [...]
> > + else if (time_after(jiffies, t->last_rseq_preempt
> > +
> > +
> > msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
>
> Those second levels of time checks would prevent adding significant
> overhead on every tick after the threshold is reached.
>
But I'm not sure what to do here: in my understanding, this second
branch can only run once for task t and may run even if the previous
path was skipped (let's assume two long running threads share the mm:
the first thread schedules the work and it completes before the second
thread meets a qualifying tick).
> >
> > B is occurring after the work updates the last scan time, so we are
> > in a
> > condition where the runtime is above threshold but the (next) scan
> > time
> > did not expire yet.
> > I tried to account for multiple threads updating the mm_cid (not
> > necessarily the long running one, or in case more are long
> > running), for
> > this I'm tracking the last time we updated the mm_cid, if that
> > occurred
> > before the last mm_cid scan, we need to update (and preempt).
> >
> > Does this make sense to you?
>
> It makes sense. Note that it adds overhead to rseq_preempt() (a store
> to t->last_rseq_preempt), which is a fast path. I don't know if we
> should care.
Well, I'm trying to track the last time a reset occurred; that happens
in mm_cid_get instead, which doesn't look like a fast path to me.
I could then rename it to last_rseq_reset since it won't be related to
preemption.
>
> Also part of task_tick_mm_cid could be moved to a helper, e.g.:
>
> static
> void task_reset_mm_cid(struct rq *rq, struct task_struct *t)
> {
> int old_cid = t->mm_cid;
>
> if (!t->mm_cid_active)
> return;
> mm_cid_snapshot_time(rq, t->mm);
> mm_cid_put_lazy(t);
> t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
> if (old_cid == t->mm_cid)
> return;
> rseq_preempt(t);
> }
Yeah good point
Thanks,
Gabriele