From: Peter Zijlstra <peterz@infradead.org>
To: Nicholas Piggin <npiggin@gmail.com>
Cc: linux-kernel@vger.kernel.org, x86@kernel.org,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
Arnd Bergmann <arnd@arndb.de>,
linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
linux-mm@kvack.org, Anton Blanchard <anton@ozlabs.org>
Subject: Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
Date: Wed, 2 Dec 2020 15:19:57 +0100
Message-ID: <20201202141957.GJ3021@hirez.programming.kicks-ass.net>
In-Reply-To: <20201128160141.1003903-7-npiggin@gmail.com>
On Sun, Nov 29, 2020 at 02:01:39AM +1000, Nicholas Piggin wrote:
> + * - A delayed freeing and RCU-like quiescing sequence based on
> + * mm switching to avoid IPIs completely.
That one's interesting too. So basically you want to count switch_mm()
invocations on each CPU. Then, periodically snapshot the counter on each
CPU, and when they've all changed, increment a global counter.
Then, you snapshot the global counter and wait for it to increment
(twice I think, the first increment might already be in progress).
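To make the counting concrete, here is a minimal user-space model of that
protocol. Illustrative only: single-threaded, fixed CPU count, "ticks"
driven from main(), and bootstrap handled by seeding the gen-0 snapshot
slots (the real code would need something else for that, see the XXX in
the patch below); none of the names are shared with the patch.

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

static unsigned long nr_mm_switches[NR_CPUS];	/* rq->nr_mm_switches */
static unsigned long snap[NR_CPUS][2];		/* per-CPU double buffer */
static unsigned long generation;
static int complete = NR_CPUS;

static bool tick(int cpu)
{
	int idx = generation & 1;
	unsigned long prev = snap[cpu][idx ^ 1];
	unsigned long curr = snap[cpu][idx];
	unsigned long switches = nr_mm_switches[cpu];

	if (prev == switches)	/* no switch since the last generation */
		return false;

	snap[cpu][idx] = switches;

	/* curr < prev marks the first update of this generation */
	if ((long)(curr - prev) < 0 && --complete == 0) {
		complete = NR_CPUS;
		generation++;	/* every CPU switched at least once */
		return true;
	}
	return false;
}

int main(void)
{
	unsigned long gen = generation;	/* waiter's snapshot */
	int cpu, rounds = 0;

	/* seed the gen-0 "curr" slots so the first update decrements */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		snap[cpu][0] = (unsigned long)-1;

	/* drive ticks until the waiter has seen two increments */
	while (generation - gen <= 1) {
		rounds++;
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			nr_mm_switches[cpu]++;	/* pretend switch_mm() ran */
			tick(cpu);
		}
	}
	printf("gen=%lu after %d tick rounds\n", generation, rounds);
	return 0;
}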
The only question here is what should drive this machinery.. the tick
probably.
This shouldn't be too hard to do I think.
Something a little like so perhaps?
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41404afb7f4c..27b64a60a468 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4525,6 +4525,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ rq->nr_mm_switches++;
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
@@ -4739,6 +4740,80 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+/* Per-CPU double buffer; [gen & 1] holds this generation's snapshot. */
+static DEFINE_PER_CPU(unsigned long[2], mm_switches);
+
+static struct {
+	unsigned long		generation;
+	atomic_t		complete;
+	struct wait_queue_head	wait;
+} mm_foo = {
+	.generation = 0,
+	.complete = ATOMIC_INIT(-1), // XXX bootstrap, hotplug
+	.wait = __WAIT_QUEUE_HEAD_INITIALIZER(mm_foo.wait),
+};
+
+static bool mm_gen_tick(int cpu, struct rq *rq)
+{
+ unsigned long prev, curr, switches = rq->nr_mm_switches;
+ int idx = READ_ONCE(mm_foo.generation) & 1;
+
+ /* DATA-DEP on mm_foo.generation */
+
+	prev = __this_cpu_read(mm_switches[idx ^ 1]);
+	curr = __this_cpu_read(mm_switches[idx]);
+
+ /* we haven't switched since the last generation */
+ if (prev == switches)
+ return false;
+
+	__this_cpu_write(mm_switches[idx], switches);
+
+ /*
+	 * If @curr is less than @prev, this is the first update of
+	 * this generation; per the check above, @switches has also
+	 * increased since then, so mark our CPU complete.
+ */
+ if ((long)(curr - prev) < 0 && atomic_dec_and_test(&mm_foo.complete)) {
+ /*
+ * All CPUs are complete, IOW they all switched at least once
+ * since the last generation. Reset the completion counter and
+ * increment the generation.
+ */
+		atomic_set(&mm_foo.complete, num_online_cpus());
+ /*
+ * Matches the address dependency above:
+ *
+ * idx = gen & 1 complete = nr_cpus
+ * <DATA-DEP> <WMB>
+ * curr = sw[idx] generation++;
+ * prev = sw[idx^1]
+ * if (curr < prev)
+ * complete--
+ *
+	 * If we don't observe the new generation, we'll not decrement. If we
+ * do see the new generation, we must also see the new completion count.
+ */
+ smp_wmb();
+ mm_foo.generation++;
+ return true;
+ }
+
+ return false;
+}
+
+static void mm_gen_wake(void)
+{
+ wake_up_all(&mm_foo.wait);
+}
+
+static void mm_gen_wait(void)
+{
+	unsigned long gen = READ_ONCE(mm_foo.generation);
+
+	/*
+	 * Wait for two increments: the generation in flight when we
+	 * sampled @gen may already have been mostly complete, so only
+	 * the second increment guarantees a full generation has
+	 * elapsed since the snapshot.
+	 */
+	wait_event(mm_foo.wait, READ_ONCE(mm_foo.generation) - gen > 1);
+}
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -4750,6 +4825,7 @@ void scheduler_tick(void)
struct task_struct *curr = rq->curr;
struct rq_flags rf;
unsigned long thermal_pressure;
+ bool wake_mm_gen;
arch_scale_freq_tick();
sched_clock_tick();
@@ -4763,8 +4839,13 @@ void scheduler_tick(void)
calc_global_load_tick(rq);
psi_task_tick(rq);
+ wake_mm_gen = mm_gen_tick(cpu, rq);
+
rq_unlock(rq, &rf);
+ if (wake_mm_gen)
+ mm_gen_wake();
+
perf_event_task_tick();
#ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bf9d8da7d35e..62fb685db8d0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -927,6 +927,7 @@ struct rq {
unsigned int ttwu_pending;
#endif
u64 nr_switches;
+ u64 nr_mm_switches;
#ifdef CONFIG_UCLAMP_TASK
/* Utilization clamp values based on CPU's RUNNABLE tasks */
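For the shoot-lazies case, the consumer side would then look something
like the below. A sketch only: mm_gen_wait() is the function from the
patch above, everything else (the function name, where it gets called
from) is made up, and it has to run in sleepable context since
mm_gen_wait() blocks:

static void mmdrop_lazy_deferred(struct mm_struct *mm)
{
	/*
	 * Instead of IPI'ing every CPU that may still have @mm as its
	 * lazy active_mm, wait for two generation increments: by then
	 * every online CPU has gone through at least one counted
	 * switch_mm() since the last real user went away, so no CPU
	 * can still be using @mm lazily.
	 */
	might_sleep();
	mm_gen_wait();
	__mmdrop(mm);
}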