* [RFC PATCH 1/7] mm: remove the per-task RSS counter cache
2022-07-28 20:45 [RFC PATCH 0/7] Replace per-task RSS cache with per-CPU RSS cache Kairui Song
@ 2022-07-28 20:45 ` Kairui Song
2022-07-28 20:45 ` [RFC PATCH 2/7] mm: move check_mm to memory.c Kairui Song
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Kairui Song @ 2022-07-28 20:45 UTC (permalink / raw)
To: linux-mm; +Cc: linux-kernel, Andrew Morton, Kairui Song
From: Kairui Song <kasong@tencent.com>
The per-task RSS counter cache was introduced in
commit 34e55232e59f ("mm: avoid false sharing of mm_counter") to ease
contention on the RSS counters of an mm_struct.
There are several problems with this approach, and the 64-event
threshold might not be an optimal value. It makes the RSS value
inaccurate: in the worst case, RSS changes are not accounted until a
thread has allocated 64 pages, and since every thread keeps its own
cache, the error accumulates across threads. With common tools like
`top`, the RSS value reported by the kernel can be off by hundreds of
MBs.
And since the 4 counters share the same event threshold, in the worst
case each counter does a global sync every 16 events, which still
causes some contention.
Remove this cache for now, and prepare for a different approach.
Some helper macros are kept since they will come in handy later.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
Documentation/filesystems/proc.rst | 7 -----
fs/exec.c | 2 --
include/linux/mm.h | 20 +-----------
include/linux/mm_types_task.h | 9 ------
include/linux/sched.h | 3 --
kernel/exit.c | 5 ---
kernel/fork.c | 4 ---
kernel/kthread.c | 1 -
mm/madvise.c | 7 ++---
mm/memory.c | 49 ------------------------------
10 files changed, 3 insertions(+), 104 deletions(-)
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 1bc91fb8c321..04a0a18da262 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -224,13 +224,6 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file
contains detailed information about the process itself. Its fields are
explained in Table 1-4.
-(for SMP CONFIG users)
-
-For making accounting scalable, RSS related information are handled in an
-asynchronous manner and the value may not be very precise. To see a precise
-snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
-It's slow but very precise.
-
.. table:: Table 1-2: Contents of the status files (as of 4.19)
========================== ===================================================
diff --git a/fs/exec.c b/fs/exec.c
index 778123259e42..3c787ca8c68e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -988,8 +988,6 @@ static int exec_mmap(struct mm_struct *mm)
tsk = current;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
- if (old_mm)
- sync_mm_rss(old_mm);
ret = down_write_killable(&tsk->signal->exec_update_lock);
if (ret)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..6346f7e77dc7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1998,17 +1998,7 @@ static inline bool get_user_page_fast_only(unsigned long addr,
*/
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
- long val = atomic_long_read(&mm->rss_stat.count[member]);
-
-#ifdef SPLIT_RSS_COUNTING
- /*
- * counter is updated in asynchronous manner and may go to minus.
- * But it's never be expected number for users.
- */
- if (val < 0)
- val = 0;
-#endif
- return (unsigned long)val;
+ return atomic_long_read(&mm->rss_stat.count[member]);
}
void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
@@ -2094,14 +2084,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
*maxrss = hiwater_rss;
}
-#if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct mm_struct *mm);
-#else
-static inline void sync_mm_rss(struct mm_struct *mm)
-{
-}
-#endif
-
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index c1bc6731125c..a00327c663db 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -48,15 +48,6 @@ enum {
NR_MM_COUNTERS
};
-#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
-#define SPLIT_RSS_COUNTING
-/* per-thread cached information, */
-struct task_rss_stat {
- int events; /* for synchronization threshold */
- int count[NR_MM_COUNTERS];
-};
-#endif /* USE_SPLIT_PTE_PTLOCKS */
-
struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46f3a63b758..11d3e1a95302 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -861,9 +861,6 @@ struct task_struct {
/* Per-thread vma caching: */
struct vmacache vmacache;
-#ifdef SPLIT_RSS_COUNTING
- struct task_rss_stat rss_stat;
-#endif
int exit_state;
int exit_code;
int exit_signal;
diff --git a/kernel/exit.c b/kernel/exit.c
index 64c938ce36fe..8c55cda5136f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -482,7 +482,6 @@ static void exit_mm(void)
exit_mm_release(current, mm);
if (!mm)
return;
- sync_mm_rss(mm);
mmap_read_lock(mm);
mmgrab(mm);
BUG_ON(mm != current->active_mm);
@@ -749,10 +748,6 @@ void __noreturn do_exit(long code)
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
-
- /* sync mm's RSS info before statistics gathering */
- if (tsk->mm)
- sync_mm_rss(tsk->mm);
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d44f2d46c69..c090ebd55063 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2145,10 +2145,6 @@ static __latent_entropy struct task_struct *copy_process(
p->io_uring = NULL;
#endif
-#if defined(SPLIT_RSS_COUNTING)
- memset(&p->rss_stat, 0, sizeof(p->rss_stat));
-#endif
-
p->default_timer_slack_ns = current->timer_slack_ns;
#ifdef CONFIG_PSI
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3c677918d8f2..6bfbab4e2103 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1463,7 +1463,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
* clearing tsk->mm.
*/
smp_mb__after_spinlock();
- sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
diff --git a/mm/madvise.c b/mm/madvise.c
index 0316bbc6441b..48cb9e5f92d2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -711,12 +711,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
mark_page_lazyfree(page);
}
out:
- if (nr_swap) {
- if (current->mm == mm)
- sync_mm_rss(mm);
-
+ if (nr_swap)
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
- }
+
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
cond_resched();
diff --git a/mm/memory.c b/mm/memory.c
index 4cf7d4b6c950..6bf7826e666b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -176,53 +176,9 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
trace_rss_stat(mm, member, count);
}
-#if defined(SPLIT_RSS_COUNTING)
-
-void sync_mm_rss(struct mm_struct *mm)
-{
- int i;
-
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- if (current->rss_stat.count[i]) {
- add_mm_counter(mm, i, current->rss_stat.count[i]);
- current->rss_stat.count[i] = 0;
- }
- }
- current->rss_stat.events = 0;
-}
-
-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
-{
- struct task_struct *task = current;
-
- if (likely(task->mm == mm))
- task->rss_stat.count[member] += val;
- else
- add_mm_counter(mm, member, val);
-}
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
-
-/* sync counter once per 64 page faults */
-#define TASK_RSS_EVENTS_THRESH (64)
-static void check_sync_rss_stat(struct task_struct *task)
-{
- if (unlikely(task != current))
- return;
- if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
- sync_mm_rss(task->mm);
-}
-#else /* SPLIT_RSS_COUNTING */
-
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
-static void check_sync_rss_stat(struct task_struct *task)
-{
-}
-
-#endif /* SPLIT_RSS_COUNTING */
-
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -502,8 +458,6 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
int i;
- if (current->mm == mm)
- sync_mm_rss(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
@@ -5120,9 +5074,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
count_vm_event(PGFAULT);
count_memcg_event_mm(vma->vm_mm, PGFAULT);
- /* do counter updates before entering really critical section. */
- check_sync_rss_stat(current);
-
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
--
2.35.2
* [RFC PATCH 2/7] mm: move check_mm to memory.c
2022-07-28 20:45 [RFC PATCH 0/7] Replace per-task RSS cache with per-CPU RSS cache Kairui Song
2022-07-28 20:45 ` [RFC PATCH 1/7] mm: remove the per-task RSS counter cache Kairui Song
@ 2022-07-28 20:45 ` Kairui Song
2022-07-28 20:45 ` [RFC PATCH 3/7] mm/headers: change enum order of MM_COUNTERS Kairui Song
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Kairui Song @ 2022-07-28 20:45 UTC (permalink / raw)
To: linux-mm; +Cc: linux-kernel, Andrew Morton, Kairui Song
From: Kairui Song <kasong@tencent.com>
No functional change; this makes it possible to do extra mm operations
on mm exit, in preparation for the following commits.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
include/linux/mm.h | 3 +++
kernel/fork.c | 33 ---------------------------------
mm/memory.c | 32 ++++++++++++++++++++++++++++++++
3 files changed, 35 insertions(+), 33 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6346f7e77dc7..81ad91621078 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1993,6 +1993,9 @@ static inline bool get_user_page_fast_only(unsigned long addr,
{
return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1;
}
+
+void check_mm(struct mm_struct *mm);
+
/*
* per-process(per-mm_struct) statistics.
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index c090ebd55063..86a239772208 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -128,15 +128,6 @@ int nr_threads; /* The idle threads do not count.. */
static int max_threads; /* tunable limit on nr_threads */
-#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
-
-static const char * const resident_page_types[] = {
- NAMED_ARRAY_INDEX(MM_FILEPAGES),
- NAMED_ARRAY_INDEX(MM_ANONPAGES),
- NAMED_ARRAY_INDEX(MM_SWAPENTS),
- NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
-};
-
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
@@ -748,30 +739,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
-static void check_mm(struct mm_struct *mm)
-{
- int i;
-
- BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
- "Please make sure 'struct resident_page_types[]' is updated as well");
-
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
-
- if (unlikely(x))
- pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
- mm, resident_page_types[i], x);
- }
-
- if (mm_pgtables_bytes(mm))
- pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
- mm_pgtables_bytes(mm));
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
-#endif
-}
-
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
diff --git a/mm/memory.c b/mm/memory.c
index 6bf7826e666b..c0597214f9b3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -179,6 +179,38 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
+static const char * const resident_page_types[] = {
+ NAMED_ARRAY_INDEX(MM_FILEPAGES),
+ NAMED_ARRAY_INDEX(MM_ANONPAGES),
+ NAMED_ARRAY_INDEX(MM_SWAPENTS),
+ NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
+};
+
+void check_mm(struct mm_struct *mm)
+{
+ int i;
+
+ BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
+ "Please make sure 'struct resident_page_types[]' is updated as well");
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+ if (unlikely(x))
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
+ mm, resident_page_types[i], x);
+ }
+
+ if (mm_pgtables_bytes(mm))
+ pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+ mm_pgtables_bytes(mm));
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+#endif
+}
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
--
2.35.2
* [RFC PATCH 3/7] mm/headers: change enum order of MM_COUNTERS
2022-07-28 20:45 [RFC PATCH 0/7] Replace per-task RSS cache with per-CPU RSS cache Kairui Song
2022-07-28 20:45 ` [RFC PATCH 1/7] mm: remove the per-task RSS counter cache Kairui Song
2022-07-28 20:45 ` [RFC PATCH 2/7] mm: move check_mm to memory.c Kairui Song
@ 2022-07-28 20:45 ` Kairui Song
2022-07-28 20:45 ` [RFC PATCH 4/7] mm: introduce a generic per-CPU RSS cache Kairui Song
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Kairui Song @ 2022-07-28 20:45 UTC (permalink / raw)
To: linux-mm; +Cc: linux-kernel, Andrew Morton, Kairui Song
From: Kairui Song <kasong@tencent.com>
get_mm_rss() reads MM_FILEPAGES, MM_ANONPAGES and MM_SHMEMPAGES. Make
them contiguous so they are easier to read in a loop in the following
commits.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
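As an editorial aside (not part of the patch): a minimal userspace sketch of
the kind of loop this reordering enables in a later patch of the series. With
MM_SWAPENTS moved last, the three resident-page counters form the contiguous
range [MM_FILEPAGES, MM_SWAPENTS) and can be summed directly; the numbers
below are made up.

#include <stdio.h>

/* Mirrors the reordered enum from this patch. */
enum {
        MM_FILEPAGES,   /* Resident file mapping pages */
        MM_ANONPAGES,   /* Resident anonymous pages */
        MM_SHMEMPAGES,  /* Resident shared memory pages */
        MM_SWAPENTS,    /* Anonymous swap entries */
        NR_MM_COUNTERS
};

int main(void)
{
        long count[NR_MM_COUNTERS] = { 100, 200, 50, 10 };
        long rss = 0;

        /* Sum only the resident-page counters, skipping MM_SWAPENTS. */
        for (int i = MM_FILEPAGES; i < MM_SWAPENTS; i++)
                rss += count[i];

        printf("rss pages: %ld\n", rss); /* prints 350 */
        return 0;
}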
include/linux/mm_types_task.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index a00327c663db..14182ded3fda 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -43,8 +43,8 @@ struct vmacache {
enum {
MM_FILEPAGES, /* Resident file mapping pages */
MM_ANONPAGES, /* Resident anonymous pages */
- MM_SWAPENTS, /* Anonymous swap entries */
MM_SHMEMPAGES, /* Resident shared memory pages */
+ MM_SWAPENTS, /* Anonymous swap entries */
NR_MM_COUNTERS
};
--
2.35.2
* [RFC PATCH 4/7] mm: introduce a generic per-CPU RSS cache
2022-07-28 20:45 [RFC PATCH 0/7] Replace per-task RSS cache with per-CPU RSS cache Kairui Song
` (2 preceding siblings ...)
2022-07-28 20:45 ` [RFC PATCH 3/7] mm/headers: change enum order of MM_COUNTERS Kairui Song
@ 2022-07-28 20:45 ` Kairui Song
2022-07-28 20:45 ` [RFC PATCH 5/7] mm: try use fast path for pmd setting as well Kairui Song
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Kairui Song @ 2022-07-28 20:45 UTC (permalink / raw)
To: linux-mm; +Cc: linux-kernel, Andrew Morton, Kairui Song
From: Kairui Song <kasong@tencent.com>
The RSS cache used to be a per-task cache, batching up to 64 events
before each atomic sync. The problem is that 64 events is too small to
effectively reduce contention, yet too large for accurate RSS
accounting.
This per-CPU RSS cache assumes an mm_struct tends to stay on the same
CPU, so if the mm_struct being accounted matches the current active_mm,
the RSS accounting stays CPU-local until the mm_struct is switched out,
and an atomic update is done only upon switch-out.
The fast path of CPU-local RSS accounting is extremely lightweight: it
only disables preemption and then does a CPU-local counter update.
One major effect is that RSS reading is now much more accurate than
before, but also slower. It needs to iterate over all possible CPUs
that may have cached the RSS and collect the uncommitted counts. With a
lockless reader design, this never blocks the RSS accounting fast path,
which ensures good updater performance. And since RSS updates are much
more common than reads, this should improve performance overall.
This CPU iteration can be avoided by using a CPU mask to mark the CPUs
that cached the mm_struct and reading only from those CPUs. This can
leverage the existing mm_cpumask used for TLB shootdown, but has to be
done arch by arch in later commits.
This commit provides a baseline version that works on all
architectures, but with a performance drop for RSS syncing upon
read/invalidation.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
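As an editorial aside for reviewers: below is a minimal userspace model of the
reader-side consistency check described above. It is illustrative only -- it
uses C11 atomics in place of per-CPU variables and smp_rmb()/smp_wmb(), and
all names are made up. The reader checks the owner mm, snapshots sync_count,
reads the cached delta, then re-checks sync_count and the owner; if either
changed, a flush happened concurrently and the snapshot is dropped, since the
flushed delta has already been folded into the atomic counters.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* One cache slot, standing in for one CPU's cpu_rss_cache. */
struct pcp_rss_model {
        _Atomic(void *) mm;             /* owner mm, NULL if unused */
        atomic_ulong sync_count;        /* bumped on every flush/switch */
        atomic_long count;              /* uncommitted local delta */
};

/* Take a consistent snapshot of the cached delta for @mm. */
static bool read_cached_delta(struct pcp_rss_model *c, void *mm, long *delta)
{
        unsigned long seq;

        if (atomic_load(&c->mm) != mm)
                return false;

        seq = atomic_load_explicit(&c->sync_count, memory_order_acquire);
        *delta = atomic_load_explicit(&c->count, memory_order_acquire);

        /* A concurrent flush bumps sync_count and/or changes the owner. */
        return atomic_load_explicit(&c->sync_count, memory_order_acquire) == seq &&
               atomic_load(&c->mm) == mm;
}

int main(void)
{
        static int fake_mm;             /* stands in for a struct mm_struct */
        struct pcp_rss_model c;
        long delta;

        atomic_init(&c.mm, (void *)&fake_mm);
        atomic_init(&c.sync_count, 0);
        atomic_init(&c.count, 42);

        if (read_cached_delta(&c, &fake_mm, &delta))
                printf("consistent snapshot, delta = %ld\n", delta);
        return 0;
}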
include/linux/mm.h | 15 +--
include/linux/mm_types_task.h | 38 +++++++
kernel/fork.c | 2 +-
kernel/sched/core.c | 3 +
mm/memory.c | 201 ++++++++++++++++++++++++++++++++--
5 files changed, 236 insertions(+), 23 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 81ad91621078..47b8552b1b04 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1994,15 +1994,13 @@ static inline bool get_user_page_fast_only(unsigned long addr,
return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1;
}
-void check_mm(struct mm_struct *mm);
+void check_discard_mm(struct mm_struct *mm);
/*
* per-process(per-mm_struct) statistics.
*/
-static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
-{
- return atomic_long_read(&mm->rss_stat.count[member]);
-}
+unsigned long get_mm_counter(struct mm_struct *mm, int member);
+unsigned long get_mm_rss(struct mm_struct *mm);
void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
@@ -2042,13 +2040,6 @@ static inline int mm_counter(struct page *page)
return mm_counter_file(page);
}
-static inline unsigned long get_mm_rss(struct mm_struct *mm)
-{
- return get_mm_counter(mm, MM_FILEPAGES) +
- get_mm_counter(mm, MM_ANONPAGES) +
- get_mm_counter(mm, MM_SHMEMPAGES);
-}
-
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
return max(mm->hiwater_rss, get_mm_rss(mm));
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 14182ded3fda..d5d3fbece174 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -12,6 +12,7 @@
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>
+#include <linux/cache.h>
#include <asm/page.h>
@@ -52,6 +53,43 @@ struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
};
+struct mm_rss_cache {
+ /*
+ * CPU local only variables, hot path for RSS caching. Readonly for other CPUs.
+ */
+ unsigned long in_use;
+ long count[NR_MM_COUNTERS];
+
+ /* Avoid false sharing when other CPUs collect RSS counter */
+ struct mm_struct *mm ____cacheline_aligned;
+ /* Avoid ABA problem and RSS being accounted for wrong mm */
+ unsigned long sync_count;
+};
+
+/* lowest bit of *mm is never used, so use it as a syncing flag */
+#define RSS_CACHE_MM_SYNCING_MASK 1UL
+
+/* mark the mm as being synced on that cache */
+static __always_inline struct mm_struct *__pcp_rss_mm_mark(struct mm_struct *mm)
+{
+ unsigned long val = (unsigned long)mm;
+
+ val |= RSS_CACHE_MM_SYNCING_MASK;
+
+ return (struct mm_struct *) val;
+}
+
+static __always_inline struct mm_struct *__pcp_rss_mm_unmark(struct mm_struct *mm)
+{
+ unsigned long val = (unsigned long)mm;
+
+ val &= ~RSS_CACHE_MM_SYNCING_MASK;
+
+ return (struct mm_struct *) val;
+}
+
+void switch_pcp_rss_cache_no_irq(struct mm_struct *next_mm);
+
struct page_frag {
struct page *page;
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
diff --git a/kernel/fork.c b/kernel/fork.c
index 86a239772208..c2f5f6eef6a6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -755,9 +755,9 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
- check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+ check_discard_mm(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da0bf6fe9ecd..11df67bb52ee 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5142,6 +5142,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
prepare_lock_switch(rq, next, rf);
+ /* Cache new active_mm */
+ switch_pcp_rss_cache_no_irq(next->active_mm);
+
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
barrier();
diff --git a/mm/memory.c b/mm/memory.c
index c0597214f9b3..f00f302143b6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -176,8 +176,143 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
trace_rss_stat(mm, member, count);
}
-#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
-#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct mm_rss_cache, cpu_rss_cache);
+
+/*
+ * get_mm_counter and get_mm_rss try to read the RSS cache of each
+ * CPU that cached the target mm. If the cache is flushed while being read,
+ * skip it. This may lead to a rare and small accuracy loss, but a flushed
+ * cache will surely be accounted in the next read.
+ */
+unsigned long get_mm_counter(struct mm_struct *mm, int member)
+{
+ int cpu;
+ long ret, update, sync_count;
+
+ ret = atomic_long_read(&mm->rss_stat.count[member]);
+ for_each_possible_cpu(cpu) {
+ if (READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu)) != mm)
+ continue;
+ sync_count = READ_ONCE(per_cpu(cpu_rss_cache.sync_count, cpu));
+ /* see smp_wmb in switch_pcp_rss_cache_no_irq */
+ smp_rmb();
+
+ update = READ_ONCE(per_cpu(cpu_rss_cache.count[member], cpu));
+
+ /* same as above */
+ smp_rmb();
+ if (READ_ONCE(per_cpu(cpu_rss_cache.sync_count, cpu)) == sync_count &&
+ READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu)) == mm)
+ ret += update;
+ }
+
+ if (ret < 0)
+ ret = 0;
+
+ return ret;
+}
+
+/* see comment for get_mm_counter */
+unsigned long get_mm_rss(struct mm_struct *mm)
+{
+ int cpu;
+ long ret, update, sync_count;
+
+ ret = atomic_long_read(&mm->rss_stat.count[MM_FILEPAGES])
+ + atomic_long_read(&mm->rss_stat.count[MM_ANONPAGES])
+ + atomic_long_read(&mm->rss_stat.count[MM_SHMEMPAGES]);
+
+ for_each_possible_cpu(cpu) {
+ if (READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu)) != mm)
+ continue;
+ sync_count = READ_ONCE(per_cpu(cpu_rss_cache.sync_count, cpu));
+ /* see smp_wmb in switch_pcp_rss_cache_no_irq */
+ smp_rmb();
+
+ update = 0; /* sum MM_FILEPAGES, MM_ANONPAGES, MM_SHMEMPAGES */
+ for (int i = MM_FILEPAGES; i < MM_SWAPENTS; i++)
+ update += READ_ONCE(per_cpu(cpu_rss_cache.count[i], cpu));
+
+ /* same as above */
+ smp_rmb();
+ if (READ_ONCE(per_cpu(cpu_rss_cache.sync_count, cpu)) == sync_count &&
+ READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu)) == mm)
+ ret += update;
+ }
+
+ if (ret < 0)
+ ret = 0;
+
+ return ret;
+}
+
+/* flush the rss cache of current CPU with IRQ disabled, and switch to new mm */
+void switch_pcp_rss_cache_no_irq(struct mm_struct *next_mm)
+{
+ long count;
+ struct mm_struct *cpu_mm;
+
+ cpu_mm = this_cpu_read(cpu_rss_cache.mm);
+ if (cpu_mm == next_mm)
+ return;
+
+ /*
+ * The `in_use` counter is held with preemption disabled; if it is non-zero,
+ * this is an interrupt switching the mm, just ignore it.
+ */
+ if (this_cpu_read(cpu_rss_cache.in_use))
+ return;
+
+ if (cpu_mm == NULL)
+ goto commit_done;
+
+ /* Race with check_discard_rss_cache */
+ if (cpu_mm != cmpxchg(this_cpu_ptr(&cpu_rss_cache.mm), cpu_mm,
+ __pcp_rss_mm_mark(cpu_mm)))
+ goto commit_done;
+
+ for (int i = 0; i < NR_MM_COUNTERS; i++) {
+ count = this_cpu_read(cpu_rss_cache.count[i]);
+ if (count)
+ add_mm_counter(cpu_mm, i, count);
+ }
+
+commit_done:
+ for (int i = 0; i < NR_MM_COUNTERS; i++)
+ this_cpu_write(cpu_rss_cache.count[i], 0);
+
+ /*
+ * For remote readers in get_mm_{rss,counter}, ensure the new mm and
+ * sync_count are published only after the counters have been zeroed.
+ */
+ smp_wmb();
+ this_cpu_write(cpu_rss_cache.mm, next_mm);
+ this_cpu_inc(cpu_rss_cache.sync_count);
+}
+
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+{
+ /*
+ * Disable preempt so task is pinned, and the mm is pinned on this CPU
+ * since caller must be holding a reference.
+ */
+ preempt_disable();
+ this_cpu_inc(cpu_rss_cache.in_use);
+
+ if (likely(mm == this_cpu_read(cpu_rss_cache.mm))) {
+ this_cpu_add(cpu_rss_cache.count[member], val);
+ this_cpu_dec(cpu_rss_cache.in_use);
+ /* Avoid the resched checking overhead for the fast path */
+ preempt_enable_no_resched();
+ } else {
+ this_cpu_dec(cpu_rss_cache.in_use);
+ preempt_enable_no_resched();
+ add_mm_counter(mm, member, val);
+ }
+}
+
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
static const char * const resident_page_types[] = {
@@ -187,20 +322,64 @@ static const char * const resident_page_types[] = {
NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};
-void check_mm(struct mm_struct *mm)
+static void check_discard_rss_cache(struct mm_struct *mm)
{
- int i;
+ int cpu;
+ long cached_count[NR_MM_COUNTERS] = { 0 };
+ struct mm_struct *cpu_mm;
- BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
- "Please make sure 'struct resident_page_types[]' is updated as well");
+ /* Invalidate the RSS cache on every CPU */
+ for_each_possible_cpu(cpu) {
+ cpu_mm = READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu));
+ if (__pcp_rss_mm_unmark(cpu_mm) != mm)
+ continue;
+
+ /*
+ * If not being flushed, try to read in the counter and mark it NULL,
+ * once the cache's mm is set NULL, counters are considered invalidated
+ */
+ if (cpu_mm != __pcp_rss_mm_mark(cpu_mm)) {
+ long count[NR_MM_COUNTERS];
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
+ for (int i = 0; i < NR_MM_COUNTERS; i++)
+ count[i] = READ_ONCE(per_cpu(cpu_rss_cache.count[i], cpu));
- if (unlikely(x))
+ /*
+ * If successfully set to NULL, the owner CPU is not flushing it, counters
+ * are uncommitted and untouched during this period, since a dying mm won't
+ * be accounted anymore
+ */
+ cpu_mm = cmpxchg(&per_cpu(cpu_rss_cache.mm, cpu), mm, NULL);
+ if (cpu_mm == mm) {
+ for (int i = 0; i < NR_MM_COUNTERS; i++)
+ cached_count[i] += count[i];
+ continue;
+ }
+ }
+
+ /* It's being flushed, just busy wait as the critical section is really short */
+ do {
+ cpu_relax();
+ cpu_mm = READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu));
+ } while (cpu_mm == __pcp_rss_mm_mark(mm));
+ }
+
+ for (int i = 0; i < NR_MM_COUNTERS; i++) {
+ long val = atomic_long_read(&mm->rss_stat.count[i]);
+
+ val += cached_count[i];
+
+ if (unlikely(val)) {
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
- mm, resident_page_types[i], x);
+ mm, resident_page_types[i], val);
+ }
}
+}
+
+void check_discard_mm(struct mm_struct *mm)
+{
+ BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
+ "Please make sure 'struct resident_page_types[]' is updated as well");
if (mm_pgtables_bytes(mm))
pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
@@ -209,6 +388,8 @@ void check_mm(struct mm_struct *mm)
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
+
+ check_discard_rss_cache(mm);
}
/*
--
2.35.2
* [RFC PATCH 5/7] mm: try use fast path for pmd setting as well
2022-07-28 20:45 [RFC PATCH 0/7] Replace per-task RSS cache with per-CPU RSS cache Kairui Song
` (3 preceding siblings ...)
2022-07-28 20:45 ` [RFC PATCH 4/7] mm: introduce a generic per-CPU RSS cache Kairui Song
@ 2022-07-28 20:45 ` Kairui Song
2022-07-28 20:45 ` [RFC PATCH 6/7] mm: introduce CONFIG_ARCH_PCP_RSS_USE_CPUMASK Kairui Song
2022-07-28 20:45 ` [RFC PATCH 7/7] x86_64/tlb, mm: enable cpumask optimization for RSS cache Kairui Song
6 siblings, 0 replies; 8+ messages in thread
From: Kairui Song @ 2022-07-28 20:45 UTC (permalink / raw)
To: linux-mm; +Cc: linux-kernel, Andrew Morton, Kairui Song
From: Kairui Song <kasong@tencent.com>
Use the per-CPU RSS cache helper as much as possible.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/memory.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/memory.c b/mm/memory.c
index f00f302143b6..09d7d193da51 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4419,7 +4419,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+ add_mm_counter_fast(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
page_add_file_rmap(page, vma, true);
/*
--
2.35.2
* [RFC PATCH 6/7] mm: introduce CONFIG_ARCH_PCP_RSS_USE_CPUMASK
2022-07-28 20:45 [RFC PATCH 0/7] Replace per-task RSS cache with per-CPU RSS cache Kairui Song
` (4 preceding siblings ...)
2022-07-28 20:45 ` [RFC PATCH 5/7] mm: try use fast path for pmd setting as well Kairui Song
@ 2022-07-28 20:45 ` Kairui Song
2022-07-28 20:45 ` [RFC PATCH 7/7] x86_64/tlb, mm: enable cpumask optimization for RSS cache Kairui Song
6 siblings, 0 replies; 8+ messages in thread
From: Kairui Song @ 2022-07-28 20:45 UTC (permalink / raw)
To: linux-mm; +Cc: linux-kernel, Andrew Morton, Kairui Song
From: Kairui Song <kasong@tencent.com>
If the arch-related code can provide helpers to bind the RSS cache to
mm_cpumask, then the syncing code can rely on that instead of doing a
full CPU synchronization. This speeds up reading and mm exit by a lot.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
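As an editorial aside (illustrative only, the function name below is made up):
the contract an architecture selecting ARCH_PCP_RSS_USE_CPUMASK is expected to
provide is that the local RSS cache is flushed before the CPU clears itself
from the old mm's mm_cpumask, so a reader walking mm_cpumask(mm) never misses
a CPU that still holds uncommitted counts for that mm. Patch 7/7 wires this up
for x86_64 by hooking switch_mm_irqs_off().

static void example_arch_switch_mm(struct mm_struct *prev,
                                   struct mm_struct *next)
{
        /* Commit prev's cached RSS and start caching for next. */
        switch_pcp_rss_cache_no_irq(next);

        /* Only now stop advertising prev on this CPU. */
        cpumask_clear_cpu(smp_processor_id(), mm_cpumask(prev));

        /* ... the rest of the arch-specific mm switch ... */
}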
arch/Kconfig | 3 ++
kernel/sched/core.c | 3 +-
mm/memory.c | 94 ++++++++++++++++++++++++++++-----------------
3 files changed, 64 insertions(+), 36 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 71b9272acb28..8df45b6346ae 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1403,6 +1403,9 @@ config ARCH_HAS_ELFCORE_COMPAT
config ARCH_HAS_PARANOID_L1D_FLUSH
bool
+config ARCH_PCP_RSS_USE_CPUMASK
+ bool
+
config DYNAMIC_SIGFRAME
bool
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 11df67bb52ee..6f7991caf24b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5143,7 +5143,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
prepare_lock_switch(rq, next, rf);
/* Cache new active_mm */
- switch_pcp_rss_cache_no_irq(next->active_mm);
+ if (!IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK))
+ switch_pcp_rss_cache_no_irq(next->active_mm);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
diff --git a/mm/memory.c b/mm/memory.c
index 09d7d193da51..a819009aa3e0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -188,9 +188,16 @@ unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
int cpu;
long ret, update, sync_count;
+ const struct cpumask *mm_mask;
ret = atomic_long_read(&mm->rss_stat.count[member]);
- for_each_possible_cpu(cpu) {
+
+ if (IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK))
+ mm_mask = mm_cpumask(mm);
+ else
+ mm_mask = cpu_possible_mask;
+
+ for_each_cpu(cpu, mm_mask) {
if (READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu)) != mm)
continue;
sync_count = READ_ONCE(per_cpu(cpu_rss_cache.sync_count, cpu));
@@ -217,12 +224,18 @@ unsigned long get_mm_rss(struct mm_struct *mm)
{
int cpu;
long ret, update, sync_count;
+ const struct cpumask *mm_mask;
ret = atomic_long_read(&mm->rss_stat.count[MM_FILEPAGES])
+ atomic_long_read(&mm->rss_stat.count[MM_ANONPAGES])
+ atomic_long_read(&mm->rss_stat.count[MM_SHMEMPAGES]);
- for_each_possible_cpu(cpu) {
+ if (IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK))
+ mm_mask = mm_cpumask(mm);
+ else
+ mm_mask = cpu_possible_mask;
+
+ for_each_cpu(cpu, mm_mask) {
if (READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu)) != mm)
continue;
sync_count = READ_ONCE(per_cpu(cpu_rss_cache.sync_count, cpu));
@@ -266,10 +279,13 @@ void switch_pcp_rss_cache_no_irq(struct mm_struct *next_mm)
if (cpu_mm == NULL)
goto commit_done;
- /* Race with check_discard_rss_cache */
- if (cpu_mm != cmpxchg(this_cpu_ptr(&cpu_rss_cache.mm), cpu_mm,
- __pcp_rss_mm_mark(cpu_mm)))
- goto commit_done;
+ /* Arch will take care of cache invalidation */
+ if (!IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK)) {
+ /* Race with check_discard_rss_cache */
+ if (cpu_mm != cmpxchg(this_cpu_ptr(&cpu_rss_cache.mm), cpu_mm,
+ __pcp_rss_mm_mark(cpu_mm)))
+ goto commit_done;
+ }
for (int i = 0; i < NR_MM_COUNTERS; i++) {
count = this_cpu_read(cpu_rss_cache.count[i]);
@@ -328,46 +344,54 @@ static void check_discard_rss_cache(struct mm_struct *mm)
long cached_count[NR_MM_COUNTERS] = { 0 };
struct mm_struct *cpu_mm;
- /* Invalidate the RSS cache on every CPU */
- for_each_possible_cpu(cpu) {
- cpu_mm = READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu));
- if (__pcp_rss_mm_unmark(cpu_mm) != mm)
- continue;
-
- /*
- * If not being flushed, try to read in the counter and mark it NULL,
- * once the cache's mm is set NULL, counters are considered invalidated
- */
- if (cpu_mm != __pcp_rss_mm_mark(cpu_mm)) {
- long count[NR_MM_COUNTERS];
-
- for (int i = 0; i < NR_MM_COUNTERS; i++)
- count[i] = READ_ONCE(per_cpu(cpu_rss_cache.count[i], cpu));
+ /* Arch will take care of cache invalidation */
+ if (!IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK)) {
+ /* Invalidate the RSS cache on every CPU */
+ for_each_possible_cpu(cpu) {
+ cpu_mm = READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu));
+ if (__pcp_rss_mm_unmark(cpu_mm) != mm)
+ continue;
/*
- * If successfully set to NULL, the owner CPU is not flushing it, counters
- * are uncommitted and untouched during this period, since a dying mm won't
- * be accounted anymore
+ * If not being flushed, try to read in the counter and mark it NULL,
+ * once the cache's mm is set NULL, counters are considered invalidated.
*/
- cpu_mm = cmpxchg(&per_cpu(cpu_rss_cache.mm, cpu), mm, NULL);
- if (cpu_mm == mm) {
+ if (cpu_mm != __pcp_rss_mm_mark(cpu_mm)) {
+ long count[NR_MM_COUNTERS];
+
for (int i = 0; i < NR_MM_COUNTERS; i++)
- cached_count[i] += count[i];
- continue;
+ count[i] = READ_ONCE(per_cpu(cpu_rss_cache.count[i], cpu));
+
+ /*
+ * If successfully set to NULL, the owner CPU is not flushing it,
+ * counters are uncommitted and untouched during this period, since
+ * a dying mm won't be accounted anymore.
+ */
+ cpu_mm = cmpxchg(&per_cpu(cpu_rss_cache.mm, cpu), mm, NULL);
+ if (cpu_mm == mm) {
+ for (int i = 0; i < NR_MM_COUNTERS; i++)
+ cached_count[i] += count[i];
+ continue;
+ }
}
- }
- /* It's being flushed, just busy wait as the critical section is really short */
- do {
- cpu_relax();
- cpu_mm = READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu));
- } while (cpu_mm == __pcp_rss_mm_mark(mm));
+ /*
+ * It's being flushed, just busy wait as the critical section
+ * is really short.
+ */
+ do {
+ cpu_relax();
+ cpu_mm = READ_ONCE(per_cpu(cpu_rss_cache.mm, cpu));
+ } while (cpu_mm == __pcp_rss_mm_mark(mm));
+ }
}
for (int i = 0; i < NR_MM_COUNTERS; i++) {
long val = atomic_long_read(&mm->rss_stat.count[i]);
- val += cached_count[i];
+ if (!IS_ENABLED(CONFIG_ARCH_PCP_RSS_USE_CPUMASK)) {
+ val += cached_count[i];
+ }
if (unlikely(val)) {
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
--
2.35.2
* [RFC PATCH 7/7] x86_64/tlb, mm: enable cpumask optimization for RSS cache
2022-07-28 20:45 [RFC PATCH 0/7] Replace per-task RSS cache with per-CPU RSS cache Kairui Song
` (5 preceding siblings ...)
2022-07-28 20:45 ` [RFC PATCH 6/7] mm: introduce CONFIG_ARCH_PCP_RSS_USE_CPUMASK Kairui Song
@ 2022-07-28 20:45 ` Kairui Song
6 siblings, 0 replies; 8+ messages in thread
From: Kairui Song @ 2022-07-28 20:45 UTC (permalink / raw)
To: linux-mm; +Cc: linux-kernel, Andrew Morton, Kairui Song
From: Kairui Song <kasong@tencent.com>
Enable CONFIG_ARCH_PCP_RSS_USE_CPUMASK for x86_64 and do the RSS cache
switch in switch_mm_irqs_off. On x86_64 this is the unified routine for
switching an mm, so hooking into it ensures any dead mm has its cache
invalidated in time, and the cpumask stays in sync with the cache
state.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
arch/x86/Kconfig | 1 +
arch/x86/mm/tlb.c | 5 +++++
2 files changed, 6 insertions(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 52a7f91527fe..15e2b29ba972 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -125,6 +125,7 @@ config X86
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_THP_SWAP if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
+ select ARCH_PCP_RSS_USE_CPUMASK if X86_64
select BUILDTIME_TABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d400b6d9d246..614865f94d85 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -597,6 +597,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
*/
cond_mitigation(tsk);
+ /*
+ * Flush the RSS cache before clearing the previous mm from the bitmask
+ */
+ switch_pcp_rss_cache_no_irq(next);
+
/*
* Stop remote flushes for the previous mm.
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
--
2.35.2