From: Lei Liu <liulei.rjpt@vivo.com>
To: Michal Hocko <mhocko@suse.com>,
David Rientjes <rientjes@google.com>,
Shakeel Butt <shakeel.butt@linux.dev>,
Andrew Morton <akpm@linux-foundation.org>,
Kemeng Shi <shikemeng@huaweicloud.com>,
Kairui Song <kasong@tencent.com>, Nhat Pham <nphamcs@gmail.com>,
Baoquan He <bhe@redhat.com>, Barry Song <baohua@kernel.org>,
Chris Li <chrisl@kernel.org>,
Johannes Weiner <hannes@cmpxchg.org>,
Roman Gushchin <roman.gushchin@linux.dev>,
Muchun Song <muchun.song@linux.dev>,
David Hildenbrand <david@redhat.com>,
Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
"Liam R. Howlett" <Liam.Howlett@oracle.com>,
Vlastimil Babka <vbabka@suse.cz>, Mike Rapoport <rppt@kernel.org>,
Suren Baghdasaryan <surenb@google.com>,
Chen Yu <yu.c.chen@intel.com>,
"Peter Zijlstra (Intel)" <peterz@infradead.org>,
Usama Arif <usamaarif642@gmail.com>,
Hao Jia <jiahao1@lixiang.com>,
"Kirill A. Shutemov" <kas@kernel.org>,
Oleg Nesterov <oleg@redhat.com>,
Christian Brauner <brauner@kernel.org>,
Mateusz Guzik <mjguzik@gmail.com>,
Steven Rostedt <rostedt@goodmis.org>,
Andrii Nakryiko <andrii@kernel.org>,
Al Viro <viro@zeniv.linux.org.uk>,
Fushuai Wang <wangfushuai@baidu.com>,
linux-mm@kvack.org (open list:MEMORY MANAGEMENT - OOM KILLER),
linux-kernel@vger.kernel.org (open list),
cgroups@vger.kernel.org (open list:CONTROL GROUP - MEMORY
RESOURCE CONTROLLER (MEMCG))
Cc: Lei Liu <liulei.rjpt@vivo.com>
Subject: [PATCH v0 1/2] mm: swap: Gather swap entries and batch async release core
Date: Tue, 9 Sep 2025 14:53:40 +0800 [thread overview]
Message-ID: <20250909065349.574894-2-liulei.rjpt@vivo.com> (raw)
In-Reply-To: <20250909065349.574894-1-liulei.rjpt@vivo.com>
Core functionality implementation for asynchronous release of swap entries:
1. For eligible processes, swap entries are first asynchronously gathered
into a global list
2. A batch release is triggered once a defined threshold is reached
3. The asynchronous release is performed by kworkers of a dedicated
workqueue; a max_active configuration macro is provided to limit the
number of concurrent work items and to address NUMA release-efficiency
issues
Signed-off-by: Lei Liu <liulei.rjpt@vivo.com>
---
include/linux/oom.h | 23 ++++++
include/linux/swapfile.h | 1 +
include/linux/vm_event_item.h | 1 +
kernel/exit.c | 2 +
mm/memcontrol.c | 6 --
mm/memory.c | 4 +-
mm/swapfile.c | 134 ++++++++++++++++++++++++++++++++++
mm/vmstat.c | 1 +
8 files changed, 165 insertions(+), 7 deletions(-)
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 1e0fc6931ce9..aa34429cc83b 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -56,6 +56,23 @@ struct oom_control {
extern struct mutex oom_lock;
extern struct mutex oom_adj_mutex;
+/* Number of tasks currently executing do_exit(); incremented on entry to
+ * do_exit() and decremented just before do_task_dead() (see kernel/exit.c).
+ */
+extern atomic_t exiting_task_count;
+
+/* Read the current count of exiting tasks. */
+static inline int get_exiting_task_count(void)
+{
+ return atomic_read(&exiting_task_count);
+}
+
+/* Account one more task entering do_exit(). */
+static inline void inc_exiting_task_count(void)
+{
+ atomic_inc(&exiting_task_count);
+}
+
+/* Account one task leaving do_exit(). */
+static inline void dec_exiting_task_count(void)
+{
+ atomic_dec(&exiting_task_count);
+}
+
static inline void set_current_oom_origin(void)
{
current->signal->oom_flag_origin = true;
@@ -76,6 +93,12 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
return tsk->signal->oom_mm;
}
+/* task_is_dying - current task is an OOM victim, has a fatal signal
+ * pending, or is already exiting (PF_EXITING).  Moved here from
+ * mm/memcontrol.c so the swap gather path can share it.
+ */
+static inline bool task_is_dying(void)
+{
+ return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
+ (current->flags & PF_EXITING);
+}
+
/*
* Checks whether a page fault on the given mm is still reliable.
* This is no longer true if the oom reaper started to reap the
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 99e3ed469e88..dc43464cd838 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -4,6 +4,7 @@
extern unsigned long generic_max_swapfile_size(void);
unsigned long arch_max_swapfile_size(void);
+int add_to_swap_gather_cache(struct mm_struct *mm, swp_entry_t entry, int nr);
/* Maximum swapfile size supported for the arch (not inclusive). */
extern unsigned long swapfile_maximum_size;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9e15a088ba38..05f33d26d459 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -186,6 +186,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
KSTACK_REST,
#endif
#endif /* CONFIG_DEBUG_STACK_USAGE */
+ ASYNC_SWAP_COUNTS,
NR_VM_EVENT_ITEMS
};
diff --git a/kernel/exit.c b/kernel/exit.c
index 343eb97543d5..c879fe32aa0e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -897,6 +897,7 @@ void __noreturn do_exit(long code)
WARN_ON(irqs_disabled());
WARN_ON(tsk->plug);
+ inc_exiting_task_count();
kcov_task_exit(tsk);
kmsan_task_exit(tsk);
@@ -1001,6 +1002,7 @@ void __noreturn do_exit(long code)
exit_tasks_rcu_finish();
lockdep_free_task(tsk);
+ dec_exiting_task_count();
do_task_dead();
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a94..79bc4321cbb3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -102,12 +102,6 @@ static struct kmem_cache *memcg_pn_cachep;
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
-static inline bool task_is_dying(void)
-{
- return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
- (current->flags & PF_EXITING);
-}
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
diff --git a/mm/memory.c b/mm/memory.c
index 0ba4f6b71847..e09db2932b25 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -75,6 +75,7 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
+#include <linux/swapfile.h>
#include <trace/events/kmem.h>
@@ -1617,7 +1618,8 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
nr = swap_pte_batch(pte, max_nr, ptent);
rss[MM_SWAPENTS] -= nr;
- free_swap_and_cache_nr(entry, nr);
+ if (add_to_swap_gather_cache(tlb->mm, entry, nr))
+ free_swap_and_cache_nr(entry, nr);
} else if (is_migration_entry(entry)) {
struct folio *folio = pfn_swap_entry_folio(entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b4f3cc712580..7c69e726b075 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -42,6 +42,10 @@
#include <linux/suspend.h>
#include <linux/zswap.h>
#include <linux/plist.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/vmstat.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
@@ -170,6 +174,136 @@ static long swap_usage_in_pages(struct swap_info_struct *si)
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL 0x4
+/* Minimum number of exiting tasks required before swap-entry gathering
+ * kicks in; adjustable based on system load.
+ */
+#define MIN_EXITING_TASKS_THRESHOLD 1
+/* max_active for the asynchronous swap-release workqueue.  Zero lets
+ * the workqueue subsystem choose its default; it can also be set
+ * manually based on system load to bound concurrent work items.
+ */
+#define NUM_ASYNC_SWAP_WORK_ITEMS 0
+
+/* Workqueue whose kworkers perform the batched release. */
+static struct workqueue_struct *release_wq;
+/* Global list of gathered swap_entry_cache items; the list and
+ * cache_count are both protected by swap_cache_lock.
+ */
+static LIST_HEAD(swap_cache_list);
+static spinlock_t swap_cache_lock;
+static int cache_count;
+/* Flush threshold: a release work item is queued once this many
+ * entries have been gathered.
+ */
+static int max_cache_entries = 32;
+/* Slab cache backing struct swap_entry_cache allocations. */
+static struct kmem_cache *swap_entry_cachep;
+/* Count of tasks in do_exit(); declared in include/linux/oom.h. */
+atomic_t exiting_task_count = ATOMIC_INIT(0);
+
+/* One gathered range of swap entries awaiting asynchronous release. */
+struct swap_entry_cache {
+ swp_entry_t entry; /* first swap entry of the range */
+ int nr; /* number of contiguous entries in the range */
+ struct list_head list; /* link on swap_cache_list */
+};
+
+/* /proc/aswap_free_counts: report the number of currently exiting tasks
+ * and the number of gathered-but-unreleased swap entry ranges.
+ */
+static int async_swap_free_counts_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "exiting_tasks:%d cache_counts:%d\n",
+ get_exiting_task_count(), cache_count);
+ return 0;
+}
+
+/* Work handler: detach the whole gathered list under swap_cache_lock,
+ * then release every cached swap entry range outside the lock and free
+ * the bookkeeping structures.  The work_struct itself was kmalloc'ed by
+ * flush_cache_if_needed() and is freed here on every path.
+ */
+static void async_release_func(struct work_struct *work)
+{
+ struct swap_entry_cache *sec, *tmp;
+ unsigned int counts = 0;
+ LIST_HEAD(temp_list);
+
+ /* NOTE(review): cache_count is read here without swap_cache_lock.
+  * The check is only a fast-path optimization, but the lockless read
+  * should use READ_ONCE() to be KCSAN-clean — please confirm.
+  */
+ if (cache_count) {
+ spin_lock_irq(&swap_cache_lock);
+ list_splice_init(&swap_cache_list, &temp_list);
+ cache_count = 0;
+ spin_unlock_irq(&swap_cache_lock);
+ } else {
+ goto out;
+ }
+
+ /* temp_list is private now; no locking needed for the walk. */
+ list_for_each_entry_safe(sec, tmp, &temp_list, list) {
+ free_swap_and_cache_nr(sec->entry, sec->nr);
+ kmem_cache_free(swap_entry_cachep, sec);
+ counts++;
+ }
+ /* Account how many ranges were released asynchronously. */
+ count_vm_events(ASYNC_SWAP_COUNTS, counts);
+out:
+ kfree(work);
+}
+
+/* Queue a work item to release the gathered entries.  When
+ * @check_cache_count is true the flush only happens once the number of
+ * gathered ranges reaches max_cache_entries; when false, any non-empty
+ * cache is flushed.  The kmalloc'ed work_struct is freed by
+ * async_release_func().  Allocation failure is tolerated: entries stay
+ * on the list and a later flush will pick them up.
+ */
+static void flush_cache_if_needed(bool check_cache_count)
+{
+ struct work_struct *release_work;
+ /* Lockless heuristic read; snapshot once so both tests below see a
+  * consistent value, and use READ_ONCE() since writers update
+  * cache_count under swap_cache_lock concurrently.
+  */
+ int pending = READ_ONCE(cache_count);
+
+ if ((!check_cache_count && pending) ||
+ pending >= max_cache_entries) {
+ release_work = kmalloc(sizeof(*release_work), GFP_ATOMIC);
+ if (release_work) {
+ INIT_WORK(release_work, async_release_func);
+ queue_work(release_wq, release_work);
+ }
+ }
+}
+
+/*
+ * add_to_swap_gather_cache - Gather a swap entry range for async release.
+ * @mm: Memory descriptor of the task whose entries are being zapped.
+ * @entry: First swap entry of the range to add.
+ * @nr: Number of contiguous swap entries starting at @entry.
+ *
+ * Returns 0 on success, -1 for unmet conditions, -ENOMEM on allocation failure.
+ * On any non-zero return the caller is expected to fall back to
+ * synchronous free_swap_and_cache_nr() (see mm/memory.c).
+ *
+ * Checks task exiting counts, allocates cache entry, adds it to the swap cache
+ * list, and may trigger a cache flush.
+ */
+int add_to_swap_gather_cache(struct mm_struct *mm, swp_entry_t entry, int nr)
+{
+ struct swap_entry_cache *sec;
+
+ /* Only worthwhile when at least one task is in do_exit(). */
+ if (!mm || get_exiting_task_count() < MIN_EXITING_TASKS_THRESHOLD)
+ return -1;
+
+ /* Restrict to dying tasks with a large swap footprint. */
+ if (!task_is_dying() ||
+ get_mm_counter(mm, MM_SWAPENTS) < (100 * SWAP_CLUSTER_MAX))
+ return -1;
+
+ /* GFP_ATOMIC: called from the pte-zapping path under page table locks. */
+ sec = kmem_cache_alloc(swap_entry_cachep, GFP_ATOMIC);
+ if (!sec)
+ return -ENOMEM;
+
+ sec->entry = entry;
+ sec->nr = nr;
+ INIT_LIST_HEAD(&sec->list);
+
+ spin_lock_irq(&swap_cache_lock);
+ list_add_tail(&sec->list, &swap_cache_list);
+ cache_count++;
+ spin_unlock_irq(&swap_cache_lock);
+
+ /* Flush only if the max_cache_entries threshold has been reached. */
+ flush_cache_if_needed(true);
+
+ return 0;
+}
+
+/* Initialize the async swap release machinery: the release workqueue,
+ * the swap_entry_cache slab, the gather-list lock, and the
+ * /proc/aswap_free_counts statistics file.
+ */
+static int __init swap_async_free_setup(void)
+{
+ release_wq = alloc_workqueue("async_swap_free",
+ WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM,
+ NUM_ASYNC_SWAP_WORK_ITEMS);
+ if (!release_wq)
+ return -ENOMEM;
+
+ swap_entry_cachep = KMEM_CACHE(swap_entry_cache, SLAB_ACCOUNT);
+ if (!swap_entry_cachep) {
+ /* Don't leak the workqueue on partial init failure. */
+ destroy_workqueue(release_wq);
+ release_wq = NULL;
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&swap_cache_lock);
+ proc_create_single("aswap_free_counts", 0, NULL,
+ async_swap_free_counts_show);
+
+ return 0;
+}
+
+postcore_initcall(swap_async_free_setup);
+
static bool swap_only_has_cache(struct swap_info_struct *si,
unsigned long offset, int nr_pages)
{
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 71cd1ceba191..fa7fe910becf 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1494,6 +1494,7 @@ const char * const vmstat_text[] = {
[I(KSTACK_REST)] = "kstack_rest",
#endif
#endif
+ [I(ASYNC_SWAP_COUNTS)] = "async_swap_count",
#undef I
#endif /* CONFIG_VM_EVENT_COUNTERS */
};
--
2.34.1
next prev parent reply other threads:[~2025-09-09 6:54 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-09-09 6:53 [PATCH v0 0/2] mm: swap: Gather swap entries and batch async release Lei Liu
2025-09-09 6:53 ` Lei Liu [this message]
2025-09-10 1:39 ` [PATCH v0 1/2] mm: swap: Gather swap entries and batch async release core kernel test robot
2025-09-10 3:12 ` kernel test robot
2025-09-09 6:53 ` [PATCH v0 2/2] mm: swap: Forced swap entries release under memory pressure Lei Liu
2025-09-10 5:36 ` kernel test robot
2025-09-09 7:30 ` [PATCH v0 0/2] mm: swap: Gather swap entries and batch async release Kairui Song
2025-09-09 9:24 ` Barry Song
2025-09-09 16:15 ` Chris Li
2025-09-09 18:01 ` Chris Li
2025-09-10 14:07 ` Lei Liu
2025-10-14 20:42 ` Barry Song
2025-09-09 15:38 ` Chris Li
2025-09-10 14:01 ` Lei Liu
2025-09-09 19:21 ` Shakeel Butt
2025-09-09 19:48 ` Suren Baghdasaryan
2025-09-10 14:14 ` Lei Liu
2025-09-10 14:56 ` Suren Baghdasaryan
2025-09-10 16:05 ` Chris Li
2025-09-10 20:12 ` Shakeel Butt
2025-09-11 3:04 ` Lei Liu
2025-09-10 15:40 ` Chris Li
2025-09-10 20:10 ` Shakeel Butt
2025-09-10 20:41 ` Suren Baghdasaryan
2025-09-10 22:10 ` T.J. Mercier
2025-09-10 22:33 ` Shakeel Butt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250909065349.574894-2-liulei.rjpt@vivo.com \
--to=liulei.rjpt@vivo.com \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=andrii@kernel.org \
--cc=baohua@kernel.org \
--cc=bhe@redhat.com \
--cc=brauner@kernel.org \
--cc=cgroups@vger.kernel.org \
--cc=chrisl@kernel.org \
--cc=david@redhat.com \
--cc=hannes@cmpxchg.org \
--cc=jiahao1@lixiang.com \
--cc=kas@kernel.org \
--cc=kasong@tencent.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=mhocko@suse.com \
--cc=mjguzik@gmail.com \
--cc=muchun.song@linux.dev \
--cc=nphamcs@gmail.com \
--cc=oleg@redhat.com \
--cc=peterz@infradead.org \
--cc=rientjes@google.com \
--cc=roman.gushchin@linux.dev \
--cc=rostedt@goodmis.org \
--cc=rppt@kernel.org \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=surenb@google.com \
--cc=usamaarif642@gmail.com \
--cc=vbabka@suse.cz \
--cc=viro@zeniv.linux.org.uk \
--cc=wangfushuai@baidu.com \
--cc=yu.c.chen@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox