From: Kefeng Wang <wangkefeng.wang@huawei.com>
To: Andrew Morton <akpm@linux-foundation.org>,
David Hildenbrand <david@kernel.org>,
Christian Brauner <brauner@kernel.org>,
Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>,
Jan Kara <jack@suse.cz>,
"Liam R. Howlett" <Liam.Howlett@oracle.com>,
Lorenzo Stoakes <ljs@kernel.org>, Michal Hocko <mhocko@suse.com>,
Mike Rapoport <rppt@kernel.org>,
Suren Baghdasaryan <surenb@google.com>,
Vlastimil Babka <vbabka@kernel.org>,
<linux-fsdevel@vger.kernel.org>, <linux-mm@kvack.org>,
Kefeng Wang <wangkefeng.wang@huawei.com>
Subject: [PATCH RFC] fs: drop_caches: introduce per-node drop_caches interface
Date: Thu, 9 Apr 2026 14:35:03 +0800 [thread overview]
Message-ID: <20260409063503.3475420-1-wangkefeng.wang@huawei.com> (raw)
Add a sysfs interface at /sys/devices/system/node/nodeX/drop_caches
to allow dropping caches on a specific NUMA node.
The existing global drop_caches mechanism (/proc/sys/vm/drop_caches)
operates across all NUMA nodes indiscriminately, causing:
- Unnecessary eviction of hot cache on some nodes
- Performance degradation for applications with NUMA affinity
- Long times spent on large systems with lots of memory
By exposing a per-node interface, administrators can:
- Target specific nodes experiencing memory pressure
- Preserve cache on unaffected nodes
- Perform reclamation with finer granularity
One use case is hot-pluggable NUMA nodes: during hot-remove, simply
dropping the page cache is far more efficient than migrating large
amounts of pages to other nodes, and it also eliminates the risk of
accessing potentially faulty memory.
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
Documentation/ABI/stable/sysfs-devices-node | 10 ++
drivers/base/node.c | 2 +
fs/drop_caches.c | 101 ++++++++++++++++----
include/linux/fs.h | 9 ++
include/linux/mm.h | 2 +-
include/linux/pagemap.h | 2 +
mm/fadvise.c | 2 +-
mm/filemap.c | 12 ++-
mm/internal.h | 16 +++-
mm/truncate.c | 17 +++-
mm/vmscan.c | 11 ++-
11 files changed, 145 insertions(+), 39 deletions(-)
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 2d0e023f22a7..ec282291337a 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -236,3 +236,13 @@ Description:
This interface is equivalent to the memcg variant.
See Documentation/admin-guide/cgroup-v2.rst
+
+What: /sys/devices/system/node/nodeX/drop_caches
+Date: Mar 2026
+Contact: Linux Memory Management list <linux-mm@kvack.org>
+Description:
+ Drop clean page cache and/or reclaimable slab objects (dentries
+ and inodes) on a specific NUMA node. This is the per-node
+ equivalent of /proc/sys/vm/drop_caches
+
+ See Documentation/admin-guide/sysctl/vm.rst:drop_caches
diff --git a/drivers/base/node.c b/drivers/base/node.c
index d7647d077b66..878c2cdef2e1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -896,6 +896,7 @@ int register_node(int nid)
hugetlb_register_node(node);
compaction_register_node(node);
reclaim_register_node(node);
+ drop_caches_register_node(node);
/* link cpu under this node */
for_each_present_cpu(cpu) {
@@ -924,6 +925,7 @@ void unregister_node(int nid)
hugetlb_unregister_node(node);
compaction_unregister_node(node);
reclaim_unregister_node(node);
+ drop_caches_unregister_node(node);
node_remove_accesses(node);
node_remove_caches(node);
device_unregister(&node->dev);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 49f56a598ecb..af88b24cb94f 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -16,9 +16,10 @@
/* A global variable is a bit ugly, but it keeps the code simple */
static int sysctl_drop_caches;
-static void drop_pagecache_sb(struct super_block *sb, void *unused)
+static void drop_pagecache_sb(struct super_block *sb, void *node)
{
struct inode *inode, *toput_inode = NULL;
+ int nid = *(int *)node;
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
@@ -37,7 +38,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
- invalidate_mapping_pages(inode->i_mapping, 0, -1);
+ invalidate_node_mapping_pages(inode->i_mapping, 0, -1, nid);
iput(toput_inode);
toput_inode = inode;
@@ -48,6 +49,47 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
iput(toput_inode);
}
+static unsigned long has_caches(int nid)
+{
+ unsigned long nr;
+
+ if (nid >= 0)
+ nr = node_page_state(NODE_DATA(nid), NR_FILE_PAGES);
+ else
+ nr = global_node_page_state(NR_FILE_PAGES);
+
+ return nr;
+}
+
+static void drop_caches_handler(int flags, int nid)
+{
+ static int stfu;
+
+ if (flags & 1) {
+ if (!has_caches(nid))
+ return;
+
+ lru_add_drain_all();
+ iterate_supers(drop_pagecache_sb, &nid);
+ count_vm_event(DROP_PAGECACHE);
+ }
+
+ if (flags & 2) {
+ drop_slab(nid);
+ count_vm_event(DROP_SLAB);
+ }
+
+ if (!stfu) {
+ if (nid >= 0)
+ pr_info("%s (%d): drop_caches: %d on node %d\n",
+ current->comm, task_pid_nr(current), flags, nid);
+ else
+ pr_info("%s (%d): drop_caches: %d\n",
+ current->comm, task_pid_nr(current), flags);
+ }
+ stfu |= flags & 4;
+}
+
static int drop_caches_sysctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
@@ -56,25 +98,8 @@ static int drop_caches_sysctl_handler(const struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (ret)
return ret;
- if (write) {
- static int stfu;
-
- if (sysctl_drop_caches & 1) {
- lru_add_drain_all();
- iterate_supers(drop_pagecache_sb, NULL);
- count_vm_event(DROP_PAGECACHE);
- }
- if (sysctl_drop_caches & 2) {
- drop_slab();
- count_vm_event(DROP_SLAB);
- }
- if (!stfu) {
- pr_info("%s (%d): drop_caches: %d\n",
- current->comm, task_pid_nr(current),
- sysctl_drop_caches);
- }
- stfu |= sysctl_drop_caches & 4;
- }
+ if (write)
+ drop_caches_handler(sysctl_drop_caches, NUMA_NO_NODE);
return 0;
}
@@ -96,3 +121,37 @@ static int __init init_vm_drop_caches_sysctls(void)
return 0;
}
fs_initcall(init_vm_drop_caches_sysctls);
+
+#ifdef CONFIG_NUMA
+/* The range of input is same as sysctl_drop_caches */
+#define INPUT_MIN 1
+#define INPUT_MAX 4
+static ssize_t drop_caches_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ int nid = dev->id;
+ int input;
+
+ if (kstrtoint(buf, 0, &input))
+ return -EINVAL;
+
+ if (input > INPUT_MAX || input < INPUT_MIN)
+ return -EINVAL;
+
+ if (nid >= 0 && nid < nr_node_ids && node_online(nid))
+ drop_caches_handler(input, nid);
+
+ return count;
+}
+
+static DEVICE_ATTR_WO(drop_caches);
+int drop_caches_register_node(struct node *node)
+{
+ return device_create_file(&node->dev, &dev_attr_drop_caches);
+}
+
+void drop_caches_unregister_node(struct node *node)
+{
+ device_remove_file(&node->dev, &dev_attr_drop_caches);
+}
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e1d257e6da68..31d8b9b20425 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -72,6 +72,7 @@ struct fs_parameter_spec;
struct file_kattr;
struct iomap_ops;
struct delegated_inode;
+struct node;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -3664,4 +3665,12 @@ static inline bool extensible_ioctl_valid(unsigned int cmd_a,
return true;
}
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+int drop_caches_register_node(struct node *node);
+void drop_caches_unregister_node(struct node *node);
+#else
+static inline int drop_caches_register_node(struct node *node) { return 0; }
+static inline void drop_caches_unregister_node(struct node *node) {}
+#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+
#endif /* _LINUX_FS_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b776907152e..f252564841b9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4838,7 +4838,7 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm);
-void drop_slab(void);
+void drop_slab(int nid);
#ifndef CONFIG_MMU
#define randomize_va_space 0
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 31a848485ad9..3e1c5b8b5c69 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -20,6 +20,8 @@ struct folio_batch;
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end);
+unsigned long invalidate_node_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, int nid);
static inline void invalidate_remote_inode(struct inode *inode)
{
diff --git a/mm/fadvise.c b/mm/fadvise.c
index b63fe21416ff..60307c505278 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -155,7 +155,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
lru_add_drain();
mapping_try_invalidate(mapping, start_index, end_index,
- &nr_failed);
+ NUMA_NO_NODE, &nr_failed);
/*
* The failures may be due to the folio being
diff --git a/mm/filemap.c b/mm/filemap.c
index c568d9058ff8..f962f980ef55 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2142,10 +2142,11 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
}
/**
- * find_lock_entries - Find a batch of pagecache entries.
+ * __find_lock_entries - Find a batch of pagecache entries from a specific node
* @mapping: The address_space to search.
* @start: The starting page cache index.
* @end: The final page index (inclusive).
+ * @nid NUMA node ID to filter entries.
* @fbatch: Where the resulting entries are placed.
* @indices: The cache indices of the entries in @fbatch.
*
@@ -2153,7 +2154,8 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
* Swap, shadow and DAX entries are included. Folios are returned
* locked and with an incremented refcount. Folios which are locked
* by somebody else or under writeback are skipped. Folios which are
- * partially outside the range are not returned.
+ * partially outside the range are not returned. Only entries belonging
+ * to nid will be returned, pass NUMA_NO_NODE to search all nodes.
*
* The entries have ascending indexes. The indices may not be consecutive
* due to not-present entries, large folios, folios which could not be
@@ -2161,8 +2163,8 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
*
* Return: The number of entries which were found.
*/
-unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
- pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
+unsigned int __find_lock_entries(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, int nid, struct folio_batch *fbatch, pgoff_t *indices)
{
XA_STATE(xas, &mapping->i_pages, *start);
struct folio *folio;
@@ -2181,6 +2183,8 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
/* Omit large folio which extends beyond the end */
if (base + nr - 1 > end)
goto put;
+ if (nid >= 0 && folio_nid(folio) != nid)
+ goto put;
if (!folio_trylock(folio))
goto put;
if (folio->mapping != mapping ||
diff --git a/mm/internal.h b/mm/internal.h
index 41398ecb2201..3bb88b0017f8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -553,8 +553,18 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
force_page_cache_ra(&ractl, nr_to_read);
}
-unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
- pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
+unsigned int __find_lock_entries(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, int nid, struct folio_batch *fbatch,
+ pgoff_t *indices);
+
+static inline unsigned int find_lock_entries(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch,
+ pgoff_t *indices)
+{
+ return __find_lock_entries(mapping, start, end, NUMA_NO_NODE, fbatch,
+ indices);
+}
+
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
void filemap_free_folio(struct address_space *mapping, struct folio *folio);
@@ -563,7 +573,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
loff_t end);
long mapping_evict_folio(struct address_space *mapping, struct folio *folio);
unsigned long mapping_try_invalidate(struct address_space *mapping,
- pgoff_t start, pgoff_t end, unsigned long *nr_failed);
+ pgoff_t start, pgoff_t end, int nid, unsigned long *nr_failed);
/**
* folio_evictable - Test whether a folio is evictable.
diff --git a/mm/truncate.c b/mm/truncate.c
index 2931d66c16d0..09cc50083ac8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -522,17 +522,19 @@ void truncate_inode_pages_final(struct address_space *mapping)
EXPORT_SYMBOL(truncate_inode_pages_final);
/**
- * mapping_try_invalidate - Invalidate all the evictable folios of one inode
+ * mapping_try_invalidate - Invalidate all the evictable folios from a
+ * specific NUMA node of one inode
* @mapping: the address_space which holds the folios to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
+ * @nid: the NUMA node id to invalidate from, or NUMA_NO_NODE for all nodes
* @nr_failed: How many folio invalidations failed
*
* This function is similar to invalidate_mapping_pages(), except that it
* returns the number of folios which could not be evicted in @nr_failed.
*/
unsigned long mapping_try_invalidate(struct address_space *mapping,
- pgoff_t start, pgoff_t end, unsigned long *nr_failed)
+ pgoff_t start, pgoff_t end, int nid, unsigned long *nr_failed)
{
pgoff_t indices[FOLIO_BATCH_SIZE];
struct folio_batch fbatch;
@@ -542,7 +544,7 @@ unsigned long mapping_try_invalidate(struct address_space *mapping,
int i;
folio_batch_init(&fbatch);
- while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
+ while (__find_lock_entries(mapping, &index, end, nid, &fbatch, indices)) {
bool xa_has_values = false;
int nr = folio_batch_count(&fbatch);
@@ -599,10 +601,17 @@ unsigned long mapping_try_invalidate(struct address_space *mapping,
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- return mapping_try_invalidate(mapping, start, end, NULL);
+ return mapping_try_invalidate(mapping, start, end, NUMA_NO_NODE, NULL);
}
EXPORT_SYMBOL(invalidate_mapping_pages);
+unsigned long invalidate_node_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, int nid)
+{
+ return mapping_try_invalidate(mapping, start, end, nid, NULL);
+}
+EXPORT_SYMBOL(invalidate_node_mapping_pages);
+
static int folio_launder(struct address_space *mapping, struct folio *folio)
{
if (!folio_test_dirty(folio))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a8c8fcccbfc..84eba9ab5d25 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -418,19 +418,20 @@ static unsigned long drop_slab_node(int nid)
return freed;
}
-void drop_slab(void)
+void drop_slab(int nid)
{
- int nid;
- int shift = 0;
+ int i, shift = 0;
unsigned long freed;
+ nodemask_t n_mask;
+ n_mask = nid >= 0 ? nodemask_of_node(nid) : node_online_map;
do {
freed = 0;
- for_each_online_node(nid) {
+ for_each_node_mask(i, n_mask) {
if (fatal_signal_pending(current))
return;
- freed += drop_slab_node(nid);
+ freed += drop_slab_node(i);
}
} while ((freed >> shift++) > 1);
}
--
2.27.0
next reply other threads:[~2026-04-09 6:35 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-09 6:35 Kefeng Wang [this message]
2026-04-09 7:06 ` Michal Hocko
2026-04-09 7:19 ` Lorenzo Stoakes
2026-04-09 8:21 ` Kefeng Wang
2026-04-09 8:27 ` Lorenzo Stoakes
2026-04-09 8:08 ` Kefeng Wang
2026-04-09 8:22 ` Michal Hocko
2026-04-09 8:54 ` Kefeng Wang
2026-04-09 10:52 ` Michal Hocko
2026-04-09 8:30 ` Lorenzo Stoakes
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260409063503.3475420-1-wangkefeng.wang@huawei.com \
--to=wangkefeng.wang@huawei.com \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=brauner@kernel.org \
--cc=david@kernel.org \
--cc=jack@suse.cz \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=mhocko@suse.com \
--cc=rppt@kernel.org \
--cc=surenb@google.com \
--cc=vbabka@kernel.org \
--cc=viro@zeniv.linux.org.uk \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox