From: Vlastimil Babka <vbabka@suse.cz>
To: Harry Yoo <harry.yoo@oracle.com>,
Petr Tesarik <ptesarik@suse.com>,
Christoph Lameter <cl@gentwo.org>,
David Rientjes <rientjes@google.com>,
Roman Gushchin <roman.gushchin@linux.dev>
Cc: Hao Li <hao.li@linux.dev>,
Andrew Morton <akpm@linux-foundation.org>,
Uladzislau Rezki <urezki@gmail.com>,
"Liam R. Howlett" <Liam.Howlett@oracle.com>,
Suren Baghdasaryan <surenb@google.com>,
Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
Alexei Starovoitov <ast@kernel.org>,
linux-mm@kvack.org, linux-kernel@vger.kernel.org,
linux-rt-devel@lists.linux.dev, bpf@vger.kernel.org,
kasan-dev@googlegroups.com, Vlastimil Babka <vbabka@suse.cz>
Subject: [PATCH RFC v2 16/20] slab: refill sheaves from all nodes
Date: Mon, 12 Jan 2026 16:17:10 +0100
Message-ID: <20260112-sheaves-for-all-v2-16-98225cfb50cf@suse.cz>
In-Reply-To: <20260112-sheaves-for-all-v2-0-98225cfb50cf@suse.cz>

__refill_objects() currently only attempts to get partial slabs from the
local node and then allocates new slab(s). Expand it to also try other
nodes, observing the remote node defrag ratio, similarly to
get_any_partial().

This will prevent allocating new slabs on a node while other nodes have
many free slabs. It does mean sheaves will contain non-local objects in
that case. Allocations that care about a specific node will still be
served appropriately, but might get a slowpath allocation.

Like get_any_partial(), we do observe cpuset_zone_allowed(), although we
might be refilling a sheaf that will then be used from a different
allocation context.

We can also use the resulting refill_objects() in
__kmem_cache_alloc_bulk() for non-debug caches. This means
kmem_cache_alloc_bulk() will get better performance when sheaves are
exhausted. kmem_cache_alloc_bulk() cannot indicate a preferred node, so
it is compatible with the sheaf refill preferring the local node. Its
users also pass gfp flags that allow spinning, so document that as a
requirement.
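
For reference, a minimal caller sketch that satisfies both documented
conditions: process context with interrupts enabled, and gfp flags that
allow spinning. It is illustrative only; the cache argument and the
batch size of 16 are hypothetical.

#include <linux/errno.h>
#include <linux/slab.h>

static int alloc_batch_example(struct kmem_cache *cache)
{
        void *objs[16];

        /*
         * GFP_KERNEL allows spinning; bulk allocation is all-or-nothing,
         * returning the full count on success and 0 on failure.
         */
        if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, 16, objs))
                return -ENOMEM;

        /* ... use objs[0..15] ... */

        kmem_cache_free_bulk(cache, 16, objs);
        return 0;
}

When sheaves are exhausted, such a caller now refills in bulk via
refill_objects() instead of looping over ___slab_alloc() per object.
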
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
mm/slub.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 106 insertions(+), 31 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 088b4f6f81fa..602674d56ae6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2506,8 +2506,8 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
}
static unsigned int
-__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
- unsigned int max);
+refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max);
static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
gfp_t gfp)
@@ -2518,8 +2518,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
if (!to_fill)
return 0;
- filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
- to_fill, to_fill);
+ filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill,
+ to_fill);
sheaf->size += filled;
@@ -6515,29 +6515,22 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
EXPORT_SYMBOL(kmem_cache_free_bulk);
static unsigned int
-__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
- unsigned int max)
+__refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max, struct kmem_cache_node *n)
{
struct slab *slab, *slab2;
struct partial_context pc;
unsigned int refilled = 0;
unsigned long flags;
void *object;
- int node;
pc.flags = gfp;
pc.min_objects = min;
pc.max_objects = max;
- node = numa_mem_id();
-
- if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+ if (!get_partial_node_bulk(s, n, &pc))
return 0;
- /* TODO: consider also other nodes? */
- if (!get_partial_node_bulk(s, get_node(s, node), &pc))
- goto new_slab;
-
list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
list_del(&slab->slab_list);
@@ -6575,8 +6568,6 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
}
if (unlikely(!list_empty(&pc.slabs))) {
- struct kmem_cache_node *n = get_node(s, node);
-
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
@@ -6598,13 +6589,92 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
}
}
+ return refilled;
+}
- if (likely(refilled >= min))
- goto out;
+#ifdef CONFIG_NUMA
+static unsigned int
+__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max, int local_node)
+{
+ struct zonelist *zonelist;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type highest_zoneidx = gfp_zone(gfp);
+ unsigned int cpuset_mems_cookie;
+ unsigned int refilled = 0;
+
+ /* see get_any_partial() for the defrag ratio description */
+ if (!s->remote_node_defrag_ratio ||
+ get_cycles() % 1024 > s->remote_node_defrag_ratio)
+ return 0;
+
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(mempolicy_slab_node(), gfp);
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
+ struct kmem_cache_node *n;
+ unsigned int r;
+
+ n = get_node(s, zone_to_nid(zone));
+
+ if (!n || !cpuset_zone_allowed(zone, gfp) ||
+ n->nr_partial <= s->min_partial)
+ continue;
+
+ r = __refill_objects_node(s, p, gfp, min, max, n);
+ refilled += r;
+
+ if (r >= min) {
+ /*
+ * Don't check read_mems_allowed_retry() here -
+ * if mems_allowed was updated in parallel, that
+ * was a harmless race between allocation and
+ * the cpuset update
+ */
+ return refilled;
+ }
+ p += r;
+ min -= r;
+ max -= r;
+ }
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+ return refilled;
+}
+#else
+static inline unsigned int
+__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max, int local_node)
+{
+ return 0;
+}
+#endif
+
+static unsigned int
+refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max)
+{
+ int local_node = numa_mem_id();
+ unsigned int refilled;
+ struct slab *slab;
+
+ if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+ return 0;
+
+ refilled = __refill_objects_node(s, p, gfp, min, max,
+ get_node(s, local_node));
+ if (refilled >= min)
+ return refilled;
+
+ refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled,
+ max - refilled, local_node);
+ if (refilled >= min)
+ return refilled;
new_slab:
- slab = new_slab(s, pc.flags, node);
+ slab = new_slab(s, gfp, local_node);
if (!slab)
goto out;
@@ -6620,8 +6690,8 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
if (refilled < min)
goto new_slab;
-out:
+out:
return refilled;
}
@@ -6631,18 +6701,20 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
{
int i;
- /*
- * TODO: this might be more efficient (if necessary) by reusing
- * __refill_objects()
- */
- for (i = 0; i < size; i++) {
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
+ for (i = 0; i < size; i++) {
- p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_,
- s->object_size);
- if (unlikely(!p[i]))
- goto error;
+ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_,
+ s->object_size);
+ if (unlikely(!p[i]))
+ goto error;
- maybe_wipe_obj_freeptr(s, p[i]);
+ maybe_wipe_obj_freeptr(s, p[i]);
+ }
+ } else {
+ i = refill_objects(s, p, flags, size, size);
+ if (i < size)
+ goto error;
}
return i;
@@ -6653,7 +6725,10 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
}
-/* Note that interrupts must be enabled when calling this function. */
+/*
+ * Note that interrupts must be enabled when calling this function and gfp
+ * flags must allow spinning.
+ */
int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
--
2.52.0