From: Vlastimil Babka <vbabka@suse.cz>
To: Suren Baghdasaryan <surenb@google.com>,
	 "Liam R. Howlett" <Liam.Howlett@oracle.com>,
	 Christoph Lameter <cl@linux.com>,
	David Rientjes <rientjes@google.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>,
	 Harry Yoo <harry.yoo@oracle.com>,
	Uladzislau Rezki <urezki@gmail.com>,
	 linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	rcu@vger.kernel.org,  maple-tree@lists.infradead.org,
	vbabka@suse.cz
Subject: [PATCH v4 9/9] mm, slub: skip percpu sheaves for remote object freeing
Date: Fri, 25 Apr 2025 10:27:29 +0200	[thread overview]
Message-ID: <20250425-slub-percpu-caches-v4-9-8a636982b4a4@suse.cz> (raw)
In-Reply-To: <20250425-slub-percpu-caches-v4-0-8a636982b4a4@suse.cz>

Since we don't control the NUMA locality of objects in percpu sheaves,
allocations with node restrictions bypass them. Allocations without
restrictions may however still expect to get local objects with high
probability, and the introduction of sheaves can decrease that
probability due to freed objects from a remote node ending up in the
percpu sheaves.

The fraction of such remote frees seems low (5% on an 8-node machine),
but it can be expected that some cache- or workload-specific corner
cases exist. We can either conclude that this is not a problem due to
the low fraction, or we can make remote frees bypass percpu sheaves and
go directly to their slabs. This makes the remote frees more expensive,
but if it's only a small fraction, most frees will still benefit from
the lower overhead of percpu sheaves.

This patch thus makes remote object freeing bypass percpu sheaves,
including bulk freeing and kfree_rcu() via the rcu_free sheaf. However,
it's not intended to be a 100% guarantee that percpu sheaves will only
contain local objects. The refill from slabs does not provide that
guarantee in the first place, and there might be cpu migrations
happening when we need to unlock the local_lock. Avoiding all that
would be possible but complicated, so we can leave it for later
investigation whether it would be worth it. It can be expected that the
more selective freeing will itself prevent accumulation of remote
objects in percpu sheaves, so any such violations would have only
short-term effects.

Another possible optimization to investigate is whether it would be
beneficial for node-restricted or strict_numa allocations to attempt to
obtain an object from percpu sheaves if the node or mempolicy (i.e.
MPOL_LOCAL) happens to want the local node of the allocating cpu. Right
now such allocations bypass sheaves, but they could first check whether
the first available object in the percpu sheaves is local, succeed with
high probability, and only bypass the sheaves in cases where it's not
local.
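That suggested peek could be modeled like this; the struct and function
names are hypothetical, and the per-object node array stands in for
what the kernel would derive via slab_nid(virt_to_slab(obj)):

```c
#include <assert.h>
#include <stdbool.h>

#define SHEAF_CAP 8

/*
 * Toy model of a sheaf: objects are stacked, size indexes the next
 * free slot, so objects[size - 1] is the first object a refill would
 * hand out. Here we track only each stacked object's node id.
 */
struct sheaf_model {
	unsigned int size;
	int obj_node[SHEAF_CAP];
};

/*
 * Sketch of the proposed check: a node-restricted allocation could
 * take the sheaf fast path only if the next-to-pop object is already
 * on the wanted node, and fall back to the slab paths otherwise.
 */
static bool sheaf_top_matches_node(const struct sheaf_model *s, int want_node)
{
	if (s->size == 0)
		return false;
	return s->obj_node[s->size - 1] == want_node;
}
```

Since remote frees now bypass the sheaves, such a peek would succeed
with high probability in the common case.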

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slab_common.c |  7 +++++--
 mm/slub.c        | 43 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6c3b90f03cb79b57f426824450f576a977d85c53..af4e225372fa2d1e7d0f55a90b5335a29a36d2ea 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1623,8 +1623,11 @@ static bool kfree_rcu_sheaf(void *obj)
 
 	slab = folio_slab(folio);
 	s = slab->slab_cache;
-	if (s->cpu_sheaves)
-		return __kfree_rcu_sheaf(s, obj);
+	if (s->cpu_sheaves) {
+		if (likely(!IS_ENABLED(CONFIG_NUMA) ||
+			   slab_nid(slab) == numa_node_id()))
+			return __kfree_rcu_sheaf(s, obj);
+	}
 
 	return false;
 }
diff --git a/mm/slub.c b/mm/slub.c
index cc273cc45f632e16644355831132cdc391219cec..2bf83e2b85b23f4db2b311edaded4bef6b7d01de 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -455,6 +455,7 @@ struct slab_sheaf {
 	};
 	struct kmem_cache *cache;
 	unsigned int size;
+	int node; /* only used for rcu_sheaf */
 	void *objects[];
 };
 
@@ -5649,7 +5650,7 @@ static void rcu_free_sheaf(struct rcu_head *head)
 	 */
 	__rcu_free_sheaf_prepare(s, sheaf);
 
-	barn = get_node(s, numa_mem_id())->barn;
+	barn = get_node(s, sheaf->node)->barn;
 
 	/* due to slab_free_hook() */
 	if (unlikely(sheaf->size == 0))
@@ -5724,10 +5725,12 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
 
 	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
 
-	if (likely(rcu_sheaf->size < s->sheaf_capacity))
+	if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
 		rcu_sheaf = NULL;
-	else
+	} else {
 		pcs->rcu_free = NULL;
+		rcu_sheaf->node = numa_node_id();
+	}
 
 	local_unlock(&s->cpu_sheaves->lock);
 
@@ -5753,9 +5756,13 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 	struct slab_sheaf *main, *empty;
 	unsigned int batch, i = 0;
 	bool init;
+	void *remote_objects[PCS_BATCH_MAX];
+	unsigned int remote_nr = 0;
+	int node = numa_node_id();
 
 	init = slab_want_init_on_free(s);
 
+next_remote_batch:
 	while (i < size) {
 		struct slab *slab = virt_to_slab(p[i]);
 
@@ -5765,7 +5772,15 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 		if (unlikely(!slab_free_hook(s, p[i], init, false))) {
 			p[i] = p[--size];
 			if (!size)
-				return;
+				goto flush_remote;
+			continue;
+		}
+
+		if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) {
+			remote_objects[remote_nr] = p[i];
+			p[i] = p[--size];
+			if (++remote_nr >= PCS_BATCH_MAX)
+				goto flush_remote;
 			continue;
 		}
 
@@ -5833,6 +5848,15 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 	 */
 fallback:
 	__kmem_cache_free_bulk(s, size, p);
+
+flush_remote:
+	if (remote_nr) {
+		__kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
+		if (i < size) {
+			remote_nr = 0;
+			goto next_remote_batch;
+		}
+	}
 }
 
 #ifndef CONFIG_SLUB_TINY
@@ -5924,8 +5948,15 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
 	if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
 		return;
 
-	if (!s->cpu_sheaves || !free_to_pcs(s, object))
-		do_slab_free(s, slab, object, object, 1, addr);
+	if (s->cpu_sheaves) {
+		if (likely(!IS_ENABLED(CONFIG_NUMA) ||
+			   slab_nid(slab) == numa_node_id())) {
+			free_to_pcs(s, object);
+			return;
+		}
+	}
+
+	do_slab_free(s, slab, object, object, 1, addr);
 }
 
 #ifdef CONFIG_MEMCG

-- 
2.49.0



Thread overview: 35+ messages
2025-04-25  8:27 [PATCH v4 0/9] SLUB percpu sheaves Vlastimil Babka
2025-04-25  8:27 ` [PATCH v4 1/9] slab: add opt-in caching layer of " Vlastimil Babka
2025-04-25 17:31   ` Christoph Lameter (Ampere)
2025-04-28  7:01     ` Vlastimil Babka
2025-05-06 17:32       ` Suren Baghdasaryan
2025-05-06 23:11         ` Suren Baghdasaryan
2025-04-29  1:08   ` Harry Yoo
2025-05-13 16:08     ` Vlastimil Babka
2025-05-06 23:14   ` Suren Baghdasaryan
2025-05-14 13:06     ` Vlastimil Babka
2025-04-25  8:27 ` [PATCH v4 2/9] slab: add sheaf support for batching kfree_rcu() operations Vlastimil Babka
2025-04-29  7:36   ` Harry Yoo
2025-05-14 13:07     ` Vlastimil Babka
2025-05-06 21:34   ` Suren Baghdasaryan
2025-05-14 14:01     ` Vlastimil Babka
2025-05-15  8:45       ` Vlastimil Babka
2025-05-15 15:03         ` Suren Baghdasaryan
2025-04-25  8:27 ` [PATCH v4 3/9] slab: sheaf prefilling for guaranteed allocations Vlastimil Babka
2025-05-06 22:54   ` Suren Baghdasaryan
2025-05-07  9:15   ` Harry Yoo
2025-05-07  9:20     ` Harry Yoo
2025-05-15  8:41     ` Vlastimil Babka
2025-04-25  8:27 ` [PATCH v4 4/9] slab: determine barn status racily outside of lock Vlastimil Babka
2025-04-25  8:27 ` [PATCH v4 5/9] tools: Add testing support for changes to rcu and slab for sheaves Vlastimil Babka
2025-04-25  8:27 ` [PATCH v4 6/9] tools: Add sheaves support to testing infrastructure Vlastimil Babka
2025-04-25  8:27 ` [PATCH v4 7/9] maple_tree: use percpu sheaves for maple_node_cache Vlastimil Babka
2025-04-25  8:27 ` [PATCH v4 8/9] mm, vma: use percpu sheaves for vm_area_struct cache Vlastimil Babka
2025-05-06 23:08   ` Suren Baghdasaryan
2025-04-25  8:27 ` Vlastimil Babka [this message]
2025-04-25 17:35   ` [PATCH v4 9/9] mm, slub: skip percpu sheaves for remote object freeing Christoph Lameter (Ampere)
2025-04-28  7:08     ` Vlastimil Babka
2025-05-07 10:39   ` Harry Yoo
2025-05-15  8:59     ` Vlastimil Babka
2025-05-15 12:46 ` [PATCH v4 0/9] SLUB percpu sheaves Vlastimil Babka
2025-05-15 15:01   ` Suren Baghdasaryan
