linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Harry Yoo <harry.yoo@oracle.com>
To: Vlastimil Babka <vbabka@suse.cz>,
	Christoph Lameter <cl@gentwo.org>,
	David Rientjes <rientjes@google.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Dennis Zhou <dennis@kernel.org>, Tejun Heo <tj@kernel.org>,
	Mateusz Guzik <mjguzik@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>,
	Cong Wang <xiyou.wangcong@gmail.com>,
	Jiri Pirko <jiri@resnulli.us>, Vlad Buslov <vladbu@nvidia.com>,
	Yevgeny Kliteynik <kliteyn@nvidia.com>, Jan Kara <jack@suse.cz>,
	Byungchul Park <byungchul@sk.com>,
	linux-mm@kvack.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Harry Yoo <harry.yoo@oracle.com>
Subject: [RFC PATCH 4/7] net/sched/act_api: use slab ctor/dtor to reduce contention on pcpu alloc
Date: Thu, 24 Apr 2025 17:07:52 +0900	[thread overview]
Message-ID: <20250424080755.272925-5-harry.yoo@oracle.com> (raw)
In-Reply-To: <20250424080755.272925-1-harry.yoo@oracle.com>

A tc_action object allocates three percpu memory regions and stores their
pointers within the object. For each object's lifetime, this requires
acquiring and releasing the global lock, pcpu_alloc_mutex, three times.

In workloads that frequently create and destroy TC filters, this leads
to severe lock contention due to the global lock.

By using the slab constructor/destructor pair, the contention on
pcpu_alloc_mutex is shifted to the creation and destruction of slabs
(which contain multiple objects), which occur far less frequently than
allocating and freeing individual tc_action objects.

When tested with the following command, a 26% reduction in system time
was observed.

$ cd tools/testing/selftests/tc-testing
$ sudo python3 tdc.py -f ./tc-tests/filters/flower.json -d <NIC name>

Lock contention as measured with `perf lock record/report`:

Before:
                Name   acquired  contended     avg wait   total wait     max wait     min wait

                       15042346   15042346      3.82 us     57.40 s       4.00 ms       316 ns
    pcpu_alloc_mutex   10959650   10959650      7.10 us      1.30 m       3.76 ms       313 ns

After:
                Name   acquired  contended     avg wait   total wait     max wait     min wait

                       15488031   15488031      5.16 us      1.33 m  5124095.50 h        316 ns
    pcpu_alloc_mutex    7695276    7695276      3.39 us     26.07 s       4.03 ms       284 ns

The contention has moved from pcpu_alloc_mutex to other locks (which are not
symbolized and appear as blank in the output above).

Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
---
 net/sched/act_api.c | 82 +++++++++++++++++++++++++++++++--------------
 1 file changed, 56 insertions(+), 26 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 839790043256..60cde766135a 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -112,6 +112,8 @@ struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
 }
 EXPORT_SYMBOL(tcf_action_set_ctrlact);
 
+static struct kmem_cache *tc_action_cache;
+
 /* XXX: For standalone actions, we don't need a RCU grace period either, because
  * actions are always connected to filters and filters are already destroyed in
  * RCU callbacks, so after a RCU grace period actions are already disconnected
@@ -121,15 +123,15 @@ static void free_tcf(struct tc_action *p)
 {
 	struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1);
 
-	free_percpu(p->cpu_bstats);
-	free_percpu(p->cpu_bstats_hw);
-	free_percpu(p->cpu_qstats);
 
 	tcf_set_action_cookie(&p->user_cookie, NULL);
 	if (chain)
 		tcf_chain_put_by_act(chain);
 
-	kfree(p);
+	if (p->cpu_bstats)
+		kmem_cache_free(tc_action_cache, p);
+	else
+		kfree(p);
 }
 
 static void offload_action_hw_count_set(struct tc_action *act,
@@ -778,27 +780,20 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 		   struct tc_action **a, const struct tc_action_ops *ops,
 		   int bind, bool cpustats, u32 flags)
 {
-	struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
+	struct tc_action *p;
 	struct tcf_idrinfo *idrinfo = tn->idrinfo;
 	int err = -ENOMEM;
 
+	if (cpustats)
+		p = kmem_cache_alloc(tc_action_cache, GFP_KERNEL);
+	else
+		p = kzalloc(ops->size, GFP_KERNEL);
+
 	if (unlikely(!p))
 		return -ENOMEM;
 	refcount_set(&p->tcfa_refcnt, 1);
 	if (bind)
 		atomic_set(&p->tcfa_bindcnt, 1);
-
-	if (cpustats) {
-		p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
-		if (!p->cpu_bstats)
-			goto err1;
-		p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
-		if (!p->cpu_bstats_hw)
-			goto err2;
-		p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
-		if (!p->cpu_qstats)
-			goto err3;
-	}
 	gnet_stats_basic_sync_init(&p->tcfa_bstats);
 	gnet_stats_basic_sync_init(&p->tcfa_bstats_hw);
 	spin_lock_init(&p->tcfa_lock);
@@ -812,7 +807,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 					&p->tcfa_rate_est,
 					&p->tcfa_lock, false, est);
 		if (err)
-			goto err4;
+			goto err;
 	}
 
 	p->idrinfo = idrinfo;
@@ -820,14 +815,11 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 	p->ops = ops;
 	*a = p;
 	return 0;
-err4:
-	free_percpu(p->cpu_qstats);
-err3:
-	free_percpu(p->cpu_bstats_hw);
-err2:
-	free_percpu(p->cpu_bstats);
-err1:
-	kfree(p);
+err:
+	if (cpustats)
+		kmem_cache_free(tc_action_cache, p);
+	else
+		kfree(p);
 	return err;
 }
 EXPORT_SYMBOL(tcf_idr_create);
@@ -2270,8 +2262,46 @@ static const struct rtnl_msg_handler tc_action_rtnl_msg_handlers[] __initconst =
 	 .dumpit = tc_dump_action},
 };
 
+static int tcf_action_ctor(void *object) {
+	struct tc_action *p = object;
+
+	p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
+	if (!p->cpu_bstats)
+		goto err1;
+	p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
+	if (!p->cpu_bstats_hw)
+		goto err2;
+	p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+	if (!p->cpu_qstats)
+		goto err3;
+	return 0;
+
+err3:
+	free_percpu(p->cpu_bstats_hw);
+err2:
+	free_percpu(p->cpu_bstats);
+err1:
+	return -ENOMEM;
+}
+
+static void tcf_action_dtor(void *object) {
+	struct tc_action *p = object;
+
+	free_percpu(p->cpu_bstats);
+	free_percpu(p->cpu_bstats_hw);
+	free_percpu(p->cpu_qstats);
+}
+
 static int __init tc_action_init(void)
 {
+	struct kmem_cache_args kmem_args = {
+		.ctor = tcf_action_ctor,
+		.dtor = tcf_action_dtor,
+	};
+
+	tc_action_cache = kmem_cache_create("tc_action",
+					    sizeof(struct tc_action),
+					    &kmem_args, SLAB_PANIC);
 	rtnl_register_many(tc_action_rtnl_msg_handlers);
 	return 0;
 }
-- 
2.43.0



  parent reply	other threads:[~2025-04-24  8:17 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-24  8:07 [RFC PATCH 0/7] Reviving the slab destructor to tackle the percpu allocator scalability problem Harry Yoo
2025-04-24  8:07 ` [RFC PATCH 1/7] mm/slab: refactor freelist shuffle Harry Yoo
2025-04-24  8:07 ` [RFC PATCH 2/7] treewide, slab: allow slab constructor to return an error Harry Yoo
2025-04-24  8:07 ` [RFC PATCH 3/7] mm/slab: revive the destructor feature in slab allocator Harry Yoo
2025-04-24  8:07 ` Harry Yoo [this message]
2025-04-24  8:07 ` [RFC PATCH 5/7] mm/percpu: allow (un)charging objects without alloc/free Harry Yoo
2025-04-24  8:07 ` [RFC PATCH 6/7] lib/percpu_counter: allow (un)charging percpu counters " Harry Yoo
2025-04-24  8:07 ` [RFC PATCH 7/7] kernel/fork: improve exec() throughput with slab ctor/dtor pair Harry Yoo
2025-04-24  9:29 ` [RFC PATCH 0/7] Reviving the slab destructor to tackle the percpu allocator scalability problem Mateusz Guzik
2025-04-24  9:58   ` Harry Yoo
2025-04-24 15:00     ` Mateusz Guzik
2025-04-24 11:28 ` Pedro Falcato
2025-04-24 15:20   ` Mateusz Guzik
2025-04-24 16:11     ` Mateusz Guzik
2025-04-25  7:40     ` Harry Yoo
2025-04-25 10:12   ` Harry Yoo
2025-04-25 10:42     ` Pedro Falcato
2025-04-28  1:18       ` Harry Yoo
2025-04-30 19:49       ` Mateusz Guzik
2025-05-12 11:00         ` Harry Yoo
2025-04-24 15:50 ` Christoph Lameter (Ampere)
2025-04-24 16:03   ` Mateusz Guzik
2025-04-24 16:39     ` Christoph Lameter (Ampere)
2025-04-24 17:26       ` Mateusz Guzik
2025-04-24 18:47 ` Tejun Heo
2025-04-25 10:10   ` Harry Yoo
2025-04-25 19:03     ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250424080755.272925-5-harry.yoo@oracle.com \
    --to=harry.yoo@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=byungchul@sk.com \
    --cc=cl@gentwo.org \
    --cc=dennis@kernel.org \
    --cc=jack@suse.cz \
    --cc=jhs@mojatatu.com \
    --cc=jiri@resnulli.us \
    --cc=kliteyn@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mjguzik@gmail.com \
    --cc=netdev@vger.kernel.org \
    --cc=rientjes@google.com \
    --cc=tj@kernel.org \
    --cc=vbabka@suse.cz \
    --cc=vladbu@nvidia.com \
    --cc=xiyou.wangcong@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox