linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
To: davem@davemloft.net
Cc: daniel@iogearbox.net, andrii@kernel.org, tj@kernel.org,
	memxor@gmail.com, delyank@fb.com, linux-mm@kvack.org,
	bpf@vger.kernel.org, kernel-team@fb.com
Subject: [PATCH v6 bpf-next 16/16] bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc.
Date: Fri,  2 Sep 2022 14:10:58 -0700	[thread overview]
Message-ID: <20220902211058.60789-17-alexei.starovoitov@gmail.com> (raw)
In-Reply-To: <20220902211058.60789-1-alexei.starovoitov@gmail.com>

From: Alexei Starovoitov <ast@kernel.org>

User space might be creating and destroying a lot of hash maps. Synchronous
rcu_barrier-s in a destruction path of hash map delay freeing of hash buckets
and other map memory and may cause artificial OOM situation under stress.
Optimize rcu_barrier usage between bpf hash map and bpf_mem_alloc:
- remove rcu_barrier from hash map, since htab doesn't use call_rcu
  directly and there are no callback to wait for.
- bpf_mem_alloc has call_rcu_in_progress flag that indicates pending callbacks.
  Use it to avoid barriers in fast path.
- When barriers are needed copy bpf_mem_alloc into temp structure
  and wait for rcu barrier-s in the worker to let the rest of
  hash map freeing to proceed.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  2 +
 kernel/bpf/hashtab.c          |  6 +--
 kernel/bpf/memalloc.c         | 80 ++++++++++++++++++++++++++++-------
 3 files changed, 69 insertions(+), 19 deletions(-)

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 653ed1584a03..3e164b8efaa9 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -3,6 +3,7 @@
 #ifndef _BPF_MEM_ALLOC_H
 #define _BPF_MEM_ALLOC_H
 #include <linux/compiler_types.h>
+#include <linux/workqueue.h>
 
 struct bpf_mem_cache;
 struct bpf_mem_caches;
@@ -10,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
 	struct bpf_mem_caches __percpu *caches;
 	struct bpf_mem_cache __percpu *cache;
+	struct work_struct work;
 };
 
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a77b9c4a4e48..0fe3f136cbbe 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1546,10 +1546,10 @@ static void htab_map_free(struct bpf_map *map)
 	 * There is no need to synchronize_rcu() here to protect map elements.
 	 */
 
-	/* some of free_htab_elem() callbacks for elements of this map may
-	 * not have executed. Wait for them.
+	/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+	 * underneath and is reponsible for waiting for callbacks to finish
+	 * during bpf_mem_alloc_destroy().
 	 */
-	rcu_barrier();
 	if (!htab_is_prealloc(htab)) {
 		delete_all_elements(htab);
 	} else {
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 38fbd15c130a..5cc952da7d41 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -414,10 +414,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
 {
 	struct llist_node *llnode, *t;
 
-	/* The caller has done rcu_barrier() and no progs are using this
-	 * bpf_mem_cache, but htab_map_free() called bpf_mem_cache_free() for
-	 * all remaining elements and they can be in free_by_rcu or in
-	 * waiting_for_gp lists, so drain those lists now.
+	/* No progs are using this bpf_mem_cache, but htab_map_free() called
+	 * bpf_mem_cache_free() for all remaining elements and they can be in
+	 * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
 	 */
 	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
 		free_one(c, llnode);
@@ -429,42 +428,91 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
 		free_one(c, llnode);
 }
 
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+	free_percpu(ma->cache);
+	free_percpu(ma->caches);
+	ma->cache = NULL;
+	ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+	/* waiting_for_gp lists was drained, but __free_rcu might
+	 * still execute. Wait for it now before we freeing percpu caches.
+	 */
+	rcu_barrier_tasks_trace();
+	rcu_barrier();
+	free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+	struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+	free_mem_alloc(ma);
+	kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+	struct bpf_mem_alloc *copy;
+
+	if (!rcu_in_progress) {
+		/* Fast path. No callbacks are pending, hence no need to do
+		 * rcu_barrier-s.
+		 */
+		free_mem_alloc_no_barrier(ma);
+		return;
+	}
+
+	copy = kmalloc(sizeof(*ma), GFP_KERNEL);
+	if (!copy) {
+		/* Slow path with inline barrier-s */
+		free_mem_alloc(ma);
+		return;
+	}
+
+	/* Defer barriers into worker to let the rest of map memory to be freed */
+	copy->cache = ma->cache;
+	ma->cache = NULL;
+	copy->caches = ma->caches;
+	ma->caches = NULL;
+	INIT_WORK(&copy->work, free_mem_alloc_deferred);
+	queue_work(system_unbound_wq, &copy->work);
+}
+
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 {
 	struct bpf_mem_caches *cc;
 	struct bpf_mem_cache *c;
-	int cpu, i;
+	int cpu, i, rcu_in_progress;
 
 	if (ma->cache) {
+		rcu_in_progress = 0;
 		for_each_possible_cpu(cpu) {
 			c = per_cpu_ptr(ma->cache, cpu);
 			drain_mem_cache(c);
+			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 		}
 		/* objcg is the same across cpus */
 		if (c->objcg)
 			obj_cgroup_put(c->objcg);
-		/* c->waiting_for_gp list was drained, but __free_rcu might
-		 * still execute. Wait for it now before we free 'c'.
-		 */
-		rcu_barrier_tasks_trace();
-		rcu_barrier();
-		free_percpu(ma->cache);
-		ma->cache = NULL;
+		destroy_mem_alloc(ma, rcu_in_progress);
 	}
 	if (ma->caches) {
+		rcu_in_progress = 0;
 		for_each_possible_cpu(cpu) {
 			cc = per_cpu_ptr(ma->caches, cpu);
 			for (i = 0; i < NUM_CACHES; i++) {
 				c = &cc->cache[i];
 				drain_mem_cache(c);
+				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 			}
 		}
 		if (c->objcg)
 			obj_cgroup_put(c->objcg);
-		rcu_barrier_tasks_trace();
-		rcu_barrier();
-		free_percpu(ma->caches);
-		ma->caches = NULL;
+		destroy_mem_alloc(ma, rcu_in_progress);
 	}
 }
 
-- 
2.30.2



  parent reply	other threads:[~2022-09-02 21:12 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-09-02 21:10 [PATCH v6 bpf-next 00/16] bpf: BPF specific memory allocator Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 01/16] bpf: Introduce any context " Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 02/16] bpf: Convert hash map to bpf_mem_alloc Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 03/16] selftests/bpf: Improve test coverage of test_maps Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 04/16] samples/bpf: Reduce syscall overhead in map_perf_test Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 05/16] bpf: Relax the requirement to use preallocated hash maps in tracing progs Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 06/16] bpf: Optimize element count in non-preallocated hash map Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 07/16] bpf: Optimize call_rcu " Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 08/16] bpf: Adjust low/high watermarks in bpf_mem_cache Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 09/16] bpf: Batch call_rcu callbacks instead of SLAB_TYPESAFE_BY_RCU Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 10/16] bpf: Add percpu allocation support to bpf_mem_alloc Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 11/16] bpf: Convert percpu hash map to per-cpu bpf_mem_alloc Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 12/16] bpf: Remove tracing program restriction on map types Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 13/16] bpf: Prepare bpf_mem_alloc to be used by sleepable bpf programs Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 14/16] bpf: Remove prealloc-only restriction for " Alexei Starovoitov
2022-09-02 21:10 ` [PATCH v6 bpf-next 15/16] bpf: Remove usage of kmem_cache from bpf_mem_cache Alexei Starovoitov
2022-09-02 21:10 ` Alexei Starovoitov [this message]
2022-09-05 13:50 ` [PATCH v6 bpf-next 00/16] bpf: BPF specific memory allocator patchwork-bot+netdevbpf

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220902211058.60789-17-alexei.starovoitov@gmail.com \
    --to=alexei.starovoitov@gmail.com \
    --cc=andrii@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=delyank@fb.com \
    --cc=kernel-team@fb.com \
    --cc=linux-mm@kvack.org \
    --cc=memxor@gmail.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox