linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Qi Zheng <zhengqi.arch@bytedance.com>
To: akpm@linux-foundation.org, david@fromorbit.com, tkhai@ya.ru,
	vbabka@suse.cz, roman.gushchin@linux.dev, djwong@kernel.org,
	brauner@kernel.org, paulmck@kernel.org, tytso@mit.edu
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	intel-gfx@lists.freedesktop.org, dri-devel@lists.freedesktop.org,
	linux-arm-msm@vger.kernel.org, dm-devel@redhat.com,
	linux-raid@vger.kernel.org, linux-bcache@vger.kernel.org,
	virtualization@lists.linux-foundation.org,
	linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org,
	linux-nfs@vger.kernel.org, linux-xfs@vger.kernel.org,
	linux-btrfs@vger.kernel.org,
	Qi Zheng <zhengqi.arch@bytedance.com>
Subject: [PATCH 25/29] mm: vmscan: make memcg slab shrink lockless
Date: Thu, 22 Jun 2023 16:53:31 +0800	[thread overview]
Message-ID: <20230622085335.77010-26-zhengqi.arch@bytedance.com> (raw)
In-Reply-To: <20230622085335.77010-1-zhengqi.arch@bytedance.com>

Like the global slab shrink, this commit also uses the refcount+RCU
method to make the memcg slab shrink lockless.

We can reproduce the down_read_trylock() hotspot through the
following script:

```

# Base directory: do_create makes one sub-directory per cgroup index under
# it, do_mount mounts a tmpfs on each, and do_touch writes a file into each.
DIR="/root/shrinker/memcg/mnt"

# Create the "test" memory and perf_event cgroups, write a 4G limit to the
# memory cgroup's memory.limit_in_bytes, then for every index in 0..$1
# create a child memory cgroup and a matching directory under $DIR.
#   $1: last index to create (handed to seq unchanged)
do_create()
{
    mkdir -p /sys/fs/cgroup/memory/test /sys/fs/cgroup/perf_event/test
    echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes

    for i in $(seq 0 $1); do
        mkdir -p "/sys/fs/cgroup/memory/test/$i"
        # Write this shell's PID ($$) into the new child memory cgroup and
        # into the perf_event "test" cgroup.
        echo $$ > "/sys/fs/cgroup/memory/test/$i/cgroup.procs"
        echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs
        mkdir -p "$DIR/$i"
    done
}

# Mount a tmpfs on $DIR/<index> for every index in $1..$2. The index is
# also passed to mount as the source name.
#   $1: first index
#   $2: last index
do_mount()
{
    for i in $(seq $1 $2); do
        mount -t tmpfs "$i" "$DIR/$i"
    done
}

# For every index in $1..$2: write this shell's PID ($$) into the matching
# child memory cgroup and into the perf_event "test" cgroup, then start a
# background dd that writes 1M of zeroes to $DIR/<index>/file<index>.
#   $1: first index
#   $2: last index
do_touch()
{
    for i in $(seq $1 $2); do
        echo $$ > "/sys/fs/cgroup/memory/test/$i/cgroup.procs"
        echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs
        # Not waited for: each dd is left running in the background.
        dd if=/dev/zero of="$DIR/$i/file$i" bs=1M count=1 &
    done
}

# Entry point: $1 selects the sub-command.
#   touch <first> <last>  run do_touch over the given index range
#   test                  create cgroups 0..4000, mount a tmpfs for each,
#                         then run do_touch over 0..3000
# Any other value exits with status 1.
if [ "$1" = "touch" ]; then
    do_touch $2 $3
elif [ "$1" = "test" ]; then
    do_create 4000
    do_mount 0 4000
    do_touch 0 3000
else
    exit 1
fi
```

Save the above script and run its test and touch commands.
We can then use the following perf command to view the hotspots:

perf top -U -F 999 [-g]

1) Before applying this patchset:

  35.34%  [kernel]             [k] down_read_trylock
  18.44%  [kernel]             [k] shrink_slab
  15.98%  [kernel]             [k] pv_native_safe_halt
  15.08%  [kernel]             [k] up_read
   5.33%  [kernel]             [k] idr_find
   2.71%  [kernel]             [k] _find_next_bit
   2.21%  [kernel]             [k] shrink_node
   1.29%  [kernel]             [k] shrink_lruvec
   0.66%  [kernel]             [k] do_shrink_slab
   0.33%  [kernel]             [k] list_lru_count_one
   0.33%  [kernel]             [k] __radix_tree_lookup
   0.25%  [kernel]             [k] mem_cgroup_iter

-   82.19%    19.49%  [kernel]                  [k] shrink_slab
   - 62.00% shrink_slab
        36.37% down_read_trylock
        15.52% up_read
        5.48% idr_find
        3.38% _find_next_bit
      + 0.98% do_shrink_slab

2) After applying this patchset:

  46.83%  [kernel]           [k] shrink_slab
  20.52%  [kernel]           [k] pv_native_safe_halt
   8.85%  [kernel]           [k] do_shrink_slab
   7.71%  [kernel]           [k] _find_next_bit
   1.72%  [kernel]           [k] xas_descend
   1.70%  [kernel]           [k] shrink_node
   1.44%  [kernel]           [k] shrink_lruvec
   1.43%  [kernel]           [k] mem_cgroup_iter
   1.28%  [kernel]           [k] xas_load
   0.89%  [kernel]           [k] super_cache_count
   0.84%  [kernel]           [k] xas_start
   0.66%  [kernel]           [k] list_lru_count_one

-   65.50%    40.44%  [kernel]                  [k] shrink_slab
   - 22.96% shrink_slab
        13.11% _find_next_bit
      - 9.91% do_shrink_slab
         - 1.59% super_cache_count
              0.92% list_lru_count_one

We can see that the first perf hotspot becomes shrink_slab,
which is what we expect.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 mm/vmscan.c | 58 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 767569698946..357a1f2ad690 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -213,6 +213,12 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
 					 lockdep_is_held(&shrinker_rwsem));
 }
 
+static struct shrinker_info *shrinker_info_rcu(struct mem_cgroup *memcg,
+					       int nid)
+{
+	return rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+}
+
 static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 				    int map_size, int defer_size,
 				    int old_map_size, int old_defer_size,
@@ -339,7 +345,7 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 		struct shrinker_info *info;
 
 		rcu_read_lock();
-		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+		info = shrinker_info_rcu(memcg, nid);
 		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
 			/* Pairs with smp mb in shrink_slab() */
 			smp_mb__before_atomic();
@@ -359,7 +365,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 		return -ENOSYS;
 
 	down_write(&shrinker_rwsem);
-	/* This may call shrinker, so it must use down_read_trylock() */
 	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
 	if (id < 0)
 		goto unlock;
@@ -392,18 +397,28 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
 				   struct mem_cgroup *memcg)
 {
 	struct shrinker_info *info;
+	long nr_deferred;
 
-	info = shrinker_info_protected(memcg, nid);
-	return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+	rcu_read_lock();
+	info = shrinker_info_rcu(memcg, nid);
+	nr_deferred = atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+	rcu_read_unlock();
+
+	return nr_deferred;
 }
 
 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
 				  struct mem_cgroup *memcg)
 {
 	struct shrinker_info *info;
+	long nr_deferred;
+
+	rcu_read_lock();
+	info = shrinker_info_rcu(memcg, nid);
+	nr_deferred = atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+	rcu_read_unlock();
 
-	info = shrinker_info_protected(memcg, nid);
-	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+	return nr_deferred;
 }
 
 void reparent_shrinker_deferred(struct mem_cgroup *memcg)
@@ -955,19 +970,18 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 {
 	struct shrinker_info *info;
 	unsigned long ret, freed = 0;
-	int i;
+	int i = 0;
 
 	if (!mem_cgroup_online(memcg))
 		return 0;
 
-	if (!down_read_trylock(&shrinker_rwsem))
-		return 0;
-
-	info = shrinker_info_protected(memcg, nid);
+again:
+	rcu_read_lock();
+	info = shrinker_info_rcu(memcg, nid);
 	if (unlikely(!info))
 		goto unlock;
 
-	for_each_set_bit(i, info->map, info->map_nr_max) {
+	for_each_set_bit_from(i, info->map, info->map_nr_max) {
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
 			.nid = nid,
@@ -982,6 +996,10 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			continue;
 		}
 
+		if (!shrinker_try_get(shrinker))
+			continue;
+		rcu_read_unlock();
+
 		/* Call non-slab shrinkers even though kmem is disabled */
 		if (!memcg_kmem_online() &&
 		    !(shrinker->flags & SHRINKER_NONSLAB))
@@ -1014,13 +1032,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 		}
 		freed += ret;
 
-		if (rwsem_is_contended(&shrinker_rwsem)) {
-			freed = freed ? : 1;
-			break;
-		}
+		shrinker_put(shrinker);
+
+		/*
+		 * We have already exited the read-side of rcu critical section
+		 * before calling do_shrink_slab(), the shrinker_info may be
+		 * released in expand_one_shrinker_info(), so restart the
+		 * iteration.
+		 */
+		i++;
+		goto again;
 	}
 unlock:
-	up_read(&shrinker_rwsem);
+	rcu_read_unlock();
 	return freed;
 }
 #else /* CONFIG_MEMCG */
-- 
2.30.2



  parent reply	other threads:[~2023-06-22  8:57 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-22  8:53 [PATCH 00/29] use refcount+RCU method to implement lockless slab shrink Qi Zheng
2023-06-22  8:53 ` [PATCH 01/29] mm: shrinker: add shrinker::private_data field Qi Zheng
2023-06-22 14:47   ` Vlastimil Babka
2023-06-23 12:50     ` [External] " Qi Zheng
2023-06-22  8:53 ` [PATCH 02/29] mm: vmscan: introduce some helpers for dynamically allocating shrinker Qi Zheng
2023-06-23  6:12   ` Dave Chinner
2023-06-23 12:49     ` Qi Zheng
2023-06-22  8:53 ` [PATCH 03/29] drm/i915: dynamically allocate the i915_gem_mm shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 04/29] drm/msm: dynamically allocate the drm-msm_gem shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 05/29] drm/panfrost: dynamically allocate the drm-panfrost shrinker Qi Zheng
2023-06-23 13:33   ` Qi Zheng
2023-06-23 14:18   ` Bobs_Email
2023-06-22  8:53 ` [PATCH 06/29] dm: dynamically allocate the dm-bufio shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 07/29] dm zoned: dynamically allocate the dm-zoned-meta shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 08/29] md/raid5: dynamically allocate the md-raid5 shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 09/29] bcache: dynamically allocate the md-bcache shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 10/29] vmw_balloon: dynamically allocate the vmw-balloon shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 11/29] virtio_balloon: dynamically allocate the virtio-balloon shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 12/29] mbcache: dynamically allocate the mbcache shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 13/29] ext4: dynamically allocate the ext4-es shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 14/29] jbd2,ext4: dynamically allocate the jbd2-journal shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 15/29] NFSD: dynamically allocate the nfsd-client shrinker Qi Zheng
2023-06-23 21:49   ` Chuck Lever
2023-06-24 11:17     ` Qi Zheng
2023-06-22  8:53 ` [PATCH 16/29] NFSD: dynamically allocate the nfsd-reply shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 17/29] xfs: dynamically allocate the xfs-buf shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 18/29] xfs: dynamically allocate the xfs-inodegc shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 19/29] xfs: dynamically allocate the xfs-qm shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 20/29] zsmalloc: dynamically allocate the mm-zspool shrinker Qi Zheng
2023-06-22  8:53 ` [PATCH 21/29] fs: super: dynamically allocate the s_shrink Qi Zheng
2023-06-22  8:53 ` [PATCH 22/29] drm/ttm: introduce pool_shrink_rwsem Qi Zheng
2023-06-22  8:53 ` [PATCH 23/29] mm: shrinker: add refcount and completion_wait fields Qi Zheng
2023-06-22  8:53 ` [PATCH 24/29] mm: vmscan: make global slab shrink lockless Qi Zheng
2023-06-22 15:12   ` Vlastimil Babka
2023-06-22 16:42     ` Qi Zheng
2023-06-22 17:41       ` Alan Huang
2023-06-22 18:18         ` Qi Zheng
2023-06-23  6:29     ` Dave Chinner
2023-06-23 13:10       ` Qi Zheng
2023-06-23 22:19         ` Dave Chinner
2023-06-24 11:08           ` Qi Zheng
2023-06-25  3:15             ` Qi Zheng
2023-07-04  4:20             ` Qi Zheng
2023-07-03 16:39       ` Paul E. McKenney
2023-07-04  3:45         ` Qi Zheng
2023-07-05  3:27           ` Qi Zheng
2023-06-22  8:53 ` Qi Zheng [this message]
2023-06-22  8:53 ` [PATCH 26/29] mm: shrinker: make count and scan in shrinker debugfs lockless Qi Zheng
2023-06-22  8:53 ` [PATCH 27/29] mm: vmscan: hold write lock to reparent shrinker nr_deferred Qi Zheng
2023-06-22  8:53 ` [PATCH 28/29] mm: shrinkers: convert shrinker_rwsem to mutex Qi Zheng
2023-06-22  8:53 ` [PATCH 29/29] mm: shrinker: move shrinker-related code into a separate file Qi Zheng
2023-06-22 14:53   ` Vlastimil Babka
2023-06-23 13:12     ` Qi Zheng
2023-06-23  5:25   ` Sergey Senozhatsky
2023-06-23 13:24     ` Qi Zheng
2023-06-22  9:02 ` [PATCH 00/29] use refcount+RCU method to implement lockless slab shrink Qi Zheng

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230622085335.77010-26-zhengqi.arch@bytedance.com \
    --to=zhengqi.arch@bytedance.com \
    --cc=akpm@linux-foundation.org \
    --cc=brauner@kernel.org \
    --cc=david@fromorbit.com \
    --cc=djwong@kernel.org \
    --cc=dm-devel@redhat.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=intel-gfx@lists.freedesktop.org \
    --cc=linux-arm-msm@vger.kernel.org \
    --cc=linux-bcache@vger.kernel.org \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=linux-raid@vger.kernel.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=paulmck@kernel.org \
    --cc=roman.gushchin@linux.dev \
    --cc=tkhai@ya.ru \
    --cc=tytso@mit.edu \
    --cc=vbabka@suse.cz \
    --cc=virtualization@lists.linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox