From: Chengming Zhou <zhouchengming@bytedance.com>
To: Vitaly Wool <vitaly.wool@konsulko.com>,
Nhat Pham <nphamcs@gmail.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Michal Hocko <mhocko@kernel.org>,
Seth Jennings <sjenning@redhat.com>,
Dan Streetman <ddstreet@ieee.org>,
Andrew Morton <akpm@linux-foundation.org>,
Yosry Ahmed <yosryahmed@google.com>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
Chengming Zhou <zhouchengming@bytedance.com>
Subject: [PATCH 2/7] mm/zswap: split zswap rb-tree
Date: Wed, 06 Dec 2023 09:46:25 +0000 [thread overview]
Message-ID: <20231206-zswap-lock-optimize-v1-2-e25b059f9c3a@bytedance.com> (raw)
In-Reply-To: <20231206-zswap-lock-optimize-v1-0-e25b059f9c3a@bytedance.com>
Each swapfile has one rb-tree to search the mapping of swp_entry_t to
zswap_entry, protected by a single spinlock, which can cause heavy lock
contention if multiple tasks zswap_store/load concurrently.
Optimize the scalability problem by splitting the zswap rb-tree into
multiple rb-trees, each of which covers SWAP_ADDRESS_SPACE_PAGES (64M),
just like we did for the swap cache address_space splitting.
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
include/linux/zswap.h | 4 +--
mm/swapfile.c | 2 +-
mm/zswap.c | 69 ++++++++++++++++++++++++++++++++-------------------
3 files changed, 47 insertions(+), 28 deletions(-)
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 7cccc02cb9e9..d3a8bc300b70 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -30,7 +30,7 @@ struct zswap_lruvec_state {
bool zswap_store(struct folio *folio);
bool zswap_load(struct folio *folio);
void zswap_invalidate(int type, pgoff_t offset);
-int zswap_swapon(int type);
+int zswap_swapon(int type, unsigned long nr_pages);
void zswap_swapoff(int type);
void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
void zswap_lruvec_state_init(struct lruvec *lruvec);
@@ -50,7 +50,7 @@ static inline bool zswap_load(struct folio *folio)
}
static inline void zswap_invalidate(int type, pgoff_t offset) {}
-static inline int zswap_swapon(int type) {}
+static inline int zswap_swapon(int type, unsigned long nr_pages) { return 0; }
static inline void zswap_swapoff(int type) {}
static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 939e7590feda..da8367a3e076 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3163,7 +3163,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error)
goto bad_swap_unlock_inode;
- error = zswap_swapon(p->type);
+ error = zswap_swapon(p->type, maxpages);
if (error)
goto free_swap_address_space;
diff --git a/mm/zswap.c b/mm/zswap.c
index 5e2b8d5ee33b..a6b4859a0164 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -234,6 +234,7 @@ struct zswap_tree {
};
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+static unsigned int nr_zswap_trees[MAX_SWAPFILES];
/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
@@ -260,6 +261,10 @@ static bool zswap_has_pool;
* helpers and fwd declarations
**********************************/
+#define swap_zswap_tree(entry) \
+ (&zswap_trees[swp_type(entry)][swp_offset(entry) \
+ >> SWAP_ADDRESS_SPACE_SHIFT])
+
#define zswap_pool_debug(msg, p) \
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpools[0]))
@@ -885,7 +890,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
* until the entry is verified to still be alive in the tree.
*/
swpoffset = swp_offset(entry->swpentry);
- tree = zswap_trees[swp_type(entry->swpentry)];
+ tree = swap_zswap_tree(entry->swpentry);
list_lru_isolate(l, item);
/*
* It's safe to drop the lock here because we return either
@@ -1535,10 +1540,9 @@ static void zswap_fill_page(void *ptr, unsigned long value)
bool zswap_store(struct folio *folio)
{
swp_entry_t swp = folio->swap;
- int type = swp_type(swp);
pgoff_t offset = swp_offset(swp);
struct page *page = &folio->page;
- struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry, *dupentry;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
@@ -1610,7 +1614,7 @@ bool zswap_store(struct folio *folio)
src = kmap_local_page(page);
if (zswap_is_page_same_filled(src, &value)) {
kunmap_local(src);
- entry->swpentry = swp_entry(type, offset);
+ entry->swpentry = swp;
entry->length = 0;
entry->value = value;
atomic_inc(&zswap_same_filled_pages);
@@ -1688,7 +1692,7 @@ bool zswap_store(struct folio *folio)
mutex_unlock(acomp_ctx->mutex);
/* populate entry */
- entry->swpentry = swp_entry(type, offset);
+ entry->swpentry = swp;
entry->handle = handle;
entry->length = dlen;
@@ -1748,10 +1752,9 @@ bool zswap_store(struct folio *folio)
bool zswap_load(struct folio *folio)
{
swp_entry_t swp = folio->swap;
- int type = swp_type(swp);
pgoff_t offset = swp_offset(swp);
struct page *page = &folio->page;
- struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
@@ -1835,7 +1838,7 @@ bool zswap_load(struct folio *folio)
void zswap_invalidate(int type, pgoff_t offset)
{
- struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset));
struct zswap_entry *entry;
/* find */
@@ -1850,37 +1853,53 @@ void zswap_invalidate(int type, pgoff_t offset)
spin_unlock(&tree->lock);
}
-int zswap_swapon(int type)
+int zswap_swapon(int type, unsigned long nr_pages)
{
- struct zswap_tree *tree;
+ struct zswap_tree *trees, *tree;
+ unsigned int nr, i;
- tree = kzalloc(sizeof(*tree), GFP_KERNEL);
- if (!tree) {
+ nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+ trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
+ if (!trees) {
pr_err("alloc failed, zswap disabled for swap type %d\n", type);
return -ENOMEM;
}
- tree->rbroot = RB_ROOT;
- spin_lock_init(&tree->lock);
- zswap_trees[type] = tree;
+ for (i = 0; i < nr; i++) {
+ tree = trees + i;
+ tree->rbroot = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ }
+
+ nr_zswap_trees[type] = nr;
+ zswap_trees[type] = trees;
return 0;
}
void zswap_swapoff(int type)
{
- struct zswap_tree *tree = zswap_trees[type];
- struct zswap_entry *entry, *n;
+ struct zswap_tree *trees = zswap_trees[type];
+ unsigned int i;
- if (!tree)
+ if (!trees)
return;
- /* walk the tree and free everything */
- spin_lock(&tree->lock);
- rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- zswap_free_entry(entry);
- tree->rbroot = RB_ROOT;
- spin_unlock(&tree->lock);
- kfree(tree);
+ for (i = 0; i < nr_zswap_trees[type]; i++) {
+ struct zswap_tree *tree = trees + i;
+ struct zswap_entry *entry, *n;
+
+ /* walk the tree and free everything */
+ spin_lock(&tree->lock);
+ rbtree_postorder_for_each_entry_safe(entry, n,
+ &tree->rbroot,
+ rbnode)
+ zswap_free_entry(entry);
+ tree->rbroot = RB_ROOT;
+ spin_unlock(&tree->lock);
+ }
+
+ kvfree(trees);
+ nr_zswap_trees[type] = 0;
zswap_trees[type] = NULL;
}
--
b4 0.10.1
next prev parent reply other threads:[~2023-12-06 9:46 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-12-06 9:46 [PATCH 0/7] mm/zswap: optimize the scalability of " Chengming Zhou
2023-12-06 9:46 ` [PATCH 1/7] mm/zswap: make sure each swapfile always have " Chengming Zhou
2023-12-08 15:17 ` kernel test robot
2023-12-08 15:45 ` Chengming Zhou
2023-12-08 16:45 ` kernel test robot
2023-12-06 9:46 ` Chengming Zhou [this message]
2023-12-06 9:46 ` [PATCH 3/7] mm/zswap: reuse dstmem when decompress Chengming Zhou
2023-12-12 22:58 ` Nhat Pham
2023-12-13 2:41 ` Chengming Zhou
2023-12-06 9:46 ` [PATCH 4/7] mm/zswap: change dstmem size to one page Chengming Zhou
2023-12-06 17:12 ` Nhat Pham
2023-12-07 2:59 ` Chengming Zhou
2023-12-06 9:46 ` [PATCH 5/7] mm/zswap: refactor out __zswap_load() Chengming Zhou
2023-12-12 23:13 ` Nhat Pham
2023-12-13 2:46 ` Chengming Zhou
2023-12-06 9:46 ` [PATCH 6/7] mm/zswap: cleanup zswap_load() Chengming Zhou
2023-12-06 9:46 ` [PATCH 7/7] mm/zswap: cleanup zswap_reclaim_entry() Chengming Zhou
2023-12-06 17:24 ` [PATCH 0/7] mm/zswap: optimize the scalability of zswap rb-tree Nhat Pham
2023-12-06 20:41 ` Yosry Ahmed
2023-12-07 0:43 ` Chris Li
2023-12-07 3:25 ` Chengming Zhou
2023-12-12 23:26 ` Yosry Ahmed
2023-12-12 23:33 ` Nhat Pham
2023-12-13 2:57 ` Chengming Zhou
2023-12-06 20:08 ` Nhat Pham
2023-12-07 3:13 ` Chengming Zhou
2023-12-07 15:18 ` Chengming Zhou
2023-12-07 18:15 ` Nhat Pham
2023-12-07 18:57 ` Nhat Pham
2023-12-08 15:41 ` Chengming Zhou
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20231206-zswap-lock-optimize-v1-2-e25b059f9c3a@bytedance.com \
--to=zhouchengming@bytedance.com \
--cc=akpm@linux-foundation.org \
--cc=ddstreet@ieee.org \
--cc=hannes@cmpxchg.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=nphamcs@gmail.com \
--cc=sjenning@redhat.com \
--cc=vitaly.wool@konsulko.com \
--cc=yosryahmed@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox