* [PATCH 2/2] mm: zpool: return pool size in pages
2024-03-11 16:12 [PATCH 1/2] mm: zswap: optimize zswap pool size tracking Johannes Weiner
@ 2024-03-11 16:12 ` Johannes Weiner
2024-03-11 22:12 ` Yosry Ahmed
` (2 more replies)
2024-03-11 22:09 ` [PATCH 1/2] mm: zswap: optimize zswap pool size tracking Yosry Ahmed
` (2 subsequent siblings)
3 siblings, 3 replies; 12+ messages in thread
From: Johannes Weiner @ 2024-03-11 16:12 UTC (permalink / raw)
To: Andrew Morton
Cc: Yosry Ahmed, Nhat Pham, Chengming Zhou, linux-mm, linux-kernel
All zswap backends track their pool sizes in pages. Currently they
multiply by PAGE_SIZE for zswap, only for zswap to divide again in
order to do limit math. Report pages directly.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/z3fold.c | 2 +-
mm/zbud.c | 2 +-
mm/zpool.c | 4 ++--
mm/zsmalloc.c | 2 +-
mm/zswap.c | 4 ++--
5 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 7ab05621052d..9bacacd4168c 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1404,7 +1404,7 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle)
static u64 z3fold_zpool_total_size(void *pool)
{
- return z3fold_get_pool_size(pool) * PAGE_SIZE;
+ return z3fold_get_pool_size(pool);
}
static struct zpool_driver z3fold_zpool_driver = {
diff --git a/mm/zbud.c b/mm/zbud.c
index 2190cc1f37b3..b7d8a22bbf5f 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -412,7 +412,7 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle)
static u64 zbud_zpool_total_size(void *pool)
{
- return zbud_get_pool_size(pool) * PAGE_SIZE;
+ return zbud_get_pool_size(pool);
}
static struct zpool_driver zbud_zpool_driver = {
diff --git a/mm/zpool.c b/mm/zpool.c
index 846410479c2f..410808aee7fe 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -324,9 +324,9 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
* zpool_get_total_size() - The total size of the pool
* @zpool: The zpool to check
*
- * This returns the total size in bytes of the pool.
+ * This returns the total size in pages of the pool.
*
- * Returns: Total size of the zpool in bytes.
+ * Returns: Total size of the zpool in pages.
*/
u64 zpool_get_total_size(struct zpool *zpool)
{
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 7d7cb3eaabe0..398f3856817f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -401,7 +401,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
static u64 zs_zpool_total_size(void *pool)
{
- return zs_get_total_pages(pool) << PAGE_SHIFT;
+ return zs_get_total_pages(pool);
}
static struct zpool_driver zs_zpool_driver = {
diff --git a/mm/zswap.c b/mm/zswap.c
index 7c39327a7cc2..fe4343e416e0 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -507,7 +507,7 @@ static unsigned long zswap_max_pages(void)
unsigned long zswap_total_pages(void)
{
struct zswap_pool *pool;
- u64 total = 0;
+ unsigned long total = 0;
rcu_read_lock();
list_for_each_entry_rcu(pool, &zswap_pools, list) {
@@ -518,7 +518,7 @@ unsigned long zswap_total_pages(void)
}
rcu_read_unlock();
- return total >> PAGE_SHIFT;
+ return total;
}
/*********************************
--
2.44.0
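
As an aside on the interface change itself: zswap is the only in-tree zpool user, but this hypothetical caller (illustration only, variable names made up) shows what the new return value means for any code that used to consume bytes:

	/* Before this patch the call returned bytes; now it returns a
	 * page count, so a caller that still wants bytes converts at
	 * the call site instead of every backend multiplying for it. */
	u64 pages = zpool_get_total_size(zpool);
	u64 bytes = pages << PAGE_SHIFT;	/* old byte value, if needed */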
* Re: [PATCH 2/2] mm: zpool: return pool size in pages
2024-03-11 16:12 ` [PATCH 2/2] mm: zpool: return pool size in pages Johannes Weiner
@ 2024-03-11 22:12 ` Yosry Ahmed
2024-03-12 2:36 ` Johannes Weiner
2024-03-12 4:56 ` Chengming Zhou
2024-03-12 9:15 ` Nhat Pham
2 siblings, 1 reply; 12+ messages in thread
From: Yosry Ahmed @ 2024-03-11 22:12 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Nhat Pham, Chengming Zhou, linux-mm, linux-kernel
On Mon, Mar 11, 2024 at 12:12:14PM -0400, Johannes Weiner wrote:
> All zswap backends track their pool sizes in pages. Currently they
> multiply by PAGE_SIZE for zswap, only for zswap to divide again in
> order to do limit math. Report pages directly.
Nice. Although I would prefer renaming the zpool interface to
total_pages and renaming the zpool backends' functions as well to use
pages rather than size.
Either way:
Acked-by: Yosry Ahmed <yosryahmed@google.com>
* Re: [PATCH 2/2] mm: zpool: return pool size in pages
2024-03-11 22:12 ` Yosry Ahmed
@ 2024-03-12 2:36 ` Johannes Weiner
2024-03-12 4:07 ` Yosry Ahmed
0 siblings, 1 reply; 12+ messages in thread
From: Johannes Weiner @ 2024-03-12 2:36 UTC (permalink / raw)
To: Yosry Ahmed
Cc: Andrew Morton, Nhat Pham, Chengming Zhou, linux-mm, linux-kernel
On Mon, Mar 11, 2024 at 10:12:02PM +0000, Yosry Ahmed wrote:
> On Mon, Mar 11, 2024 at 12:12:14PM -0400, Johannes Weiner wrote:
> > All zswap backends track their pool sizes in pages. Currently they
> > multiply by PAGE_SIZE for zswap, only for zswap to divide again in
> > order to do limit math. Report pages directly.
>
> Nice. Although I would prefer renaming the zpool interface to
> total_pages and renaming the zpool backends' functions as well to use
> pages rather than size.
Ha, I was on the fence, since it's kind of churny. But if you don't
mind, then it works for me as well.
> Either way:
> Acked-by: Yosry Ahmed <yosryahmed@google.com>
Thanks.
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 3296438eec06..a67d62b79698 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -53,7 +53,7 @@ void *zpool_map_handle(struct zpool *pool, unsigned long handle,
void zpool_unmap_handle(struct zpool *pool, unsigned long handle);
-u64 zpool_get_total_size(struct zpool *pool);
+u64 zpool_get_total_pages(struct zpool *pool);
/**
@@ -91,7 +91,7 @@ struct zpool_driver {
enum zpool_mapmode mm);
void (*unmap)(void *pool, unsigned long handle);
- u64 (*total_size)(void *pool);
+ u64 (*total_pages)(void *pool);
};
void zpool_register_driver(struct zpool_driver *driver);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 9bacacd4168c..2ebfed32871b 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1237,12 +1237,12 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
}
/**
- * z3fold_get_pool_size() - gets the z3fold pool size in pages
+ * z3fold_get_pool_pages() - gets the z3fold pool size in pages
* @pool: pool whose size is being queried
*
* Returns: size in pages of the given pool.
*/
-static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
+static u64 z3fold_get_pool_pages(struct z3fold_pool *pool)
{
return atomic64_read(&pool->pages_nr);
}
@@ -1402,9 +1402,9 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle)
z3fold_unmap(pool, handle);
}
-static u64 z3fold_zpool_total_size(void *pool)
+static u64 z3fold_zpool_total_pages(void *pool)
{
- return z3fold_get_pool_size(pool);
+ return z3fold_get_pool_pages(pool);
}
static struct zpool_driver z3fold_zpool_driver = {
@@ -1417,7 +1417,7 @@ static struct zpool_driver z3fold_zpool_driver = {
.free = z3fold_zpool_free,
.map = z3fold_zpool_map,
.unmap = z3fold_zpool_unmap,
- .total_size = z3fold_zpool_total_size,
+ .total_pages = z3fold_zpool_total_pages,
};
MODULE_ALIAS("zpool-z3fold");
diff --git a/mm/zbud.c b/mm/zbud.c
index b7d8a22bbf5f..e9836fff9438 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -365,13 +365,13 @@ static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
}
/**
- * zbud_get_pool_size() - gets the zbud pool size in pages
+ * zbud_get_pool_pages() - gets the zbud pool size in pages
* @pool: pool whose size is being queried
*
* Returns: size in pages of the given pool. The pool lock need not be
* taken to access pages_nr.
*/
-static u64 zbud_get_pool_size(struct zbud_pool *pool)
+static u64 zbud_get_pool_pages(struct zbud_pool *pool)
{
return pool->pages_nr;
}
@@ -410,9 +410,9 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle)
zbud_unmap(pool, handle);
}
-static u64 zbud_zpool_total_size(void *pool)
+static u64 zbud_zpool_total_pages(void *pool)
{
- return zbud_get_pool_size(pool);
+ return zbud_get_pool_pages(pool);
}
static struct zpool_driver zbud_zpool_driver = {
@@ -425,7 +425,7 @@ static struct zpool_driver zbud_zpool_driver = {
.free = zbud_zpool_free,
.map = zbud_zpool_map,
.unmap = zbud_zpool_unmap,
- .total_size = zbud_zpool_total_size,
+ .total_pages = zbud_zpool_total_pages,
};
MODULE_ALIAS("zpool-zbud");
diff --git a/mm/zpool.c b/mm/zpool.c
index 410808aee7fe..b9fda1fa857d 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -321,16 +321,16 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
}
/**
- * zpool_get_total_size() - The total size of the pool
+ * zpool_get_total_pages() - The total size of the pool
* @zpool: The zpool to check
*
* This returns the total size in pages of the pool.
*
* Returns: Total size of the zpool in pages.
*/
-u64 zpool_get_total_size(struct zpool *zpool)
+u64 zpool_get_total_pages(struct zpool *zpool)
{
- return zpool->driver->total_size(zpool->pool);
+ return zpool->driver->total_pages(zpool->pool);
}
/**
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 398f3856817f..b42d3545ca85 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -399,7 +399,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
zs_unmap_object(pool, handle);
}
-static u64 zs_zpool_total_size(void *pool)
+static u64 zs_zpool_total_pages(void *pool)
{
return zs_get_total_pages(pool);
}
@@ -414,7 +414,7 @@ static struct zpool_driver zs_zpool_driver = {
.free = zs_zpool_free,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
- .total_size = zs_zpool_total_size,
+ .total_pages = zs_zpool_total_pages,
};
MODULE_ALIAS("zpool-zsmalloc");
diff --git a/mm/zswap.c b/mm/zswap.c
index 7ed79caf1e1e..9fdf4c76d5ea 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -519,7 +519,7 @@ unsigned long zswap_total_pages(void)
int i;
for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
- total += zpool_get_total_size(pool->zpools[i]);
+ total += zpool_get_total_pages(pool->zpools[i]);
}
rcu_read_unlock();
* Re: [PATCH 2/2] mm: zpool: return pool size in pages
2024-03-12 2:36 ` Johannes Weiner
@ 2024-03-12 4:07 ` Yosry Ahmed
0 siblings, 0 replies; 12+ messages in thread
From: Yosry Ahmed @ 2024-03-12 4:07 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Nhat Pham, Chengming Zhou, linux-mm, linux-kernel
On Mon, Mar 11, 2024 at 10:36:37PM -0400, Johannes Weiner wrote:
> On Mon, Mar 11, 2024 at 10:12:02PM +0000, Yosry Ahmed wrote:
> > On Mon, Mar 11, 2024 at 12:12:14PM -0400, Johannes Weiner wrote:
> > > All zswap backends track their pool sizes in pages. Currently they
> > > multiply by PAGE_SIZE for zswap, only for zswap to divide again in
> > > order to do limit math. Report pages directly.
> >
> > Nice. Although I would prefer renaming the zpool interface to
> > total_pages and renaming the zpool backends' functions as well to use
> > pages rather than size.
>
> Ha, I was on the fence, since it's kind of churny. But if you don't
> mind, then it works for me as well.
If we are cleaning up, might as well do it all the way :)
>
> > Either way:
> > Acked-by: Yosry Ahmed <yosryahmed@google.com>
>
> Thanks.
LGTM. Feel free to carry the Ack forward.
* Re: [PATCH 2/2] mm: zpool: return pool size in pages
2024-03-11 16:12 ` [PATCH 2/2] mm: zpool: return pool size in pages Johannes Weiner
2024-03-11 22:12 ` Yosry Ahmed
@ 2024-03-12 4:56 ` Chengming Zhou
2024-03-12 9:15 ` Nhat Pham
2 siblings, 0 replies; 12+ messages in thread
From: Chengming Zhou @ 2024-03-12 4:56 UTC (permalink / raw)
To: Johannes Weiner, Andrew Morton
Cc: Yosry Ahmed, Nhat Pham, Chengming Zhou, linux-mm, linux-kernel
On 2024/3/12 00:12, Johannes Weiner wrote:
> All zswap backends track their pool sizes in pages. Currently they
> multiply by PAGE_SIZE for zswap, only for zswap to divide again in
> order to do limit math. Report pages directly.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
With the incremental diff, feel free to add:
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Thanks.
> ---
> mm/z3fold.c | 2 +-
> mm/zbud.c | 2 +-
> mm/zpool.c | 4 ++--
> mm/zsmalloc.c | 2 +-
> mm/zswap.c | 4 ++--
> 5 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 7ab05621052d..9bacacd4168c 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -1404,7 +1404,7 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle)
>
> static u64 z3fold_zpool_total_size(void *pool)
> {
> - return z3fold_get_pool_size(pool) * PAGE_SIZE;
> + return z3fold_get_pool_size(pool);
> }
>
> static struct zpool_driver z3fold_zpool_driver = {
> diff --git a/mm/zbud.c b/mm/zbud.c
> index 2190cc1f37b3..b7d8a22bbf5f 100644
> --- a/mm/zbud.c
> +++ b/mm/zbud.c
> @@ -412,7 +412,7 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle)
>
> static u64 zbud_zpool_total_size(void *pool)
> {
> - return zbud_get_pool_size(pool) * PAGE_SIZE;
> + return zbud_get_pool_size(pool);
> }
>
> static struct zpool_driver zbud_zpool_driver = {
> diff --git a/mm/zpool.c b/mm/zpool.c
> index 846410479c2f..410808aee7fe 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -324,9 +324,9 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
> * zpool_get_total_size() - The total size of the pool
> * @zpool: The zpool to check
> *
> - * This returns the total size in bytes of the pool.
> + * This returns the total size in pages of the pool.
> *
> - * Returns: Total size of the zpool in bytes.
> + * Returns: Total size of the zpool in pages.
> */
> u64 zpool_get_total_size(struct zpool *zpool)
> {
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 7d7cb3eaabe0..398f3856817f 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -401,7 +401,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
>
> static u64 zs_zpool_total_size(void *pool)
> {
> - return zs_get_total_pages(pool) << PAGE_SHIFT;
> + return zs_get_total_pages(pool);
> }
>
> static struct zpool_driver zs_zpool_driver = {
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 7c39327a7cc2..fe4343e416e0 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -507,7 +507,7 @@ static unsigned long zswap_max_pages(void)
> unsigned long zswap_total_pages(void)
> {
> struct zswap_pool *pool;
> - u64 total = 0;
> + unsigned long total = 0;
>
> rcu_read_lock();
> list_for_each_entry_rcu(pool, &zswap_pools, list) {
> @@ -518,7 +518,7 @@ unsigned long zswap_total_pages(void)
> }
> rcu_read_unlock();
>
> - return total >> PAGE_SHIFT;
> + return total;
> }
>
> /*********************************
* Re: [PATCH 2/2] mm: zpool: return pool size in pages
2024-03-11 16:12 ` [PATCH 2/2] mm: zpool: return pool size in pages Johannes Weiner
2024-03-11 22:12 ` Yosry Ahmed
2024-03-12 4:56 ` Chengming Zhou
@ 2024-03-12 9:15 ` Nhat Pham
2 siblings, 0 replies; 12+ messages in thread
From: Nhat Pham @ 2024-03-12 9:15 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Yosry Ahmed, Chengming Zhou, linux-mm, linux-kernel
On Mon, Mar 11, 2024 at 11:12 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> All zswap backends track their pool sizes in pages. Currently they
> multiply by PAGE_SIZE for zswap, only for zswap to divide again in
> order to do limit math. Report pages directly.
I've always found this to be weird. Perhaps the original author of
this API wants to support more fine-grained memory consumption
tracking?
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Anyway, let's just work with realities, rather than hypotheticals :)
With the renaming fixlet:
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
* Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking
2024-03-11 16:12 [PATCH 1/2] mm: zswap: optimize zswap pool size tracking Johannes Weiner
2024-03-11 16:12 ` [PATCH 2/2] mm: zpool: return pool size in pages Johannes Weiner
@ 2024-03-11 22:09 ` Yosry Ahmed
2024-03-12 2:34 ` Johannes Weiner
2024-03-12 4:55 ` Chengming Zhou
2024-03-12 9:12 ` Nhat Pham
3 siblings, 1 reply; 12+ messages in thread
From: Yosry Ahmed @ 2024-03-11 22:09 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Nhat Pham, Chengming Zhou, linux-mm, linux-kernel
On Mon, Mar 11, 2024 at 12:12:13PM -0400, Johannes Weiner wrote:
> Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> total cycles currently going into updating the zswap_pool_total_size.
Yikes. I have always hated that size update scheme FWIW.
I have also wondered whether it makes sense to just maintain the number
of pages in zswap as an atomic, like zswap_stored_pages. I guess your
proposed scheme is even cheaper for the load/invalidate paths because we
do nothing at all. It could be an option if the aggregation in other
paths ever becomes a problem, but we would need to make sure it
doesn't regress the load/invalidate paths. Just sharing some thoughts.
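For concreteness, a minimal sketch of that alternative (hypothetical, not
something this series implements; zswap_pool_pages is a made-up name):

	/* global page counter, maintained at store/free time */
	static atomic_long_t zswap_pool_pages = ATOMIC_LONG_INIT(0);

	/* writers: whenever the pool gains or loses a backing page */
	atomic_long_inc(&zswap_pool_pages);
	atomic_long_dec(&zswap_pool_pages);

	/* readers (limit check, meminfo, shrinker) become a single load */
	unsigned long total = atomic_long_read(&zswap_pool_pages);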
>
> There are three consumers of this counter:
> - store, to enforce the globally configured pool limit
> - meminfo & debugfs, to report the size to the user
> - shrink, to determine the batch size for each cycle
>
> Instead of aggregating everytime an entry enters or exits the zswap
> pool, aggregate the value from the zpools on-demand:
>
> - Stores aggregate the counter anyway upon success. Aggregating to
> check the limit instead is the same amount of work.
>
> - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> counter, but aren't exactly hotpaths.
>
> - Shrinking can aggregate once for every cycle instead of doing it for
> every freed entry. As the shrinker might work on tens or hundreds of
> objects per scan cycle, this is a large reduction in aggregations.
>
> The paths that benefit dramatically are swapin, swapoff, and
> unmaps. There could be millions of pages being processed until
> somebody asks for the pool size again. This eliminates the pool size
> updates from those paths entirely.
This looks like a big win, thanks! I wonder if you have any numbers or
perf profiles to share. That would be nice to have, but I think the
benefit is clear regardless.
I also like the implicit cleanup when we switch to maintaining the
number of pages rather than bytes. The code looks much better with all
the shifts and divisions gone :)
I have a couple of comments below. With them addressed, feel free to
add:
Acked-by: Yosry Ahmed <yosryahmed@google.com>
[..]
> @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> {
> struct mem_cgroup *memcg;
> int ret, failures = 0;
> + unsigned long thr;
> +
> + /* Reclaim down to the accept threshold */
> + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
This calculation is repeated twice, so I'd rather keep a helper for it
as an alternative to zswap_can_accept(). Perhaps zswap_threshold_page()
or zswap_acceptance_pages()?
>
> /* global reclaim will select cgroup in a round-robin fashion. */
> do {
> @@ -1432,10 +1416,9 @@ static void shrink_worker(struct work_struct *w)
> break;
> if (ret && ++failures == MAX_RECLAIM_RETRIES)
> break;
> -
> resched:
> cond_resched();
> - } while (!zswap_can_accept());
> + } while (zswap_total_pages() > thr);
> }
[..]
> @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
>
> static struct dentry *zswap_debugfs_root;
>
> +static int debugfs_get_total_size(void *data, u64 *val)
> +{
> + *val = zswap_total_pages() * PAGE_SIZE;
> + return 0;
> +}
> +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
I think we are missing a newline here to maintain the current format
(i.e "%llu\n").
> +
> static int zswap_debugfs_init(void)
> {
> if (!debugfs_initialized())
> @@ -1732,8 +1728,8 @@ static int zswap_debugfs_init(void)
> zswap_debugfs_root, &zswap_reject_compress_poor);
> debugfs_create_u64("written_back_pages", 0444,
> zswap_debugfs_root, &zswap_written_back_pages);
> - debugfs_create_u64("pool_total_size", 0444,
> - zswap_debugfs_root, &zswap_pool_total_size);
> + debugfs_create_file("pool_total_size", 0444,
> + zswap_debugfs_root, NULL, &total_size_fops);
> debugfs_create_atomic_t("stored_pages", 0444,
> zswap_debugfs_root, &zswap_stored_pages);
> debugfs_create_atomic_t("same_filled_pages", 0444,
> --
> 2.44.0
>
* Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking
2024-03-11 22:09 ` [PATCH 1/2] mm: zswap: optimize zswap pool size tracking Yosry Ahmed
@ 2024-03-12 2:34 ` Johannes Weiner
2024-03-12 4:04 ` Yosry Ahmed
0 siblings, 1 reply; 12+ messages in thread
From: Johannes Weiner @ 2024-03-12 2:34 UTC (permalink / raw)
To: Yosry Ahmed
Cc: Andrew Morton, Nhat Pham, Chengming Zhou, linux-mm, linux-kernel
On Mon, Mar 11, 2024 at 10:09:35PM +0000, Yosry Ahmed wrote:
> On Mon, Mar 11, 2024 at 12:12:13PM -0400, Johannes Weiner wrote:
> > Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> > total cycles currently going into updating the zswap_pool_total_size.
>
> Yikes. I have always hated that size update scheme FWIW.
>
> I have also wondered whether it makes sense to just maintain the number
> of pages in zswap as an atomic, like zswap_stored_pages. I guess your
> proposed scheme is even cheaper for the load/invalidate paths because we
> do nothing at all. It could be an option if the aggregation in other
> paths ever becomes a problem, but we would need to make sure it
> doesn't regress the load/invalidate paths. Just sharing some thoughts.
Agree with you there. I actually tried doing it that way at first, but
noticed zram uses zs_get_total_pages() and actually wants a per-pool
count. I didn't want the backend to have to update two atomics, so I
settled for this version.
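To spell out the two-atomics point, with z3fold's existing per-pool
pages_nr as the example (the extra global counter is hypothetical):

	/* existing per-pool count that zpool-independent users still need */
	atomic64_inc(&pool->pages_nr);

	/* plus a second, global count that only zswap would read */
	atomic_long_inc(&zswap_pool_pages);

and the same pair again on every backing page freed, in every backend.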
> > There are three consumers of this counter:
> > - store, to enforce the globally configured pool limit
> > - meminfo & debugfs, to report the size to the user
> > - shrink, to determine the batch size for each cycle
> >
> > Instead of aggregating everytime an entry enters or exits the zswap
> > pool, aggregate the value from the zpools on-demand:
> >
> > - Stores aggregate the counter anyway upon success. Aggregating to
> > check the limit instead is the same amount of work.
> >
> > - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> > counter, but aren't exactly hotpaths.
> >
> > - Shrinking can aggregate once for every cycle instead of doing it for
> > every freed entry. As the shrinker might work on tens or hundreds of
> > objects per scan cycle, this is a large reduction in aggregations.
> >
> > The paths that benefit dramatically are swapin, swapoff, and
> > unmaps. There could be millions of pages being processed until
> > somebody asks for the pool size again. This eliminates the pool size
> > updates from those paths entirely.
>
> This looks like a big win, thanks! I wonder if you have any numbers or
> perf profiles to share. That would be nice to have, but I think the
> benefit is clear regardless.
I deleted the perf files already, but can re-run it tomorrow.
> I also like the implicit cleanup when we switch to maintaining the
> number of pages rather than bytes. The code looks much better with all
> the shifts and divisions gone :)
>
> I have a couple of comments below. With them addressed, feel free to
> add:
> Acked-by: Yosry Ahmed <yosryahmed@google.com>
Thanks!
> > @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> > {
> > struct mem_cgroup *memcg;
> > int ret, failures = 0;
> > + unsigned long thr;
> > +
> > + /* Reclaim down to the accept threshold */
> > + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
>
> This calculation is repeated twice, so I'd rather keep a helper for it
> as an alternative to zswap_can_accept(). Perhaps zswap_threshold_page()
> or zswap_acceptance_pages()?
Sounds good. I went with zswap_accept_thr_pages().
> > @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
> >
> > static struct dentry *zswap_debugfs_root;
> >
> > +static int debugfs_get_total_size(void *data, u64 *val)
> > +{
> > + *val = zswap_total_pages() * PAGE_SIZE;
> > + return 0;
> > +}
> > +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
>
> I think we are missing a newline here to maintain the current format
> (i.e "%llu\n").
Oops, good catch! I had verified the debugfs file (along with the
others) with 'grep . *', which hides that this is missing. Fixed up.
Thanks for taking a look. The incremental diff is below. I'll run the
tests and recapture the numbers tomorrow, then send v2.
diff --git a/mm/zswap.c b/mm/zswap.c
index 7c39327a7cc2..1a5cc7298306 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -504,6 +504,11 @@ static unsigned long zswap_max_pages(void)
return totalram_pages() * zswap_max_pool_percent / 100;
}
+static unsigned long zswap_accept_thr_pages(void)
+{
+ return zswap_max_pages() * zswap_accept_thr_percent / 100;
+}
+
unsigned long zswap_total_pages(void)
{
struct zswap_pool *pool;
@@ -1368,7 +1373,7 @@ static void shrink_worker(struct work_struct *w)
unsigned long thr;
/* Reclaim down to the accept threshold */
- thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
+ thr = zswap_accept_thr_pages();
/* global reclaim will select cgroup in a round-robin fashion. */
do {
@@ -1493,9 +1498,7 @@ bool zswap_store(struct folio *folio)
}
if (zswap_pool_reached_full) {
- unsigned long thr = max_pages * zswap_accept_thr_percent / 100;
-
- if (cur_pages > thr)
+ if (cur_pages > zswap_accept_thr_pages())
goto shrink;
else
zswap_pool_reached_full = false;
@@ -1705,7 +1708,7 @@ static int debugfs_get_total_size(void *data, u64 *val)
*val = zswap_total_pages() * PAGE_SIZE;
return 0;
}
-DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
+DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n");
static int zswap_debugfs_init(void)
{
* Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking
2024-03-12 2:34 ` Johannes Weiner
@ 2024-03-12 4:04 ` Yosry Ahmed
0 siblings, 0 replies; 12+ messages in thread
From: Yosry Ahmed @ 2024-03-12 4:04 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Nhat Pham, Chengming Zhou, linux-mm, linux-kernel
On Mon, Mar 11, 2024 at 10:34:11PM -0400, Johannes Weiner wrote:
> On Mon, Mar 11, 2024 at 10:09:35PM +0000, Yosry Ahmed wrote:
> > On Mon, Mar 11, 2024 at 12:12:13PM -0400, Johannes Weiner wrote:
> > > Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> > > total cycles currently going into updating the zswap_pool_total_size.
> >
> > Yikes. I have always hated that size update scheme FWIW.
> >
> > I have also wondered whether it makes sense to just maintain the number
> > of pages in zswap as an atomic, like zswap_stored_pages. I guess your
> > proposed scheme is even cheaper for the load/invalidate paths because we
> > do nothing at all. It could be an option if the aggregation in other
> > paths ever becomes a problem, but we would need to make sure it
> > doesn't regress the load/invalidate paths. Just sharing some thoughts.
>
> Agree with you there. I actually tried doing it that way at first, but
> noticed zram uses zs_get_total_pages() and actually wants a per-pool
> count. I didn't want the backend to have to update two atomics, so I
> settled for this version.
Could be useful to document this context if you send a v2. This version
is a big improvement anyway, so hopefully we don't need to revisit.
>
> > > There are three consumers of this counter:
> > > - store, to enforce the globally configured pool limit
> > > - meminfo & debugfs, to report the size to the user
> > > - shrink, to determine the batch size for each cycle
> > >
> > > Instead of aggregating everytime an entry enters or exits the zswap
> > > pool, aggregate the value from the zpools on-demand:
> > >
> > > - Stores aggregate the counter anyway upon success. Aggregating to
> > > check the limit instead is the same amount of work.
> > >
> > > - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> > > counter, but aren't exactly hotpaths.
> > >
> > > - Shrinking can aggregate once for every cycle instead of doing it for
> > > every freed entry. As the shrinker might work on tens or hundreds of
> > > objects per scan cycle, this is a large reduction in aggregations.
> > >
> > > The paths that benefit dramatically are swapin, swapoff, and
> > > unmaps. There could be millions of pages being processed until
> > > somebody asks for the pool size again. This eliminates the pool size
> > > updates from those paths entirely.
> >
> > This looks like a big win, thanks! I wonder if you have any numbers or
> > perf profiles to share. That would be nice to have, but I think the
> > benefit is clear regardless.
>
> I deleted the perf files already, but can re-run it tomorrow.
Thanks!
>
> > I also like the implicit cleanup when we switch to maintaining the
> > number of pages rather than bytes. The code looks much better with all
> > the shifts and divisions gone :)
> >
> > I have a couple of comments below. With them addressed, feel free to
> > add:
> > Acked-by: Yosry Ahmed <yosryahmed@google.com>
>
> Thanks!
>
> > > @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> > > {
> > > struct mem_cgroup *memcg;
> > > int ret, failures = 0;
> > > + unsigned long thr;
> > > +
> > > + /* Reclaim down to the accept threshold */
> > > + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
> >
> > This calculation is repeated twice, so I'd rather keep a helper for it
> > as an alternative to zswap_can_accept(). Perhaps zswap_threshold_page()
> > or zswap_acceptance_pages()?
>
> Sounds good. I went with zswap_accept_thr_pages().
Even better.
>
> > > @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
> > >
> > > static struct dentry *zswap_debugfs_root;
> > >
> > > +static int debugfs_get_total_size(void *data, u64 *val)
> > > +{
> > > + *val = zswap_total_pages() * PAGE_SIZE;
> > > + return 0;
> > > +}
> > > +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
> >
> > I think we are missing a newline here to maintain the current format
> > (i.e "%llu\n").
>
> Oops, good catch! I had verified the debugfs file (along with the
> others) with 'grep . *', which hides that this is missing. Fixed up.
>
> Thanks for taking a look. The incremental diff is below. I'll run the
> tests and recapture the numbers tomorrow, then send v2.
LGTM. Feel free to carry the Ack forward.
* Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking
2024-03-11 16:12 [PATCH 1/2] mm: zswap: optimize zswap pool size tracking Johannes Weiner
2024-03-11 16:12 ` [PATCH 2/2] mm: zpool: return pool size in pages Johannes Weiner
2024-03-11 22:09 ` [PATCH 1/2] mm: zswap: optimize zswap pool size tracking Yosry Ahmed
@ 2024-03-12 4:55 ` Chengming Zhou
2024-03-12 9:12 ` Nhat Pham
3 siblings, 0 replies; 12+ messages in thread
From: Chengming Zhou @ 2024-03-12 4:55 UTC (permalink / raw)
To: Johannes Weiner, Andrew Morton
Cc: Yosry Ahmed, Nhat Pham, Chengming Zhou, linux-mm, linux-kernel
On 2024/3/12 00:12, Johannes Weiner wrote:
> Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> total cycles currently going into updating the zswap_pool_total_size.
>
> There are three consumers of this counter:
> - store, to enforce the globally configured pool limit
> - meminfo & debugfs, to report the size to the user
> - shrink, to determine the batch size for each cycle
>
> Instead of aggregating everytime an entry enters or exits the zswap
> pool, aggregate the value from the zpools on-demand:
>
> - Stores aggregate the counter anyway upon success. Aggregating to
> check the limit instead is the same amount of work.
>
> - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> counter, but aren't exactly hotpaths.
>
> - Shrinking can aggregate once for every cycle instead of doing it for
> every freed entry. As the shrinker might work on tens or hundreds of
> objects per scan cycle, this is a large reduction in aggregations.
>
> The paths that benefit dramatically are swapin, swapoff, and
> unmaps. There could be millions of pages being processed until
> somebody asks for the pool size again. This eliminates the pool size
> updates from those paths entirely.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Great! This is a clever simplification and optimization.
With the incremental diff, feel free to add:
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Thanks.
> ---
> fs/proc/meminfo.c | 3 +-
> include/linux/zswap.h | 2 +-
> mm/zswap.c | 98 +++++++++++++++++++++----------------------
> 3 files changed, 49 insertions(+), 54 deletions(-)
>
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 45af9a989d40..245171d9164b 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "SwapTotal: ", i.totalswap);
> show_val_kb(m, "SwapFree: ", i.freeswap);
> #ifdef CONFIG_ZSWAP
> - seq_printf(m, "Zswap: %8lu kB\n",
> - (unsigned long)(zswap_pool_total_size >> 10));
> + show_val_kb(m, "Zswap: ", zswap_total_pages());
> seq_printf(m, "Zswapped: %8lu kB\n",
> (unsigned long)atomic_read(&zswap_stored_pages) <<
> (PAGE_SHIFT - 10));
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index 341aea490070..2a85b941db97 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -7,7 +7,6 @@
>
> struct lruvec;
>
> -extern u64 zswap_pool_total_size;
> extern atomic_t zswap_stored_pages;
>
> #ifdef CONFIG_ZSWAP
> @@ -27,6 +26,7 @@ struct zswap_lruvec_state {
> atomic_long_t nr_zswap_protected;
> };
>
> +unsigned long zswap_total_pages(void);
> bool zswap_store(struct folio *folio);
> bool zswap_load(struct folio *folio);
> void zswap_invalidate(swp_entry_t swp);
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 9a3237752082..7c39327a7cc2 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -43,8 +43,6 @@
> /*********************************
> * statistics
> **********************************/
> -/* Total bytes used by the compressed storage */
> -u64 zswap_pool_total_size;
> /* The number of compressed pages currently stored in zswap */
> atomic_t zswap_stored_pages = ATOMIC_INIT(0);
> /* The number of same-value filled pages currently stored in zswap */
> @@ -264,45 +262,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
> pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
> zpool_get_type((p)->zpools[0]))
>
> -static bool zswap_is_full(void)
> -{
> - return totalram_pages() * zswap_max_pool_percent / 100 <
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static bool zswap_can_accept(void)
> -{
> - return totalram_pages() * zswap_accept_thr_percent / 100 *
> - zswap_max_pool_percent / 100 >
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static u64 get_zswap_pool_size(struct zswap_pool *pool)
> -{
> - u64 pool_size = 0;
> - int i;
> -
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> - pool_size += zpool_get_total_size(pool->zpools[i]);
> -
> - return pool_size;
> -}
> -
> -static void zswap_update_total_size(void)
> -{
> - struct zswap_pool *pool;
> - u64 total = 0;
> -
> - rcu_read_lock();
> -
> - list_for_each_entry_rcu(pool, &zswap_pools, list)
> - total += get_zswap_pool_size(pool);
> -
> - rcu_read_unlock();
> -
> - zswap_pool_total_size = total;
> -}
> -
> /*********************************
> * pool functions
> **********************************/
> @@ -540,6 +499,28 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> return NULL;
> }
>
> +static unsigned long zswap_max_pages(void)
> +{
> + return totalram_pages() * zswap_max_pool_percent / 100;
> +}
> +
> +unsigned long zswap_total_pages(void)
> +{
> + struct zswap_pool *pool;
> + u64 total = 0;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(pool, &zswap_pools, list) {
> + int i;
> +
> + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + total += zpool_get_total_size(pool->zpools[i]);
> + }
> + rcu_read_unlock();
> +
> + return total >> PAGE_SHIFT;
> +}
> +
> /*********************************
> * param callbacks
> **********************************/
> @@ -912,7 +893,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
> }
> zswap_entry_cache_free(entry);
> atomic_dec(&zswap_stored_pages);
> - zswap_update_total_size();
> }
>
> /*
> @@ -1317,7 +1297,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
> nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
> #else
> /* use pool stats instead of memcg stats */
> - nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
> + nr_backing = zswap_total_pages();
> nr_stored = atomic_read(&zswap_nr_stored);
> #endif
>
> @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> {
> struct mem_cgroup *memcg;
> int ret, failures = 0;
> + unsigned long thr;
> +
> + /* Reclaim down to the accept threshold */
> + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
>
> /* global reclaim will select cgroup in a round-robin fashion. */
> do {
> @@ -1432,10 +1416,9 @@ static void shrink_worker(struct work_struct *w)
> break;
> if (ret && ++failures == MAX_RECLAIM_RETRIES)
> break;
> -
> resched:
> cond_resched();
> - } while (!zswap_can_accept());
> + } while (zswap_total_pages() > thr);
> }
>
> static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
> @@ -1476,6 +1459,7 @@ bool zswap_store(struct folio *folio)
> struct zswap_entry *entry, *dupentry;
> struct obj_cgroup *objcg = NULL;
> struct mem_cgroup *memcg = NULL;
> + unsigned long max_pages, cur_pages;
>
> VM_WARN_ON_ONCE(!folio_test_locked(folio));
> VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1487,6 +1471,7 @@ bool zswap_store(struct folio *folio)
> if (!zswap_enabled)
> goto check_old;
>
> + /* Check cgroup limits */
> objcg = get_obj_cgroup_from_folio(folio);
> if (objcg && !obj_cgroup_may_zswap(objcg)) {
> memcg = get_mem_cgroup_from_objcg(objcg);
> @@ -1497,15 +1482,20 @@ bool zswap_store(struct folio *folio)
> mem_cgroup_put(memcg);
> }
>
> - /* reclaim space if needed */
> - if (zswap_is_full()) {
> + /* Check global limits */
> + cur_pages = zswap_total_pages();
> + max_pages = zswap_max_pages();
> +
> + if (cur_pages >= max_pages) {
> zswap_pool_limit_hit++;
> zswap_pool_reached_full = true;
> goto shrink;
> }
>
> if (zswap_pool_reached_full) {
> - if (!zswap_can_accept())
> + unsigned long thr = max_pages * zswap_accept_thr_percent / 100;
> +
> + if (cur_pages > thr)
> goto shrink;
> else
> zswap_pool_reached_full = false;
> @@ -1581,7 +1571,6 @@ bool zswap_store(struct folio *folio)
>
> /* update stats */
> atomic_inc(&zswap_stored_pages);
> - zswap_update_total_size();
> count_vm_event(ZSWPOUT);
>
> return true;
> @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
>
> static struct dentry *zswap_debugfs_root;
>
> +static int debugfs_get_total_size(void *data, u64 *val)
> +{
> + *val = zswap_total_pages() * PAGE_SIZE;
> + return 0;
> +}
> +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
> +
> static int zswap_debugfs_init(void)
> {
> if (!debugfs_initialized())
> @@ -1732,8 +1728,8 @@ static int zswap_debugfs_init(void)
> zswap_debugfs_root, &zswap_reject_compress_poor);
> debugfs_create_u64("written_back_pages", 0444,
> zswap_debugfs_root, &zswap_written_back_pages);
> - debugfs_create_u64("pool_total_size", 0444,
> - zswap_debugfs_root, &zswap_pool_total_size);
> + debugfs_create_file("pool_total_size", 0444,
> + zswap_debugfs_root, NULL, &total_size_fops);
> debugfs_create_atomic_t("stored_pages", 0444,
> zswap_debugfs_root, &zswap_stored_pages);
> debugfs_create_atomic_t("same_filled_pages", 0444,
* Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking
2024-03-11 16:12 [PATCH 1/2] mm: zswap: optimize zswap pool size tracking Johannes Weiner
` (2 preceding siblings ...)
2024-03-12 4:55 ` Chengming Zhou
@ 2024-03-12 9:12 ` Nhat Pham
3 siblings, 0 replies; 12+ messages in thread
From: Nhat Pham @ 2024-03-12 9:12 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Yosry Ahmed, Chengming Zhou, linux-mm, linux-kernel
On Mon, Mar 11, 2024 at 11:12 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> total cycles currently going into updating the zswap_pool_total_size.
>
> There are three consumers of this counter:
> - store, to enforce the globally configured pool limit
> - meminfo & debugfs, to report the size to the user
> - shrink, to determine the batch size for each cycle
>
> Instead of aggregating everytime an entry enters or exits the zswap
> pool, aggregate the value from the zpools on-demand:
>
> - Stores aggregate the counter anyway upon success. Aggregating to
> check the limit instead is the same amount of work.
>
> - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> counter, but aren't exactly hotpaths.
>
> - Shrinking can aggregate once for every cycle instead of doing it for
> every freed entry. As the shrinker might work on tens or hundreds of
> objects per scan cycle, this is a large reduction in aggregations.
Nice!
>
> The paths that benefit dramatically are swapin, swapoff, and
> unmaps. There could be millions of pages being processed until
> somebody asks for the pool size again. This eliminates the pool size
> updates from those paths entirely.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
With your fixlet applied:
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
> ---
> fs/proc/meminfo.c | 3 +-
> include/linux/zswap.h | 2 +-
> mm/zswap.c | 98 +++++++++++++++++++++----------------------
> 3 files changed, 49 insertions(+), 54 deletions(-)
>
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 45af9a989d40..245171d9164b 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "SwapTotal: ", i.totalswap);
> show_val_kb(m, "SwapFree: ", i.freeswap);
> #ifdef CONFIG_ZSWAP
> - seq_printf(m, "Zswap: %8lu kB\n",
> - (unsigned long)(zswap_pool_total_size >> 10));
> + show_val_kb(m, "Zswap: ", zswap_total_pages());
> seq_printf(m, "Zswapped: %8lu kB\n",
> (unsigned long)atomic_read(&zswap_stored_pages) <<
> (PAGE_SHIFT - 10));
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index 341aea490070..2a85b941db97 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -7,7 +7,6 @@
>
> struct lruvec;
>
> -extern u64 zswap_pool_total_size;
> extern atomic_t zswap_stored_pages;
>
> #ifdef CONFIG_ZSWAP
> @@ -27,6 +26,7 @@ struct zswap_lruvec_state {
> atomic_long_t nr_zswap_protected;
> };
>
> +unsigned long zswap_total_pages(void);
> bool zswap_store(struct folio *folio);
> bool zswap_load(struct folio *folio);
> void zswap_invalidate(swp_entry_t swp);
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 9a3237752082..7c39327a7cc2 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -43,8 +43,6 @@
> /*********************************
> * statistics
> **********************************/
> -/* Total bytes used by the compressed storage */
> -u64 zswap_pool_total_size;
> /* The number of compressed pages currently stored in zswap */
> atomic_t zswap_stored_pages = ATOMIC_INIT(0);
> /* The number of same-value filled pages currently stored in zswap */
> @@ -264,45 +262,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
> pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
> zpool_get_type((p)->zpools[0]))
>
> -static bool zswap_is_full(void)
> -{
> - return totalram_pages() * zswap_max_pool_percent / 100 <
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static bool zswap_can_accept(void)
> -{
> - return totalram_pages() * zswap_accept_thr_percent / 100 *
> - zswap_max_pool_percent / 100 >
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static u64 get_zswap_pool_size(struct zswap_pool *pool)
> -{
> - u64 pool_size = 0;
> - int i;
> -
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> - pool_size += zpool_get_total_size(pool->zpools[i]);
> -
> - return pool_size;
> -}
> -
> -static void zswap_update_total_size(void)
> -{
> - struct zswap_pool *pool;
> - u64 total = 0;
> -
> - rcu_read_lock();
> -
> - list_for_each_entry_rcu(pool, &zswap_pools, list)
> - total += get_zswap_pool_size(pool);
> -
> - rcu_read_unlock();
> -
> - zswap_pool_total_size = total;
> -}
> -
> /*********************************
> * pool functions
> **********************************/
> @@ -540,6 +499,28 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> return NULL;
> }
>
> +static unsigned long zswap_max_pages(void)
> +{
> + return totalram_pages() * zswap_max_pool_percent / 100;
> +}
> +
> +unsigned long zswap_total_pages(void)
> +{
> + struct zswap_pool *pool;
> + u64 total = 0;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(pool, &zswap_pools, list) {
> + int i;
> +
> + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + total += zpool_get_total_size(pool->zpools[i]);
> + }
> + rcu_read_unlock();
> +
> + return total >> PAGE_SHIFT;
> +}
> +
> /*********************************
> * param callbacks
> **********************************/
> @@ -912,7 +893,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
> }
> zswap_entry_cache_free(entry);
> atomic_dec(&zswap_stored_pages);
> - zswap_update_total_size();
> }
>
> /*
> @@ -1317,7 +1297,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
> nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
> #else
> /* use pool stats instead of memcg stats */
> - nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
> + nr_backing = zswap_total_pages();
> nr_stored = atomic_read(&zswap_nr_stored);
> #endif
>
> @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> {
> struct mem_cgroup *memcg;
> int ret, failures = 0;
> + unsigned long thr;
> +
> + /* Reclaim down to the accept threshold */
> + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
>
> /* global reclaim will select cgroup in a round-robin fashion. */
> do {
> @@ -1432,10 +1416,9 @@ static void shrink_worker(struct work_struct *w)
> break;
> if (ret && ++failures == MAX_RECLAIM_RETRIES)
> break;
> -
> resched:
> cond_resched();
> - } while (!zswap_can_accept());
> + } while (zswap_total_pages() > thr);
> }
>
> static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
> @@ -1476,6 +1459,7 @@ bool zswap_store(struct folio *folio)
> struct zswap_entry *entry, *dupentry;
> struct obj_cgroup *objcg = NULL;
> struct mem_cgroup *memcg = NULL;
> + unsigned long max_pages, cur_pages;
>
> VM_WARN_ON_ONCE(!folio_test_locked(folio));
> VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1487,6 +1471,7 @@ bool zswap_store(struct folio *folio)
> if (!zswap_enabled)
> goto check_old;
>
> + /* Check cgroup limits */
> objcg = get_obj_cgroup_from_folio(folio);
> if (objcg && !obj_cgroup_may_zswap(objcg)) {
> memcg = get_mem_cgroup_from_objcg(objcg);
> @@ -1497,15 +1482,20 @@ bool zswap_store(struct folio *folio)
> mem_cgroup_put(memcg);
> }
>
> - /* reclaim space if needed */
> - if (zswap_is_full()) {
> + /* Check global limits */
> + cur_pages = zswap_total_pages();
> + max_pages = zswap_max_pages();
> +
> + if (cur_pages >= max_pages) {
> zswap_pool_limit_hit++;
> zswap_pool_reached_full = true;
> goto shrink;
> }
>
> if (zswap_pool_reached_full) {
> - if (!zswap_can_accept())
> + unsigned long thr = max_pages * zswap_accept_thr_percent / 100;
> +
> + if (cur_pages > thr)
> goto shrink;
> else
> zswap_pool_reached_full = false;
> @@ -1581,7 +1571,6 @@ bool zswap_store(struct folio *folio)
>
> /* update stats */
> atomic_inc(&zswap_stored_pages);
> - zswap_update_total_size();
> count_vm_event(ZSWPOUT);
>
> return true;
> @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
>
> static struct dentry *zswap_debugfs_root;
>
> +static int debugfs_get_total_size(void *data, u64 *val)
> +{
> + *val = zswap_total_pages() * PAGE_SIZE;
> + return 0;
> +}
> +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
> +
> static int zswap_debugfs_init(void)
> {
> if (!debugfs_initialized())
> @@ -1732,8 +1728,8 @@ static int zswap_debugfs_init(void)
> zswap_debugfs_root, &zswap_reject_compress_poor);
> debugfs_create_u64("written_back_pages", 0444,
> zswap_debugfs_root, &zswap_written_back_pages);
> - debugfs_create_u64("pool_total_size", 0444,
> - zswap_debugfs_root, &zswap_pool_total_size);
> + debugfs_create_file("pool_total_size", 0444,
> + zswap_debugfs_root, NULL, &total_size_fops);
> debugfs_create_atomic_t("stored_pages", 0444,
> zswap_debugfs_root, &zswap_stored_pages);
> debugfs_create_atomic_t("same_filled_pages", 0444,
> --
> 2.44.0
>