* [PATCH 1/2] mm: Split BLK_FEAT_SYNCHRONOUS and SWP_SYNCHRONOUS_IO into separate read and write flags
2025-03-07 12:01 [PATCH 0/2] Improve Zram by separating compression context from kswapd Qun-Wei Lin
@ 2025-03-07 12:01 ` Qun-Wei Lin
2025-03-07 12:01 ` [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression Qun-Wei Lin
` (3 subsequent siblings)
4 siblings, 0 replies; 39+ messages in thread
From: Qun-Wei Lin @ 2025-03-07 12:01 UTC (permalink / raw)
To: Jens Axboe, Minchan Kim, Sergey Senozhatsky, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg,
Barry Song, Al Viro
Cc: linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu,
Qun-Wei Lin
Split the BLK_FEAT_SYNCHRONOUS feature flag into two separate flags,
BLK_FEAT_READ_SYNCHRONOUS and BLK_FEAT_WRITE_SYNCHRONOUS. Similarly,
split the SWP_SYNCHRONOUS_IO flag into SWP_READ_SYNCHRONOUS_IO and
SWP_WRITE_SYNCHRONOUS_IO.

These changes are motivated by swap devices that service read
operations synchronously but write operations asynchronously. The
existing BLK_FEAT_SYNCHRONOUS and SWP_SYNCHRONOUS_IO flags cannot
describe such devices, because they assert synchronous behavior for
both read and write operations.
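As an illustration (a sketch only, not part of this patch; the device
is hypothetical), a device whose reads complete in submit context but
whose writes do not would now advertise only the read-side flag:

	/* hypothetical device: synchronous reads, asynchronous writes */
	struct queue_limits lim = {
		.features = BLK_FEAT_READ_SYNCHRONOUS,
	};

swapon() then sets SWP_READ_SYNCHRONOUS_IO but not
SWP_WRITE_SYNCHRONOUS_IO for such a device, so do_swap_page() can
still skip the swapcache on reads while writes take the
swap_writepage_bdev_async() path.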
Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
---
drivers/block/brd.c | 3 ++-
drivers/block/zram/zram_drv.c | 5 +++--
drivers/nvdimm/btt.c | 3 ++-
drivers/nvdimm/pmem.c | 5 +++--
include/linux/blkdev.h | 24 ++++++++++++++++--------
include/linux/swap.h | 31 ++++++++++++++++---------------
mm/memory.c | 4 ++--
mm/page_io.c | 6 +++---
mm/swapfile.c | 7 +++++--
9 files changed, 52 insertions(+), 36 deletions(-)
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 292f127cae0a..66920b9d4701 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -370,7 +370,8 @@ static int brd_alloc(int i)
.max_hw_discard_sectors = UINT_MAX,
.max_discard_segments = 1,
.discard_granularity = PAGE_SIZE,
- .features = BLK_FEAT_SYNCHRONOUS |
+ .features = BLK_FEAT_READ_SYNCHRONOUS |
+ BLK_FEAT_WRITE_SYNCHRONOUS |
BLK_FEAT_NOWAIT,
};
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 3dee026988dc..2e1a70f2f4bd 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -2535,8 +2535,9 @@ static int zram_add(void)
#if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
.max_write_zeroes_sectors = UINT_MAX,
#endif
- .features = BLK_FEAT_STABLE_WRITES |
- BLK_FEAT_SYNCHRONOUS,
+ .features = BLK_FEAT_STABLE_WRITES |
+ BLK_FEAT_READ_SYNCHRONOUS |
+ BLK_FEAT_WRITE_SYNCHRONOUS,
};
struct zram *zram;
int ret, device_id;
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 423dcd190906..1665d98f51af 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1501,7 +1501,8 @@ static int btt_blk_init(struct btt *btt)
.logical_block_size = btt->sector_size,
.max_hw_sectors = UINT_MAX,
.max_integrity_segments = 1,
- .features = BLK_FEAT_SYNCHRONOUS,
+ .features = BLK_FEAT_READ_SYNCHRONOUS |
+ BLK_FEAT_WRITE_SYNCHRONOUS,
};
int rc;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index d81faa9d89c9..81a57d7ca746 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -455,8 +455,9 @@ static int pmem_attach_disk(struct device *dev,
.logical_block_size = pmem_sector_size(ndns),
.physical_block_size = PAGE_SIZE,
.max_hw_sectors = UINT_MAX,
- .features = BLK_FEAT_WRITE_CACHE |
- BLK_FEAT_SYNCHRONOUS,
+ .features = BLK_FEAT_WRITE_CACHE |
+ BLK_FEAT_READ_SYNCHRONOUS |
+ BLK_FEAT_WRITE_SYNCHRONOUS,
};
int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 08a727b40816..3070f2e9d862 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -305,20 +305,23 @@ typedef unsigned int __bitwise blk_features_t;
/* don't modify data until writeback is done */
#define BLK_FEAT_STABLE_WRITES ((__force blk_features_t)(1u << 5))
-/* always completes in submit context */
-#define BLK_FEAT_SYNCHRONOUS ((__force blk_features_t)(1u << 6))
+/* read operations always complete in submit context */
+#define BLK_FEAT_READ_SYNCHRONOUS ((__force blk_features_t)(1u << 6))
+
+/* write operations always complete in submit context */
+#define BLK_FEAT_WRITE_SYNCHRONOUS ((__force blk_features_t)(1u << 7))
/* supports REQ_NOWAIT */
-#define BLK_FEAT_NOWAIT ((__force blk_features_t)(1u << 7))
+#define BLK_FEAT_NOWAIT ((__force blk_features_t)(1u << 8))
/* supports DAX */
-#define BLK_FEAT_DAX ((__force blk_features_t)(1u << 8))
+#define BLK_FEAT_DAX ((__force blk_features_t)(1u << 9))
/* supports I/O polling */
-#define BLK_FEAT_POLL ((__force blk_features_t)(1u << 9))
+#define BLK_FEAT_POLL ((__force blk_features_t)(1u << 10))
/* is a zoned device */
-#define BLK_FEAT_ZONED ((__force blk_features_t)(1u << 10))
+#define BLK_FEAT_ZONED ((__force blk_features_t)(1u << 11))
/* supports PCI(e) p2p requests */
#define BLK_FEAT_PCI_P2PDMA ((__force blk_features_t)(1u << 12))
@@ -1321,9 +1324,14 @@ static inline bool bdev_nonrot(struct block_device *bdev)
return blk_queue_nonrot(bdev_get_queue(bdev));
}
-static inline bool bdev_synchronous(struct block_device *bdev)
+static inline bool bdev_read_synchronous(struct block_device *bdev)
+{
+ return bdev->bd_disk->queue->limits.features & BLK_FEAT_READ_SYNCHRONOUS;
+}
+
+static inline bool bdev_write_synchronous(struct block_device *bdev)
{
- return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS;
+ return bdev->bd_disk->queue->limits.features & BLK_FEAT_WRITE_SYNCHRONOUS;
}
static inline bool bdev_stable_writes(struct block_device *bdev)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f3e0ac20c2e8..2068b6973648 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -205,21 +205,22 @@ struct swap_extent {
offsetof(union swap_header, info.badpages)) / sizeof(int))
enum {
- SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
- SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
- SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */
- SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
- SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
- SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
- SWP_BLKDEV = (1 << 6), /* its a block device */
- SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */
- SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */
- SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */
- SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
- SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
- SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
- /* add others here before... */
- SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */
+ SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
+ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
+ SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */
+ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
+ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
+ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
+ SWP_BLKDEV = (1 << 6), /* its a block device */
+ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */
+ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */
+ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */
+ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
+ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
+ SWP_READ_SYNCHRONOUS_IO = (1 << 12), /* synchronous read IO is efficient */
+ SWP_WRITE_SYNCHRONOUS_IO = (1 << 13), /* synchronous write IO is efficient */
+ /* add others here before... */
+ SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */
};
#define SWAP_CLUSTER_MAX 32UL
diff --git a/mm/memory.c b/mm/memory.c
index 75c2dfd04f72..56c864d5d787 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4293,7 +4293,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
swapcache = folio;
if (!folio) {
- if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
+ if (data_race(si->flags & SWP_READ_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
/* skip swapcache */
folio = alloc_swap_folio(vmf);
@@ -4430,7 +4430,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_nomap;
}
- /* allocated large folios for SWP_SYNCHRONOUS_IO */
+ /* allocated large folios for SWP_READ_SYNCHRONOUS_IO */
if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
unsigned long nr = folio_nr_pages(folio);
unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
diff --git a/mm/page_io.c b/mm/page_io.c
index 4b4ea8e49cf6..d692eafdd90c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -465,10 +465,10 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc)
swap_writepage_fs(folio, wbc);
/*
* ->flags can be updated non-atomicially (scan_swap_map_slots),
- * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
+ * but that will never affect SWP_WRITE_SYNCHRONOUS_IO, so the data_race
* is safe.
*/
- else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
+ else if (data_race(sis->flags & SWP_WRITE_SYNCHRONOUS_IO))
swap_writepage_bdev_sync(folio, wbc, sis);
else
swap_writepage_bdev_async(folio, wbc, sis);
@@ -616,7 +616,7 @@ static void swap_read_folio_bdev_async(struct folio *folio,
void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
{
struct swap_info_struct *sis = swp_swap_info(folio->swap);
- bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO;
+ bool synchronous = sis->flags & SWP_READ_SYNCHRONOUS_IO;
bool workingset = folio_test_workingset(folio);
unsigned long pflags;
bool in_thrashing;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b0a9071cfe1d..902e5698af44 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3488,8 +3488,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (si->bdev && bdev_stable_writes(si->bdev))
si->flags |= SWP_STABLE_WRITES;
- if (si->bdev && bdev_synchronous(si->bdev))
- si->flags |= SWP_SYNCHRONOUS_IO;
+ if (si->bdev && bdev_read_synchronous(si->bdev))
+ si->flags |= SWP_READ_SYNCHRONOUS_IO;
+
+ if (si->bdev && bdev_write_synchronous(si->bdev))
+ si->flags |= SWP_WRITE_SYNCHRONOUS_IO;
if (si->bdev && bdev_nonrot(si->bdev)) {
si->flags |= SWP_SOLIDSTATE;
--
2.45.2
* [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-07 12:01 [PATCH 0/2] Improve Zram by separating compression context from kswapd Qun-Wei Lin
2025-03-07 12:01 ` [PATCH 1/2] mm: Split BLK_FEAT_SYNCHRONOUS and SWP_SYNCHRONOUS_IO into separate read and write flags Qun-Wei Lin
@ 2025-03-07 12:01 ` Qun-Wei Lin
2025-03-07 19:41 ` Barry Song
2025-03-07 19:34 ` [PATCH 0/2] Improve Zram by separating compression context from kswapd Barry Song
` (2 subsequent siblings)
4 siblings, 1 reply; 39+ messages in thread
From: Qun-Wei Lin @ 2025-03-07 12:01 UTC (permalink / raw)
To: Jens Axboe, Minchan Kim, Sergey Senozhatsky, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg,
Barry Song, Al Viro
Cc: linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu,
Qun-Wei Lin
Introduced Kcompressd to offload zram page compression, improving
system efficiency by handling compression separately from memory
reclaiming. Added necessary configurations and dependencies.
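The resulting write path (a sketch of the flow implemented below, for
orientation):

	zram_submit_bio(REQ_OP_WRITE)
	  -> schedule_bio_write(zram, bio, zram_bio_write_callback)
	       kfifo_in(&kcompress[i].write_fifo, &entry, ...)  /* queue work */
	       wake_up_interruptible(&kcompress[i].kcompressd_wait)
	...
	kcompressd thread:
	  kfifo_out(...) -> entry.cb(entry.mem, entry.bio) -> bio_put(bio)

If no queue has room, or the caller is not kswapd, schedule_bio_write()
returns -EBUSY and zram falls back to compressing synchronously in
zram_bio_write().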
Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
---
drivers/block/zram/Kconfig | 11 ++
drivers/block/zram/Makefile | 3 +-
drivers/block/zram/kcompressd.c | 340 ++++++++++++++++++++++++++++++++
drivers/block/zram/kcompressd.h | 25 +++
drivers/block/zram/zram_drv.c | 22 ++-
5 files changed, 397 insertions(+), 4 deletions(-)
create mode 100644 drivers/block/zram/kcompressd.c
create mode 100644 drivers/block/zram/kcompressd.h
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index 402b7b175863..f0a1b574f770 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -145,3 +145,14 @@ config ZRAM_MULTI_COMP
re-compress pages using a potentially slower but more effective
compression algorithm. Note, that IDLE page recompression
requires ZRAM_TRACK_ENTRY_ACTIME.
+
+config KCOMPRESSD
+ tristate "Kcompressd: Accelerated zram compression"
+ depends on ZRAM
+ help
+ Kcompressd creates multiple daemons to accelerate the compression of pages
+ in zram, offloading this time-consuming task from the zram driver.
+
+ This approach improves system efficiency by handling page compression separately,
+ which was originally done by kswapd or direct reclaim.
+
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 0fdefd576691..23baa5dfceb9 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -9,4 +9,5 @@ zram-$(CONFIG_ZRAM_BACKEND_ZSTD) += backend_zstd.o
zram-$(CONFIG_ZRAM_BACKEND_DEFLATE) += backend_deflate.o
zram-$(CONFIG_ZRAM_BACKEND_842) += backend_842.o
-obj-$(CONFIG_ZRAM) += zram.o
+obj-$(CONFIG_ZRAM) += zram.o
+obj-$(CONFIG_KCOMPRESSD) += kcompressd.o
diff --git a/drivers/block/zram/kcompressd.c b/drivers/block/zram/kcompressd.c
new file mode 100644
index 000000000000..195b7e386869
--- /dev/null
+++ b/drivers/block/zram/kcompressd.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 MediaTek Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/bitops.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/psi.h>
+#include <linux/kfifo.h>
+#include <linux/swap.h>
+#include <linux/delay.h>
+
+#include "kcompressd.h"
+
+#define INIT_QUEUE_SIZE 4096
+#define DEFAULT_NR_KCOMPRESSD 4
+
+static atomic_t enable_kcompressd;
+static unsigned int nr_kcompressd;
+static unsigned int queue_size_per_kcompressd;
+static struct kcompress *kcompress;
+
+enum run_state {
+ KCOMPRESSD_NOT_STARTED = 0,
+ KCOMPRESSD_RUNNING,
+ KCOMPRESSD_SLEEPING,
+};
+
+struct kcompressd_para {
+ wait_queue_head_t *kcompressd_wait;
+ struct kfifo *write_fifo;
+ atomic_t *running;
+};
+
+static struct kcompressd_para *kcompressd_para;
+static BLOCKING_NOTIFIER_HEAD(kcompressd_notifier_list);
+
+struct write_work {
+ void *mem;
+ struct bio *bio;
+ compress_callback cb;
+};
+
+int kcompressd_enabled(void)
+{
+ return likely(atomic_read(&enable_kcompressd));
+}
+EXPORT_SYMBOL(kcompressd_enabled);
+
+static void kcompressd_try_to_sleep(struct kcompressd_para *p)
+{
+ DEFINE_WAIT(wait);
+
+ if (!kfifo_is_empty(p->write_fifo))
+ return;
+
+ if (freezing(current) || kthread_should_stop())
+ return;
+
+ atomic_set(p->running, KCOMPRESSD_SLEEPING);
+ prepare_to_wait(p->kcompressd_wait, &wait, TASK_INTERRUPTIBLE);
+
+ /*
+ * After a short sleep, check if it was a premature sleep. If not, then
+ * go fully to sleep until explicitly woken up.
+ */
+ if (!kthread_should_stop() && kfifo_is_empty(p->write_fifo))
+ schedule();
+
+ finish_wait(p->kcompressd_wait, &wait);
+ atomic_set(p->running, KCOMPRESSD_RUNNING);
+}
+
+static int kcompressd(void *para)
+{
+ struct task_struct *tsk = current;
+ struct kcompressd_para *p = (struct kcompressd_para *)para;
+
+ tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ bool ret;
+
+ kcompressd_try_to_sleep(p);
+ ret = try_to_freeze();
+ if (kthread_should_stop())
+ break;
+
+ if (ret)
+ continue;
+
+ while (!kfifo_is_empty(p->write_fifo)) {
+ struct write_work entry;
+
+ if (sizeof(struct write_work) == kfifo_out(p->write_fifo,
+ &entry, sizeof(struct write_work))) {
+ entry.cb(entry.mem, entry.bio);
+ bio_put(entry.bio);
+ }
+ }
+
+ }
+
+ tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
+ atomic_set(p->running, KCOMPRESSD_NOT_STARTED);
+ return 0;
+}
+
+static int init_write_queue(void)
+{
+ int i;
+ unsigned int queue_len = queue_size_per_kcompressd * sizeof(struct write_work);
+
+ for (i = 0; i < nr_kcompressd; i++) {
+ if (kfifo_alloc(&kcompress[i].write_fifo,
+ queue_len, GFP_KERNEL)) {
+ pr_err("Failed to alloc kfifo %d\n", i);
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+static void clean_bio_queue(int idx)
+{
+ struct write_work entry;
+
+ while (sizeof(struct write_work) == kfifo_out(&kcompress[idx].write_fifo,
+ &entry, sizeof(struct write_work))) {
+ bio_put(entry.bio);
+ entry.cb(entry.mem, entry.bio);
+ }
+ kfifo_free(&kcompress[idx].write_fifo);
+}
+
+static int kcompress_update(void)
+{
+ int i;
+ int ret;
+
+ kcompress = kvmalloc_array(nr_kcompressd, sizeof(struct kcompress), GFP_KERNEL);
+ if (!kcompress)
+ return -ENOMEM;
+
+ kcompressd_para = kvmalloc_array(nr_kcompressd, sizeof(struct kcompressd_para), GFP_KERNEL);
+ if (!kcompressd_para)
+ return -ENOMEM;
+
+ ret = init_write_queue();
+ if (ret) {
+ pr_err("Initialization of writing to FIFOs failed!!\n");
+ return ret;
+ }
+
+ for (i = 0; i < nr_kcompressd; i++) {
+ init_waitqueue_head(&kcompress[i].kcompressd_wait);
+ kcompressd_para[i].kcompressd_wait = &kcompress[i].kcompressd_wait;
+ kcompressd_para[i].write_fifo = &kcompress[i].write_fifo;
+ kcompressd_para[i].running = &kcompress[i].running;
+ }
+
+ return 0;
+}
+
+static void stop_all_kcompressd_thread(void)
+{
+ int i;
+
+ for (i = 0; i < nr_kcompressd; i++) {
+ kthread_stop(kcompress[i].kcompressd);
+ kcompress[i].kcompressd = NULL;
+ clean_bio_queue(i);
+ }
+}
+
+static int do_nr_kcompressd_handler(const char *val,
+ const struct kernel_param *kp)
+{
+ int ret;
+
+ atomic_set(&enable_kcompressd, false);
+
+ stop_all_kcompressd_thread();
+
+ ret = param_set_int(val, kp);
+ if (ret) {
+ pr_err("Invalid number of kcompressd.\n");
+ return -EINVAL;
+ }
+
+ ret = init_write_queue();
+ if (ret) {
+ pr_err("Initialization of writing to FIFOs failed!!\n");
+ return ret;
+ }
+
+ atomic_set(&enable_kcompressd, true);
+
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_change_nr_kcompressd = {
+ .set = &do_nr_kcompressd_handler,
+ .get = &param_get_uint,
+ .free = NULL,
+};
+
+module_param_cb(nr_kcompressd, &param_ops_change_nr_kcompressd,
+ &nr_kcompressd, 0644);
+MODULE_PARM_DESC(nr_kcompressd, "Number of pre-created daemon for page compression");
+
+static int do_queue_size_per_kcompressd_handler(const char *val,
+ const struct kernel_param *kp)
+{
+ int ret;
+
+ atomic_set(&enable_kcompressd, false);
+
+ stop_all_kcompressd_thread();
+
+ ret = param_set_int(val, kp);
+ if (ret) {
+ pr_err("Invalid queue size for kcompressd.\n");
+ return -EINVAL;
+ }
+
+ ret = init_write_queue();
+ if (ret) {
+ pr_err("Initialization of writing to FIFOs failed!!\n");
+ return ret;
+ }
+
+ pr_info("Queue size for kcompressd was changed: %d\n", queue_size_per_kcompressd);
+
+ atomic_set(&enable_kcompressd, true);
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_change_queue_size_per_kcompressd = {
+ .set = &do_queue_size_per_kcompressd_handler,
+ .get = ¶m_get_uint,
+ .free = NULL,
+};
+
+module_param_cb(queue_size_per_kcompressd, &param_ops_change_queue_size_per_kcompressd,
+ &queue_size_per_kcompressd, 0644);
+MODULE_PARM_DESC(queue_size_per_kcompressd,
+ "Size of queue for kcompressd");
+
+int schedule_bio_write(void *mem, struct bio *bio, compress_callback cb)
+{
+ int i;
+ bool submit_success = false;
+ size_t sz_work = sizeof(struct write_work);
+
+ struct write_work entry = {
+ .mem = mem,
+ .bio = bio,
+ .cb = cb
+ };
+
+ if (unlikely(!atomic_read(&enable_kcompressd)))
+ return -EBUSY;
+
+ if (!nr_kcompressd || !current_is_kswapd())
+ return -EBUSY;
+
+ bio_get(bio);
+
+ for (i = 0; i < nr_kcompressd; i++) {
+ submit_success =
+ (kfifo_avail(&kcompress[i].write_fifo) >= sz_work) &&
+ (sz_work == kfifo_in(&kcompress[i].write_fifo, &entry, sz_work));
+
+ if (submit_success) {
+ switch (atomic_read(&kcompress[i].running)) {
+ case KCOMPRESSD_NOT_STARTED:
+ atomic_set(&kcompress[i].running, KCOMPRESSD_RUNNING);
+ kcompress[i].kcompressd = kthread_run(kcompressd,
+ &kcompressd_para[i], "kcompressd:%d", i);
+ if (IS_ERR(kcompress[i].kcompressd)) {
+ atomic_set(&kcompress[i].running, KCOMPRESSD_NOT_STARTED);
+ pr_warn("Failed to start kcompressd:%d\n", i);
+ clean_bio_queue(i);
+ }
+ break;
+ case KCOMPRESSD_RUNNING:
+ break;
+ case KCOMPRESSD_SLEEPING:
+ wake_up_interruptible(&kcompress[i].kcompressd_wait);
+ break;
+ }
+ return 0;
+ }
+ }
+
+ bio_put(bio);
+ return -EBUSY;
+}
+EXPORT_SYMBOL(schedule_bio_write);
+
+static int __init kcompressd_init(void)
+{
+ int ret;
+
+ nr_kcompressd = DEFAULT_NR_KCOMPRESSD;
+ queue_size_per_kcompressd = INIT_QUEUE_SIZE;
+
+ ret = kcompress_update();
+ if (ret) {
+ pr_err("Init kcompressd failed!\n");
+ return ret;
+ }
+
+ atomic_set(&enable_kcompressd, true);
+ blocking_notifier_call_chain(&kcompressd_notifier_list, 0, NULL);
+ return 0;
+}
+
+static void __exit kcompressd_exit(void)
+{
+ atomic_set(&enable_kcompressd, false);
+ stop_all_kcompressd_thread();
+
+ kvfree(kcompress);
+ kvfree(kcompressd_para);
+}
+
+module_init(kcompressd_init);
+module_exit(kcompressd_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Qun-Wei Lin <qun-wei.lin@mediatek.com>");
+MODULE_DESCRIPTION("Separate the page compression from the memory reclaiming");
+
diff --git a/drivers/block/zram/kcompressd.h b/drivers/block/zram/kcompressd.h
new file mode 100644
index 000000000000..2fe0b424a7af
--- /dev/null
+++ b/drivers/block/zram/kcompressd.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024 MediaTek Inc.
+ */
+
+#ifndef _KCOMPRESSD_H_
+#define _KCOMPRESSD_H_
+
+#include <linux/rwsem.h>
+#include <linux/kfifo.h>
+#include <linux/atomic.h>
+
+typedef void (*compress_callback)(void *mem, struct bio *bio);
+
+struct kcompress {
+ struct task_struct *kcompressd;
+ wait_queue_head_t kcompressd_wait;
+ struct kfifo write_fifo;
+ atomic_t running;
+};
+
+int kcompressd_enabled(void);
+int schedule_bio_write(void *mem, struct bio *bio, compress_callback cb);
+#endif
+
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 2e1a70f2f4bd..bcd63ecb6ff2 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -35,6 +35,7 @@
#include <linux/part_stat.h>
#include <linux/kernel_read_file.h>
+#include "kcompressd.h"
#include "zram_drv.h"
static DEFINE_IDR(zram_index_idr);
@@ -2240,6 +2241,15 @@ static void zram_bio_write(struct zram *zram, struct bio *bio)
bio_endio(bio);
}
+#if IS_ENABLED(CONFIG_KCOMPRESSD)
+static void zram_bio_write_callback(void *mem, struct bio *bio)
+{
+ struct zram *zram = (struct zram *)mem;
+
+ zram_bio_write(zram, bio);
+}
+#endif
+
/*
* Handler function for all zram I/O requests.
*/
@@ -2252,6 +2262,10 @@ static void zram_submit_bio(struct bio *bio)
zram_bio_read(zram, bio);
break;
case REQ_OP_WRITE:
+#if IS_ENABLED(CONFIG_KCOMPRESSD)
+ if (kcompressd_enabled() && !schedule_bio_write(zram, bio, zram_bio_write_callback))
+ break;
+#endif
zram_bio_write(zram, bio);
break;
case REQ_OP_DISCARD:
@@ -2535,9 +2549,11 @@ static int zram_add(void)
#if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
.max_write_zeroes_sectors = UINT_MAX,
#endif
- .features = BLK_FEAT_STABLE_WRITES |
- BLK_FEAT_READ_SYNCHRONOUS |
- BLK_FEAT_WRITE_SYNCHRONOUS,
+ .features = BLK_FEAT_STABLE_WRITES
+ | BLK_FEAT_READ_SYNCHRONOUS
+#if !IS_ENABLED(CONFIG_KCOMPRESSD)
+ | BLK_FEAT_WRITE_SYNCHRONOUS,
+#endif
};
struct zram *zram;
int ret, device_id;
--
2.45.2
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-07 12:01 ` [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression Qun-Wei Lin
@ 2025-03-07 19:41 ` Barry Song
2025-03-07 23:13 ` Nhat Pham
2025-03-10 13:26 ` Qun-wei Lin (林群崴)
0 siblings, 2 replies; 39+ messages in thread
From: Barry Song @ 2025-03-07 19:41 UTC (permalink / raw)
To: Qun-Wei Lin
Cc: Jens Axboe, Minchan Kim, Sergey Senozhatsky, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg, Al Viro,
linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu
On Sat, Mar 8, 2025 at 1:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
>
> Introduced Kcompressd to offload zram page compression, improving
> system efficiency by handling compression separately from memory
> reclaiming. Added necessary configurations and dependencies.
>
> Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> ---
> drivers/block/zram/Kconfig | 11 ++
> drivers/block/zram/Makefile | 3 +-
> drivers/block/zram/kcompressd.c | 340 ++++++++++++++++++++++++++++++++
> drivers/block/zram/kcompressd.h | 25 +++
> drivers/block/zram/zram_drv.c | 22 ++-
> 5 files changed, 397 insertions(+), 4 deletions(-)
> create mode 100644 drivers/block/zram/kcompressd.c
> create mode 100644 drivers/block/zram/kcompressd.h
>
> diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
> index 402b7b175863..f0a1b574f770 100644
> --- a/drivers/block/zram/Kconfig
> +++ b/drivers/block/zram/Kconfig
> @@ -145,3 +145,14 @@ config ZRAM_MULTI_COMP
> re-compress pages using a potentially slower but more effective
> compression algorithm. Note, that IDLE page recompression
> requires ZRAM_TRACK_ENTRY_ACTIME.
> +
> +config KCOMPRESSD
> + tristate "Kcompressd: Accelerated zram compression"
> + depends on ZRAM
> + help
> + Kcompressd creates multiple daemons to accelerate the compression of pages
> + in zram, offloading this time-consuming task from the zram driver.
> +
> + This approach improves system efficiency by handling page compression separately,
> + which was originally done by kswapd or direct reclaim.
For direct reclaim, we were previously able to compress using multiple CPUs
with multi-threading.
After your patch, it seems that only a single thread/CPU is used for compression
so it won't necessarily improve direct reclaim performance?
Even for kswapd, we used to have multiple threads like [kswapd0], [kswapd1],
and [kswapd2] for different nodes. Now, are we also limited to just one thread?
I also wonder if this could be handled at the vmscan level instead of the zram
level. Then it might potentially help other sync devices or even zswap later.
But I agree that for phones, modifying zram seems like an easier starting
point. However, relying on a single thread isn't always the best approach.
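To sketch the vmscan-level idea above (purely illustrative; the hook
point and the kcompressd_queue_write() helper are invented here), the
reclaim-side write path could try an async queue first and fall back
to the existing synchronous path:

	/* hypothetical page_io-level hook, helper name invented */
	if (current_is_kswapd() &&
	    !kcompressd_queue_write(folio, wbc))
		return;				/* queued for async compression */
	__swap_writepage(folio, wbc);		/* existing synchronous path */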
> +
> diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
> index 0fdefd576691..23baa5dfceb9 100644
> --- a/drivers/block/zram/Makefile
> +++ b/drivers/block/zram/Makefile
> @@ -9,4 +9,5 @@ zram-$(CONFIG_ZRAM_BACKEND_ZSTD) += backend_zstd.o
> zram-$(CONFIG_ZRAM_BACKEND_DEFLATE) += backend_deflate.o
> zram-$(CONFIG_ZRAM_BACKEND_842) += backend_842.o
>
> -obj-$(CONFIG_ZRAM) += zram.o
> +obj-$(CONFIG_ZRAM) += zram.o
> +obj-$(CONFIG_KCOMPRESSD) += kcompressd.o
> diff --git a/drivers/block/zram/kcompressd.c b/drivers/block/zram/kcompressd.c
> new file mode 100644
> index 000000000000..195b7e386869
> --- /dev/null
> +++ b/drivers/block/zram/kcompressd.c
> @@ -0,0 +1,340 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2024 MediaTek Inc.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/bio.h>
> +#include <linux/bitops.h>
> +#include <linux/freezer.h>
> +#include <linux/kernel.h>
> +#include <linux/psi.h>
> +#include <linux/kfifo.h>
> +#include <linux/swap.h>
> +#include <linux/delay.h>
> +
> +#include "kcompressd.h"
> +
> +#define INIT_QUEUE_SIZE 4096
> +#define DEFAULT_NR_KCOMPRESSD 4
> +
> +static atomic_t enable_kcompressd;
> +static unsigned int nr_kcompressd;
> +static unsigned int queue_size_per_kcompressd;
> +static struct kcompress *kcompress;
> +
> +enum run_state {
> + KCOMPRESSD_NOT_STARTED = 0,
> + KCOMPRESSD_RUNNING,
> + KCOMPRESSD_SLEEPING,
> +};
> +
> +struct kcompressd_para {
> + wait_queue_head_t *kcompressd_wait;
> + struct kfifo *write_fifo;
> + atomic_t *running;
> +};
> +
> +static struct kcompressd_para *kcompressd_para;
> +static BLOCKING_NOTIFIER_HEAD(kcompressd_notifier_list);
> +
> +struct write_work {
> + void *mem;
> + struct bio *bio;
> + compress_callback cb;
> +};
> +
> +int kcompressd_enabled(void)
> +{
> + return likely(atomic_read(&enable_kcompressd));
> +}
> +EXPORT_SYMBOL(kcompressd_enabled);
> +
> +static void kcompressd_try_to_sleep(struct kcompressd_para *p)
> +{
> + DEFINE_WAIT(wait);
> +
> + if (!kfifo_is_empty(p->write_fifo))
> + return;
> +
> + if (freezing(current) || kthread_should_stop())
> + return;
> +
> + atomic_set(p->running, KCOMPRESSD_SLEEPING);
> + prepare_to_wait(p->kcompressd_wait, &wait, TASK_INTERRUPTIBLE);
> +
> + /*
> + * After a short sleep, check if it was a premature sleep. If not, then
> + * go fully to sleep until explicitly woken up.
> + */
> + if (!kthread_should_stop() && kfifo_is_empty(p->write_fifo))
> + schedule();
> +
> + finish_wait(p->kcompressd_wait, &wait);
> + atomic_set(p->running, KCOMPRESSD_RUNNING);
> +}
> +
> +static int kcompressd(void *para)
> +{
> + struct task_struct *tsk = current;
> + struct kcompressd_para *p = (struct kcompressd_para *)para;
> +
> + tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
> + set_freezable();
> +
> + while (!kthread_should_stop()) {
> + bool ret;
> +
> + kcompressd_try_to_sleep(p);
> + ret = try_to_freeze();
> + if (kthread_should_stop())
> + break;
> +
> + if (ret)
> + continue;
> +
> + while (!kfifo_is_empty(p->write_fifo)) {
> + struct write_work entry;
> +
> + if (sizeof(struct write_work) == kfifo_out(p->write_fifo,
> + &entry, sizeof(struct write_work))) {
> + entry.cb(entry.mem, entry.bio);
> + bio_put(entry.bio);
> + }
> + }
> +
> + }
> +
> + tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
> + atomic_set(p->running, KCOMPRESSD_NOT_STARTED);
> + return 0;
> +}
> +
> +static int init_write_queue(void)
> +{
> + int i;
> + unsigned int queue_len = queue_size_per_kcompressd * sizeof(struct write_work);
> +
> + for (i = 0; i < nr_kcompressd; i++) {
> + if (kfifo_alloc(&kcompress[i].write_fifo,
> + queue_len, GFP_KERNEL)) {
> + pr_err("Failed to alloc kfifo %d\n", i);
> + return -ENOMEM;
> + }
> + }
> + return 0;
> +}
> +
> +static void clean_bio_queue(int idx)
> +{
> + struct write_work entry;
> +
> + while (sizeof(struct write_work) == kfifo_out(&kcompress[idx].write_fifo,
> + &entry, sizeof(struct write_work))) {
> + bio_put(entry.bio);
> + entry.cb(entry.mem, entry.bio);
> + }
> + kfifo_free(&kcompress[idx].write_fifo);
> +}
> +
> +static int kcompress_update(void)
> +{
> + int i;
> + int ret;
> +
> + kcompress = kvmalloc_array(nr_kcompressd, sizeof(struct kcompress), GFP_KERNEL);
> + if (!kcompress)
> + return -ENOMEM;
> +
> + kcompressd_para = kvmalloc_array(nr_kcompressd, sizeof(struct kcompressd_para), GFP_KERNEL);
> + if (!kcompressd_para)
> + return -ENOMEM;
> +
> + ret = init_write_queue();
> + if (ret) {
> + pr_err("Initialization of writing to FIFOs failed!!\n");
> + return ret;
> + }
> +
> + for (i = 0; i < nr_kcompressd; i++) {
> + init_waitqueue_head(&kcompress[i].kcompressd_wait);
> + kcompressd_para[i].kcompressd_wait = &kcompress[i].kcompressd_wait;
> + kcompressd_para[i].write_fifo = &kcompress[i].write_fifo;
> + kcompressd_para[i].running = &kcompress[i].running;
> + }
> +
> + return 0;
> +}
> +
> +static void stop_all_kcompressd_thread(void)
> +{
> + int i;
> +
> + for (i = 0; i < nr_kcompressd; i++) {
> + kthread_stop(kcompress[i].kcompressd);
> + kcompress[i].kcompressd = NULL;
> + clean_bio_queue(i);
> + }
> +}
> +
> +static int do_nr_kcompressd_handler(const char *val,
> + const struct kernel_param *kp)
> +{
> + int ret;
> +
> + atomic_set(&enable_kcompressd, false);
> +
> + stop_all_kcompressd_thread();
> +
> + ret = param_set_int(val, kp);
> + if (ret) {
> + pr_err("Invalid number of kcompressd.\n");
> + return -EINVAL;
> + }
> +
> + ret = init_write_queue();
> + if (ret) {
> + pr_err("Initialization of writing to FIFOs failed!!\n");
> + return ret;
> + }
> +
> + atomic_set(&enable_kcompressd, true);
> +
> + return 0;
> +}
> +
> +static const struct kernel_param_ops param_ops_change_nr_kcompressd = {
> + .set = &do_nr_kcompressd_handler,
> + .get = &param_get_uint,
> + .free = NULL,
> +};
> +
> +module_param_cb(nr_kcompressd, &param_ops_change_nr_kcompressd,
> + &nr_kcompressd, 0644);
> +MODULE_PARM_DESC(nr_kcompressd, "Number of pre-created daemon for page compression");
> +
> +static int do_queue_size_per_kcompressd_handler(const char *val,
> + const struct kernel_param *kp)
> +{
> + int ret;
> +
> + atomic_set(&enable_kcompressd, false);
> +
> + stop_all_kcompressd_thread();
> +
> + ret = param_set_int(val, kp);
> + if (ret) {
> + pr_err("Invalid queue size for kcompressd.\n");
> + return -EINVAL;
> + }
> +
> + ret = init_write_queue();
> + if (ret) {
> + pr_err("Initialization of writing to FIFOs failed!!\n");
> + return ret;
> + }
> +
> + pr_info("Queue size for kcompressd was changed: %d\n", queue_size_per_kcompressd);
> +
> + atomic_set(&enable_kcompressd, true);
> + return 0;
> +}
> +
> +static const struct kernel_param_ops param_ops_change_queue_size_per_kcompressd = {
> + .set = &do_queue_size_per_kcompressd_handler,
> + .get = ¶m_get_uint,
> + .free = NULL,
> +};
> +
> +module_param_cb(queue_size_per_kcompressd, &param_ops_change_queue_size_per_kcompressd,
> + &queue_size_per_kcompressd, 0644);
> +MODULE_PARM_DESC(queue_size_per_kcompressd,
> + "Size of queue for kcompressd");
> +
> +int schedule_bio_write(void *mem, struct bio *bio, compress_callback cb)
> +{
> + int i;
> + bool submit_success = false;
> + size_t sz_work = sizeof(struct write_work);
> +
> + struct write_work entry = {
> + .mem = mem,
> + .bio = bio,
> + .cb = cb
> + };
> +
> + if (unlikely(!atomic_read(&enable_kcompressd)))
> + return -EBUSY;
> +
> + if (!nr_kcompressd || !current_is_kswapd())
> + return -EBUSY;
> +
> + bio_get(bio);
> +
> + for (i = 0; i < nr_kcompressd; i++) {
> + submit_success =
> + (kfifo_avail(&kcompress[i].write_fifo) >= sz_work) &&
> + (sz_work == kfifo_in(&kcompress[i].write_fifo, &entry, sz_work));
> +
> + if (submit_success) {
> + switch (atomic_read(&kcompress[i].running)) {
> + case KCOMPRESSD_NOT_STARTED:
> + atomic_set(&kcompress[i].running, KCOMPRESSD_RUNNING);
> + kcompress[i].kcompressd = kthread_run(kcompressd,
> + &kcompressd_para[i], "kcompressd:%d", i);
> + if (IS_ERR(kcompress[i].kcompressd)) {
> + atomic_set(&kcompress[i].running, KCOMPRESSD_NOT_STARTED);
> + pr_warn("Failed to start kcompressd:%d\n", i);
> + clean_bio_queue(i);
> + }
> + break;
> + case KCOMPRESSD_RUNNING:
> + break;
> + case KCOMPRESSD_SLEEPING:
> + wake_up_interruptible(&kcompress[i].kcompressd_wait);
> + break;
> + }
> + return 0;
> + }
> + }
> +
> + bio_put(bio);
> + return -EBUSY;
> +}
> +EXPORT_SYMBOL(schedule_bio_write);
> +
> +static int __init kcompressd_init(void)
> +{
> + int ret;
> +
> + nr_kcompressd = DEFAULT_NR_KCOMPRESSD;
> + queue_size_per_kcompressd = INIT_QUEUE_SIZE;
> +
> + ret = kcompress_update();
> + if (ret) {
> + pr_err("Init kcompressd failed!\n");
> + return ret;
> + }
> +
> + atomic_set(&enable_kcompressd, true);
> + blocking_notifier_call_chain(&kcompressd_notifier_list, 0, NULL);
> + return 0;
> +}
> +
> +static void __exit kcompressd_exit(void)
> +{
> + atomic_set(&enable_kcompressd, false);
> + stop_all_kcompressd_thread();
> +
> + kvfree(kcompress);
> + kvfree(kcompressd_para);
> +}
> +
> +module_init(kcompressd_init);
> +module_exit(kcompressd_exit);
> +
> +MODULE_LICENSE("Dual BSD/GPL");
> +MODULE_AUTHOR("Qun-Wei Lin <qun-wei.lin@mediatek.com>");
> +MODULE_DESCRIPTION("Separate the page compression from the memory reclaiming");
> +
> diff --git a/drivers/block/zram/kcompressd.h b/drivers/block/zram/kcompressd.h
> new file mode 100644
> index 000000000000..2fe0b424a7af
> --- /dev/null
> +++ b/drivers/block/zram/kcompressd.h
> @@ -0,0 +1,25 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2024 MediaTek Inc.
> + */
> +
> +#ifndef _KCOMPRESSD_H_
> +#define _KCOMPRESSD_H_
> +
> +#include <linux/rwsem.h>
> +#include <linux/kfifo.h>
> +#include <linux/atomic.h>
> +
> +typedef void (*compress_callback)(void *mem, struct bio *bio);
> +
> +struct kcompress {
> + struct task_struct *kcompressd;
> + wait_queue_head_t kcompressd_wait;
> + struct kfifo write_fifo;
> + atomic_t running;
> +};
> +
> +int kcompressd_enabled(void);
> +int schedule_bio_write(void *mem, struct bio *bio, compress_callback cb);
> +#endif
> +
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 2e1a70f2f4bd..bcd63ecb6ff2 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -35,6 +35,7 @@
> #include <linux/part_stat.h>
> #include <linux/kernel_read_file.h>
>
> +#include "kcompressd.h"
> #include "zram_drv.h"
>
> static DEFINE_IDR(zram_index_idr);
> @@ -2240,6 +2241,15 @@ static void zram_bio_write(struct zram *zram, struct bio *bio)
> bio_endio(bio);
> }
>
> +#if IS_ENABLED(CONFIG_KCOMPRESSD)
> +static void zram_bio_write_callback(void *mem, struct bio *bio)
> +{
> + struct zram *zram = (struct zram *)mem;
> +
> + zram_bio_write(zram, bio);
> +}
> +#endif
> +
> /*
> * Handler function for all zram I/O requests.
> */
> @@ -2252,6 +2262,10 @@ static void zram_submit_bio(struct bio *bio)
> zram_bio_read(zram, bio);
> break;
> case REQ_OP_WRITE:
> +#if IS_ENABLED(CONFIG_KCOMPRESSD)
> + if (kcompressd_enabled() && !schedule_bio_write(zram, bio, zram_bio_write_callback))
> + break;
> +#endif
> zram_bio_write(zram, bio);
> break;
> case REQ_OP_DISCARD:
> @@ -2535,9 +2549,11 @@ static int zram_add(void)
> #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
> .max_write_zeroes_sectors = UINT_MAX,
> #endif
> - .features = BLK_FEAT_STABLE_WRITES |
> - BLK_FEAT_READ_SYNCHRONOUS |
> - BLK_FEAT_WRITE_SYNCHRONOUS,
> + .features = BLK_FEAT_STABLE_WRITES
> + | BLK_FEAT_READ_SYNCHRONOUS
> +#if !IS_ENABLED(CONFIG_KCOMPRESSD)
> + | BLK_FEAT_WRITE_SYNCHRONOUS,
> +#endif
> };
> struct zram *zram;
> int ret, device_id;
> --
> 2.45.2
>
Thanks
Barry
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-07 19:41 ` Barry Song
@ 2025-03-07 23:13 ` Nhat Pham
2025-03-07 23:14 ` Nhat Pham
` (2 more replies)
2025-03-10 13:26 ` Qun-wei Lin (林群崴)
1 sibling, 3 replies; 39+ messages in thread
From: Nhat Pham @ 2025-03-07 23:13 UTC (permalink / raw)
To: Barry Song
Cc: Qun-Wei Lin, Jens Axboe, Minchan Kim, Sergey Senozhatsky,
Vishal Verma, Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg, Al Viro,
linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu
On Fri, Mar 7, 2025 at 11:41 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Sat, Mar 8, 2025 at 1:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
> >
> > Introduced Kcompressd to offload zram page compression, improving
> > system efficiency by handling compression separately from memory
> > reclaiming. Added necessary configurations and dependencies.
> >
> > Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> > ---
> > drivers/block/zram/Kconfig | 11 ++
> > drivers/block/zram/Makefile | 3 +-
> > drivers/block/zram/kcompressd.c | 340 ++++++++++++++++++++++++++++++++
> > drivers/block/zram/kcompressd.h | 25 +++
> > drivers/block/zram/zram_drv.c | 22 ++-
> > 5 files changed, 397 insertions(+), 4 deletions(-)
> > create mode 100644 drivers/block/zram/kcompressd.c
> > create mode 100644 drivers/block/zram/kcompressd.h
> >
> > diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
> > index 402b7b175863..f0a1b574f770 100644
> > --- a/drivers/block/zram/Kconfig
> > +++ b/drivers/block/zram/Kconfig
> > @@ -145,3 +145,14 @@ config ZRAM_MULTI_COMP
> > re-compress pages using a potentially slower but more effective
> > compression algorithm. Note, that IDLE page recompression
> > requires ZRAM_TRACK_ENTRY_ACTIME.
> > +
> > +config KCOMPRESSD
> > + tristate "Kcompressd: Accelerated zram compression"
> > + depends on ZRAM
> > + help
> > + Kcompressd creates multiple daemons to accelerate the compression of pages
> > + in zram, offloading this time-consuming task from the zram driver.
> > +
> > + This approach improves system efficiency by handling page compression separately,
> > + which was originally done by kswapd or direct reclaim.
>
> For direct reclaim, we were previously able to compress using multiple CPUs
> with multi-threading.
> After your patch, it seems that only a single thread/CPU is used for compression
> so it won't necessarily improve direct reclaim performance?
>
> Even for kswapd, we used to have multiple threads like [kswapd0], [kswapd1],
> and [kswapd2] for different nodes. Now, are we also limited to just one thread?
> I also wonder if this could be handled at the vmscan level instead of the zram
> level. then it might potentially help other sync devices or even zswap later.
Agree. A shared solution would be much appreciated. We can keep the
kcompressd idea, but have it accept IO work from multiple sources
(zram, zswap, whatever) through a shared API.
Otherwise we would need to reinvent the wheel multiple times :)
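For instance (names invented purely for illustration; no such API
exists today), the entry point could take an opaque context and a
callback so that any compressing backend can feed it work:

	/* hypothetical shared submission API */
	typedef void (*kcompress_fn)(void *ctx);
	int kcompressd_queue_work(void *ctx, kcompress_fn fn);

zram would pack its bio into ctx; zswap could pass whatever its store
path needs.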
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-07 23:13 ` Nhat Pham
@ 2025-03-07 23:14 ` Nhat Pham
2025-03-10 13:26 ` Qun-wei Lin (林群崴)
[not found] ` <20250309010541.3152-1-hdanton@sina.com>
2025-03-11 5:02 ` Sergey Senozhatsky
2 siblings, 1 reply; 39+ messages in thread
From: Nhat Pham @ 2025-03-07 23:14 UTC (permalink / raw)
To: Barry Song
Cc: Qun-Wei Lin, Jens Axboe, Minchan Kim, Sergey Senozhatsky,
Vishal Verma, Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Kairui Song, Dan Schatzberg, Al Viro, linux-kernel,
linux-block, nvdimm, linux-mm, linux-arm-kernel, linux-mediatek,
Casper Li, Chinwen Chang, Andrew Yang, James Hsu
On Fri, Mar 7, 2025 at 3:13 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
>
> Agree. A shared solution would be much appreciated. We can keep the
> kcompressd idea, but have it accept IO work from multiple sources
> (zram, zswap, whatever) through a shared API.
by IO I meant compress work (should be double quoted "IO"). But you
get the idea :)
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-07 23:14 ` Nhat Pham
@ 2025-03-10 13:26 ` Qun-wei Lin (林群崴)
0 siblings, 0 replies; 39+ messages in thread
From: Qun-wei Lin (林群崴) @ 2025-03-10 13:26 UTC (permalink / raw)
To: 21cnbao, nphamcs
Cc: Chinwen Chang (張錦文),
Andrew Yang (楊智強),
chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, vishal.l.verma, dave.jiang, schatzberg.dan,
viro, ryan.roberts, minchan, axboe, linux-block, kasong, nvdimm,
linux-arm-kernel, matthias.bgg,
Casper Li (李中榮),
senozhatsky, dan.j.williams
On Fri, 2025-03-07 at 15:14 -0800, Nhat Pham wrote:
>
> On Fri, Mar 7, 2025 at 3:13 PM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> >
> > Agree. A shared solution would be much appreciated. We can keep the
> > kcompressd idea, but have it accept IO work from multiple sources
> > (zram, zswap, whatever) through a shared API.
>
> by IO I meant compress work (should be double quoted "IO"). But you
> get the idea :)
I also agree that this can evolve into a more general solution.
That is also why we designed kcompressd to do writeback via callbacks
rather than hard-coding it directly into zram.
Currently we only extend zram, though. If anyone has good suggestions,
we would greatly appreciate the help.
Thanks!
Qun-wei
[parent not found: <20250309010541.3152-1-hdanton@sina.com>]
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
[not found] ` <20250309010541.3152-1-hdanton@sina.com>
@ 2025-03-09 19:56 ` Nhat Pham
2025-03-09 20:44 ` Barry Song
0 siblings, 1 reply; 39+ messages in thread
From: Nhat Pham @ 2025-03-09 19:56 UTC (permalink / raw)
To: Hillf Danton
Cc: Barry Song, Qun-Wei Lin, Sergey Senozhatsky, linux-kernel, linux-mm
On Sat, Mar 8, 2025 at 5:05 PM Hillf Danton <hdanton@sina.com> wrote:
>
> Could you explain what nr_kcompressd means, Qun-Wei, to quiesce barking lads?
Who's the "barking lads" you are referring to? Please mind your language.
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-09 19:56 ` Nhat Pham
@ 2025-03-09 20:44 ` Barry Song
2025-03-09 22:20 ` Nhat Pham
[not found] ` <20250310103427.3216-1-hdanton@sina.com>
0 siblings, 2 replies; 39+ messages in thread
From: Barry Song @ 2025-03-09 20:44 UTC (permalink / raw)
To: Nhat Pham
Cc: Hillf Danton, Qun-Wei Lin, Sergey Senozhatsky, linux-kernel, linux-mm
On Mon, Mar 10, 2025 at 8:56 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Sat, Mar 8, 2025 at 5:05 PM Hillf Danton <hdanton@sina.com> wrote:
> >
> > Could you explain what nr_kcompressd means, Qun-Wei, to quiesce barking lads?
>
> Who's the "barking lads" you are referring to? Please mind your language.
I also feel extremely uncomfortable. In Eastern culture, this is an extremely
vulgar word, more offensive than any others.
I strongly feel that this violates the mutual respect within the Linux
community. This is a serious case of verbal abuse.
Regardless of the existence of nr_kcompressd, it is still unacceptable to
invent an interface that requires users to figure out how to set it up, while
kswapd can launch threads based on NUMA nodes.
This should be transparent to users, just as kswapd does.
void __meminit kswapd_run(int nid)
{
...
if (!pgdat->kswapd) {
pgdat->kswapd = kthread_create_on_node(kswapd, pgdat,
nid, "kswapd%d", nid);
...
}
pgdat_kswapd_unlock(pgdat);
}
On the other hand, no one will know how to set up the proper number of
threads, while direct reclaim can utilize each CPU.
Therefore, listing nr_kcompressd does not change the essence of my
question.
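Something with the same shape as kswapd_run() above would do (a sketch
only; kcompressd_run() and the pgdat->kcompressd field are invented
here to show the direction, not a concrete interface):

	/* hypothetical: one kcompressd per NUMA node, no user knob */
	void __meminit kcompressd_run(int nid)
	{
		pg_data_t *pgdat = NODE_DATA(nid);

		if (!pgdat->kcompressd)
			pgdat->kcompressd = kthread_create_on_node(kcompressd,
					pgdat, nid, "kcompressd%d", nid);
	}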
Thanks
Barry
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-09 20:44 ` Barry Song
@ 2025-03-09 22:20 ` Nhat Pham
2025-03-10 13:23 ` Qun-wei Lin (林群崴)
[not found] ` <20250310103427.3216-1-hdanton@sina.com>
1 sibling, 1 reply; 39+ messages in thread
From: Nhat Pham @ 2025-03-09 22:20 UTC (permalink / raw)
To: Barry Song
Cc: Hillf Danton, Qun-Wei Lin, Sergey Senozhatsky, linux-kernel, linux-mm
On Sun, Mar 9, 2025 at 1:44 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Mon, Mar 10, 2025 at 8:56 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Sat, Mar 8, 2025 at 5:05 PM Hillf Danton <hdanton@sina.com> wrote:
> > >
> > > Could you explain what nr_kcompressd means, Qun-Wei, to quiesce barking lads?
> >
> > Who's the "barking lads" you are referring to? Please mind your language.
>
> I also feel extremely uncomfortable. In Eastern culture, this is an extremely
> vulgar word, more offensive than any others.
>
> I strongly feel that this violates the mutual respect within the Linux
> community. This is a serious case of verbal abuse.
>
> Regardless of the existence of nr_kcompressd, it is still unacceptable to
> invent an interface that requires users to figure out how to set it up, while
> kswapd can launch threads based on NUMA nodes.
> This should be transparent to users, just as kswapd does.
>
> void __meminit kswapd_run(int nid)
>
> {
> ...
> if (!pgdat->kswapd) {
> pgdat->kswapd = kthread_create_on_node(kswapd, pgdat,
> nid, "kswapd%d", nid);
> ...
> }
> pgdat_kswapd_unlock(pgdat);
> }
>
> On the other hand, no one will know how to set up the proper number of
> threads, while direct reclaim can utilize each CPU.
Agree - how are users supposed to set this? The default puzzles me
too. Why 4? Does it work across architectures? Across workloads?
This makes no sense to me. Can we scale the number of threads in
proportion to the number of CPUs? Per-cpu kcompressd?
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-09 22:20 ` Nhat Pham
@ 2025-03-10 13:23 ` Qun-wei Lin (林群崴)
0 siblings, 0 replies; 39+ messages in thread
From: Qun-wei Lin (林群崴) @ 2025-03-10 13:23 UTC (permalink / raw)
To: 21cnbao, nphamcs; +Cc: hdanton, senozhatsky, linux-kernel, linux-mm
On Sun, 2025-03-09 at 15:20 -0700, Nhat Pham wrote:
>
> On Sun, Mar 9, 2025 at 1:44 PM Barry Song <21cnbao@gmail.com> wrote:
> >
> > On Mon, Mar 10, 2025 at 8:56 AM Nhat Pham <nphamcs@gmail.com>
> > wrote:
> > >
> > > On Sat, Mar 8, 2025 at 5:05 PM Hillf Danton <hdanton@sina.com>
> > > wrote:
> > > >
> > > > Could you explain what nr_kcompressd means, Qun-Wei, to quiesce
> > > > barking lads?
> > >
> > > Who's the "barking lads" you are referring to? Please mind your
> > > language.
> >
> > I also feel extremely uncomfortable. In Eastern culture, this is an
> > extremely
> > vulgar word, more offensive than any others.
> >
> > I strongly feel that this violates the mutual respect within the
> > Linux
> > community. This is a serious case of verbal abuse.
> >
> > Regardless of the existence of nr_kcompressd, it is still
> > unacceptable to
> > invent an interface that requires users to figure out how to set it
> > up, while
> > kswapd can launch threads based on NUMA nodes.
> > This should be transparent to users, just as kswapd does.
> >
> > void __meminit kswapd_run(int nid)
> >
> > {
> > ...
> > if (!pgdat->kswapd) {
> > pgdat->kswapd = kthread_create_on_node(kswapd,
> > pgdat,
> > nid, "kswapd%d", nid);
> > ...
> > }
> > pgdat_kswapd_unlock(pgdat);
> > }
> >
> > On the other hand, no one will know how to set up the proper number
> > of
> > threads, while direct reclaim can utilize each CPU.
>
> Agree - how are users supposed to set this? The default puzzles me
> too. Why 4? Does it work across architectures? Across workloads?
>
> This makes no sense to me. Can we scale the number of threads in
> proportion to the number of CPUs? Per-cpu kcompressd?
The default value is actually designed to be the maximum number of
kcompressd that can run simultaneously. The current design is to create
the next kcompressd thread when all kfifos are full, and the number 4
is just a temporary setting.
You are right, changing it to match the number of CPUs might be better
to avoid confusion on how to set it up.
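For example (a sketch of that direction, against the current module
init path), the default could track the machine instead of a magic
number:

	/* in kcompressd_init(): default to one worker per online CPU */
	nr_kcompressd = num_online_cpus();

with queue_size_per_kcompressd left as a tunable or scaled the same
way.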
Best Regards,
Qun-wei
[parent not found: <20250310103427.3216-1-hdanton@sina.com>]
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
[not found] ` <20250310103427.3216-1-hdanton@sina.com>
@ 2025-03-10 17:44 ` Barry Song
[not found] ` <20250310230902.3282-1-hdanton@sina.com>
0 siblings, 1 reply; 39+ messages in thread
From: Barry Song @ 2025-03-10 17:44 UTC (permalink / raw)
To: Hillf Danton
Cc: Nhat Pham, Qun-Wei Lin, Sergey Senozhatsky, linux-kernel, linux-mm
On Mon, Mar 10, 2025 at 6:34 PM Hillf Danton <hdanton@sina.com> wrote:
>
> On Mon, 10 Mar 2025 09:44:24 +1300 Barry Song <21cnbao@gmail.com>
> > I also feel extremely uncomfortable. In Eastern culture, this is an extremely
> > vulgar word, more offensive than any others.
> >
> If culture is not abused, feel free to show us how it is defined to be
> more offensive than any others in Eastern culture.
>
Having no manners is not your fault. The ignorant fear nothing.
> > I strongly feel that this violates the mutual respect within the Linux
> > community. This is a serious case of verbal abuse.
> >
> You will feel better I think after proving that your patch/comment is
> able to escape standing ovation, given respect is not free, lad.
It won't happen. A snake won't stop biting just because humans have
fed it.
I believe replying to you will only fuel your excitement, so this will be
the last email I ever respond to. Kindly do not reply.
Thanks
Barry
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-07 23:13 ` Nhat Pham
2025-03-07 23:14 ` Nhat Pham
[not found] ` <20250309010541.3152-1-hdanton@sina.com>
@ 2025-03-11 5:02 ` Sergey Senozhatsky
2 siblings, 0 replies; 39+ messages in thread
From: Sergey Senozhatsky @ 2025-03-11 5:02 UTC (permalink / raw)
To: Nhat Pham
Cc: Barry Song, Qun-Wei Lin, Jens Axboe, Minchan Kim,
Sergey Senozhatsky, Vishal Verma, Dan Williams, Dave Jiang,
Ira Weiny, Andrew Morton, Matthias Brugger,
AngeloGioacchino Del Regno, Chris Li, Ryan Roberts, Huang, Ying,
Kairui Song, Dan Schatzberg, Al Viro, linux-kernel, linux-block,
nvdimm, linux-mm, linux-arm-kernel, linux-mediatek, Casper Li,
Chinwen Chang, Andrew Yang, James Hsu
On (25/03/07 15:13), Nhat Pham wrote:
> > > +config KCOMPRESSD
> > > + tristate "Kcompressd: Accelerated zram compression"
> > > + depends on ZRAM
> > > + help
> > > + Kcompressd creates multiple daemons to accelerate the compression of pages
> > > + in zram, offloading this time-consuming task from the zram driver.
> > > +
> > > + This approach improves system efficiency by handling page compression separately,
> > > + which was originally done by kswapd or direct reclaim.
> >
> > For direct reclaim, we were previously able to compress using multiple CPUs
> > with multi-threading.
> > After your patch, it seems that only a single thread/CPU is used for compression
> > so it won't necessarily improve direct reclaim performance?
> >
> > Even for kswapd, we used to have multiple threads like [kswapd0], [kswapd1],
> > and [kswapd2] for different nodes. Now, are we also limited to just one thread?
> > I also wonder if this could be handled at the vmscan level instead of the zram
> > level. then it might potentially help other sync devices or even zswap later.
>
> Agree. A shared solution would be much appreciated. We can keep the
> kcompressd idea, but have it accept IO work from multiple sources
> (zram, zswap, whatever) through a shared API.
I guess it also needs to take swapoff into consideration (especially
if it takes I/O from multiple sources)?
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-07 19:41 ` Barry Song
2025-03-07 23:13 ` Nhat Pham
@ 2025-03-10 13:26 ` Qun-wei Lin (林群崴)
2025-03-11 7:05 ` Barry Song
1 sibling, 1 reply; 39+ messages in thread
From: Qun-wei Lin (林群崴) @ 2025-03-10 13:26 UTC (permalink / raw)
To: 21cnbao
Cc: Andrew Yang (楊智強),
Casper Li (李中榮),
chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, dave.jiang, schatzberg.dan,
Chinwen Chang (張錦文),
viro, ryan.roberts, minchan, axboe, linux-block, kasong, nvdimm,
vishal.l.verma, matthias.bgg, linux-arm-kernel, ying.huang,
senozhatsky, dan.j.williams
On Sat, 2025-03-08 at 08:41 +1300, Barry Song wrote:
>
> On Sat, Mar 8, 2025 at 1:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com>
> wrote:
> >
> > Introduced Kcompressd to offload zram page compression, improving
> > system efficiency by handling compression separately from memory
> > reclaiming. Added necessary configurations and dependencies.
> >
> > Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> > ---
> > drivers/block/zram/Kconfig | 11 ++
> > drivers/block/zram/Makefile | 3 +-
> > drivers/block/zram/kcompressd.c | 340
> > ++++++++++++++++++++++++++++++++
> > drivers/block/zram/kcompressd.h | 25 +++
> > drivers/block/zram/zram_drv.c | 22 ++-
> > 5 files changed, 397 insertions(+), 4 deletions(-)
> > create mode 100644 drivers/block/zram/kcompressd.c
> > create mode 100644 drivers/block/zram/kcompressd.h
> >
> > diff --git a/drivers/block/zram/Kconfig
> > b/drivers/block/zram/Kconfig
> > index 402b7b175863..f0a1b574f770 100644
> > --- a/drivers/block/zram/Kconfig
> > +++ b/drivers/block/zram/Kconfig
> > @@ -145,3 +145,14 @@ config ZRAM_MULTI_COMP
> > re-compress pages using a potentially slower but more
> > effective
> > compression algorithm. Note, that IDLE page recompression
> > requires ZRAM_TRACK_ENTRY_ACTIME.
> > +
> > +config KCOMPRESSD
> > + tristate "Kcompressd: Accelerated zram compression"
> > + depends on ZRAM
> > + help
> > + Kcompressd creates multiple daemons to accelerate the
> > compression of pages
> > + in zram, offloading this time-consuming task from the
> > zram driver.
> > +
> > + This approach improves system efficiency by handling page
> > compression separately,
> > + which was originally done by kswapd or direct reclaim.
>
> For direct reclaim, we were previously able to compress using
> multiple CPUs
> with multi-threading.
> After your patch, it seems that only a single thread/CPU is used for
> compression
> so it won't necessarily improve direct reclaim performance?
>
Our patch only splits the context of kswapd. When direct reclaim
occurs, kcompressd is bypassed, so direct reclaim remains unchanged:
each thread that needs memory reclaims it directly.
> Even for kswapd, we used to have multiple threads like [kswapd0],
> [kswapd1],
> and [kswapd2] for different nodes. Now, are we also limited to just
> one thread?
We only considered a single kswapd here and didn't account for multiple
instances. Since I use kfifo to collect the bios, if there are multiple
kswapds, we need to add a lock to protect the bio queue. I can revise
this in the 2nd version, or do you have any other suggested approaches?
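One option I am considering is to protect each FIFO with a spinlock
and switch to the _spinlocked kfifo helpers. An untested sketch (the
struct fields follow this patch; the consumer side would use
kfifo_out_spinlocked() the same way):

#include <linux/kfifo.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct kcompress {
	struct task_struct *kcompressd;
	wait_queue_head_t kcompressd_wait;
	struct kfifo write_fifo;
	spinlock_t fifo_lock;		/* protects write_fifo */
	atomic_t running;
};

static bool kcompress_queue_work(struct kcompress *kc,
				 const struct write_work *entry)
{
	/* kfifo_in_spinlocked() takes fifo_lock around the copy-in. */
	return kfifo_in_spinlocked(&kc->write_fifo, entry, sizeof(*entry),
				   &kc->fifo_lock) == sizeof(*entry);
}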
> I also wonder if this could be handled at the vmscan level instead of
> the zram
> level. then it might potentially help other sync devices or even
> zswap later.
>
> But I agree that for phones, modifying zram seems like an easier
> starting
> point. However, relying on a single thread isn't always the best
> approach.
>
>
> > +
> > diff --git a/drivers/block/zram/Makefile
> > b/drivers/block/zram/Makefile
> > index 0fdefd576691..23baa5dfceb9 100644
> > --- a/drivers/block/zram/Makefile
> > +++ b/drivers/block/zram/Makefile
> > @@ -9,4 +9,5 @@ zram-$(CONFIG_ZRAM_BACKEND_ZSTD) +=
> > backend_zstd.o
> > zram-$(CONFIG_ZRAM_BACKEND_DEFLATE) += backend_deflate.o
> > zram-$(CONFIG_ZRAM_BACKEND_842) += backend_842.o
> >
> > -obj-$(CONFIG_ZRAM) += zram.o
> > +obj-$(CONFIG_ZRAM) += zram.o
> > +obj-$(CONFIG_KCOMPRESSD) += kcompressd.o
> > diff --git a/drivers/block/zram/kcompressd.c
> > b/drivers/block/zram/kcompressd.c
> > new file mode 100644
> > index 000000000000..195b7e386869
> > --- /dev/null
> > +++ b/drivers/block/zram/kcompressd.c
> > @@ -0,0 +1,340 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Copyright (C) 2024 MediaTek Inc.
> > + */
> > +
> > +#include <linux/module.h>
> > +#include <linux/kernel.h>
> > +#include <linux/bio.h>
> > +#include <linux/bitops.h>
> > +#include <linux/freezer.h>
> > +#include <linux/kernel.h>
> > +#include <linux/psi.h>
> > +#include <linux/kfifo.h>
> > +#include <linux/swap.h>
> > +#include <linux/delay.h>
> > +
> > +#include "kcompressd.h"
> > +
> > +#define INIT_QUEUE_SIZE 4096
> > +#define DEFAULT_NR_KCOMPRESSD 4
> > +
> > +static atomic_t enable_kcompressd;
> > +static unsigned int nr_kcompressd;
> > +static unsigned int queue_size_per_kcompressd;
> > +static struct kcompress *kcompress;
> > +
> > +enum run_state {
> > + KCOMPRESSD_NOT_STARTED = 0,
> > + KCOMPRESSD_RUNNING,
> > + KCOMPRESSD_SLEEPING,
> > +};
> > +
> > +struct kcompressd_para {
> > + wait_queue_head_t *kcompressd_wait;
> > + struct kfifo *write_fifo;
> > + atomic_t *running;
> > +};
> > +
> > +static struct kcompressd_para *kcompressd_para;
> > +static BLOCKING_NOTIFIER_HEAD(kcompressd_notifier_list);
> > +
> > +struct write_work {
> > + void *mem;
> > + struct bio *bio;
> > + compress_callback cb;
> > +};
> > +
> > +int kcompressd_enabled(void)
> > +{
> > + return likely(atomic_read(&enable_kcompressd));
> > +}
> > +EXPORT_SYMBOL(kcompressd_enabled);
> > +
> > +static void kcompressd_try_to_sleep(struct kcompressd_para *p)
> > +{
> > + DEFINE_WAIT(wait);
> > +
> > + if (!kfifo_is_empty(p->write_fifo))
> > + return;
> > +
> > + if (freezing(current) || kthread_should_stop())
> > + return;
> > +
> > + atomic_set(p->running, KCOMPRESSD_SLEEPING);
> > + prepare_to_wait(p->kcompressd_wait, &wait,
> > TASK_INTERRUPTIBLE);
> > +
> > + /*
> > + * After a short sleep, check if it was a premature sleep.
> > If not, then
> > + * go fully to sleep until explicitly woken up.
> > + */
> > + if (!kthread_should_stop() && kfifo_is_empty(p-
> > >write_fifo))
> > + schedule();
> > +
> > + finish_wait(p->kcompressd_wait, &wait);
> > + atomic_set(p->running, KCOMPRESSD_RUNNING);
> > +}
> > +
> > +static int kcompressd(void *para)
> > +{
> > + struct task_struct *tsk = current;
> > + struct kcompressd_para *p = (struct kcompressd_para *)para;
> > +
> > + tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
> > + set_freezable();
> > +
> > + while (!kthread_should_stop()) {
> > + bool ret;
> > +
> > + kcompressd_try_to_sleep(p);
> > + ret = try_to_freeze();
> > + if (kthread_should_stop())
> > + break;
> > +
> > + if (ret)
> > + continue;
> > +
> > + while (!kfifo_is_empty(p->write_fifo)) {
> > + struct write_work entry;
> > +
> > + if (sizeof(struct write_work) ==
> > kfifo_out(p->write_fifo,
> > + &entry,
> > sizeof(struct write_work))) {
> > + entry.cb(entry.mem, entry.bio);
> > + bio_put(entry.bio);
> > + }
> > + }
> > +
> > + }
> > +
> > + tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
> > + atomic_set(p->running, KCOMPRESSD_NOT_STARTED);
> > + return 0;
> > +}
> > +
> > +static int init_write_queue(void)
> > +{
> > + int i;
> > + unsigned int queue_len = queue_size_per_kcompressd *
> > sizeof(struct write_work);
> > +
> > + for (i = 0; i < nr_kcompressd; i++) {
> > + if (kfifo_alloc(&kcompress[i].write_fifo,
> > + queue_len, GFP_KERNEL)) {
> > + pr_err("Failed to alloc kfifo %d\n", i);
> > + return -ENOMEM;
> > + }
> > + }
> > + return 0;
> > +}
> > +
> > +static void clean_bio_queue(int idx)
> > +{
> > + struct write_work entry;
> > +
> > + while (sizeof(struct write_work) ==
> > kfifo_out(&kcompress[idx].write_fifo,
> > + &entry, sizeof(struct write_work)))
> > {
> > + bio_put(entry.bio);
> > + entry.cb(entry.mem, entry.bio);
> > + }
> > + kfifo_free(&kcompress[idx].write_fifo);
> > +}
> > +
> > +static int kcompress_update(void)
> > +{
> > + int i;
> > + int ret;
> > +
> > + kcompress = kvmalloc_array(nr_kcompressd, sizeof(struct
> > kcompress), GFP_KERNEL);
> > + if (!kcompress)
> > + return -ENOMEM;
> > +
> > + kcompressd_para = kvmalloc_array(nr_kcompressd,
> > sizeof(struct kcompressd_para), GFP_KERNEL);
> > + if (!kcompressd_para)
> > + return -ENOMEM;
> > +
> > + ret = init_write_queue();
> > + if (ret) {
> > + pr_err("Initialization of writing to FIFOs
> > failed!!\n");
> > + return ret;
> > + }
> > +
> > + for (i = 0; i < nr_kcompressd; i++) {
> > + init_waitqueue_head(&kcompress[i].kcompressd_wait);
> > + kcompressd_para[i].kcompressd_wait =
> > &kcompress[i].kcompressd_wait;
> > + kcompressd_para[i].write_fifo =
> > &kcompress[i].write_fifo;
> > + kcompressd_para[i].running = &kcompress[i].running;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +static void stop_all_kcompressd_thread(void)
> > +{
> > + int i;
> > +
> > + for (i = 0; i < nr_kcompressd; i++) {
> > + kthread_stop(kcompress[i].kcompressd);
> > + kcompress[i].kcompressd = NULL;
> > + clean_bio_queue(i);
> > + }
> > +}
> > +
> > +static int do_nr_kcompressd_handler(const char *val,
> > + const struct kernel_param *kp)
> > +{
> > + int ret;
> > +
> > + atomic_set(&enable_kcompressd, false);
> > +
> > + stop_all_kcompressd_thread();
> > +
> > + ret = param_set_int(val, kp);
> > + if (!ret) {
> > + pr_err("Invalid number of kcompressd.\n");
> > + return -EINVAL;
> > + }
> > +
> > + ret = init_write_queue();
> > + if (ret) {
> > + pr_err("Initialization of writing to FIFOs
> > failed!!\n");
> > + return ret;
> > + }
> > +
> > + atomic_set(&enable_kcompressd, true);
> > +
> > + return 0;
> > +}
> > +
> > +static const struct kernel_param_ops
> > param_ops_change_nr_kcompressd = {
> > + .set = &do_nr_kcompressd_handler,
> > + .get = &param_get_uint,
> > + .free = NULL,
> > +};
> > +
> > +module_param_cb(nr_kcompressd, &param_ops_change_nr_kcompressd,
> > + &nr_kcompressd, 0644);
> > +MODULE_PARM_DESC(nr_kcompressd, "Number of pre-created daemon for
> > page compression");
> > +
> > +static int do_queue_size_per_kcompressd_handler(const char *val,
> > + const struct kernel_param *kp)
> > +{
> > + int ret;
> > +
> > + atomic_set(&enable_kcompressd, false);
> > +
> > + stop_all_kcompressd_thread();
> > +
> > + ret = param_set_int(val, kp);
> > + if (!ret) {
> > + pr_err("Invalid queue size for kcompressd.\n");
> > + return -EINVAL;
> > + }
> > +
> > + ret = init_write_queue();
> > + if (ret) {
> > + pr_err("Initialization of writing to FIFOs
> > failed!!\n");
> > + return ret;
> > + }
> > +
> > + pr_info("Queue size for kcompressd was changed: %d\n",
> > queue_size_per_kcompressd);
> > +
> > + atomic_set(&enable_kcompressd, true);
> > + return 0;
> > +}
> > +
> > +static const struct kernel_param_ops
> > param_ops_change_queue_size_per_kcompressd = {
> > + .set = &do_queue_size_per_kcompressd_handler,
> > + .get = &param_get_uint,
> > + .free = NULL,
> > +};
> > +
> > +module_param_cb(queue_size_per_kcompressd,
> > &param_ops_change_queue_size_per_kcompressd,
> > + &queue_size_per_kcompressd, 0644);
> > +MODULE_PARM_DESC(queue_size_per_kcompressd,
> > + "Size of queue for kcompressd");
> > +
> > +int schedule_bio_write(void *mem, struct bio *bio,
> > compress_callback cb)
> > +{
> > + int i;
> > + bool submit_success = false;
> > + size_t sz_work = sizeof(struct write_work);
> > +
> > + struct write_work entry = {
> > + .mem = mem,
> > + .bio = bio,
> > + .cb = cb
> > + };
> > +
> > + if (unlikely(!atomic_read(&enable_kcompressd)))
> > + return -EBUSY;
> > +
> > + if (!nr_kcompressd || !current_is_kswapd())
> > + return -EBUSY;
> > +
> > + bio_get(bio);
> > +
> > + for (i = 0; i < nr_kcompressd; i++) {
> > + submit_success =
> > + (kfifo_avail(&kcompress[i].write_fifo) >=
> > sz_work) &&
> > + (sz_work ==
> > kfifo_in(&kcompress[i].write_fifo, &entry, sz_work));
> > +
> > + if (submit_success) {
> > + switch (atomic_read(&kcompress[i].running))
> > {
> > + case KCOMPRESSD_NOT_STARTED:
> > + atomic_set(&kcompress[i].running,
> > KCOMPRESSD_RUNNING);
> > + kcompress[i].kcompressd =
> > kthread_run(kcompressd,
> > +
> > &kcompressd_para[i], "kcompressd:%d", i);
> > + if
> > (IS_ERR(kcompress[i].kcompressd)) {
> > +
> > atomic_set(&kcompress[i].running, KCOMPRESSD_NOT_STARTED);
> > + pr_warn("Failed to start
> > kcompressd:%d\n", i);
> > + clean_bio_queue(i);
> > + }
> > + break;
> > + case KCOMPRESSD_RUNNING:
> > + break;
> > + case KCOMPRESSD_SLEEPING:
> > +
> > wake_up_interruptible(&kcompress[i].kcompressd_wait);
> > + break;
> > + }
> > + return 0;
> > + }
> > + }
> > +
> > + bio_put(bio);
> > + return -EBUSY;
> > +}
> > +EXPORT_SYMBOL(schedule_bio_write);
> > +
> > +static int __init kcompressd_init(void)
> > +{
> > + int ret;
> > +
> > + nr_kcompressd = DEFAULT_NR_KCOMPRESSD;
> > + queue_size_per_kcompressd = INIT_QUEUE_SIZE;
> > +
> > + ret = kcompress_update();
> > + if (ret) {
> > + pr_err("Init kcompressd failed!\n");
> > + return ret;
> > + }
> > +
> > + atomic_set(&enable_kcompressd, true);
> > + blocking_notifier_call_chain(&kcompressd_notifier_list, 0,
> > NULL);
> > + return 0;
> > +}
> > +
> > +static void __exit kcompressd_exit(void)
> > +{
> > + atomic_set(&enable_kcompressd, false);
> > + stop_all_kcompressd_thread();
> > +
> > + kvfree(kcompress);
> > + kvfree(kcompressd_para);
> > +}
> > +
> > +module_init(kcompressd_init);
> > +module_exit(kcompressd_exit);
> > +
> > +MODULE_LICENSE("Dual BSD/GPL");
> > +MODULE_AUTHOR("Qun-Wei Lin <qun-wei.lin@mediatek.com>");
> > +MODULE_DESCRIPTION("Separate the page compression from the memory
> > reclaiming");
> > +
> > diff --git a/drivers/block/zram/kcompressd.h
> > b/drivers/block/zram/kcompressd.h
> > new file mode 100644
> > index 000000000000..2fe0b424a7af
> > --- /dev/null
> > +++ b/drivers/block/zram/kcompressd.h
> > @@ -0,0 +1,25 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * Copyright (C) 2024 MediaTek Inc.
> > + */
> > +
> > +#ifndef _KCOMPRESSD_H_
> > +#define _KCOMPRESSD_H_
> > +
> > +#include <linux/rwsem.h>
> > +#include <linux/kfifo.h>
> > +#include <linux/atomic.h>
> > +
> > +typedef void (*compress_callback)(void *mem, struct bio *bio);
> > +
> > +struct kcompress {
> > + struct task_struct *kcompressd;
> > + wait_queue_head_t kcompressd_wait;
> > + struct kfifo write_fifo;
> > + atomic_t running;
> > +};
> > +
> > +int kcompressd_enabled(void);
> > +int schedule_bio_write(void *mem, struct bio *bio,
> > compress_callback cb);
> > +#endif
> > +
> > diff --git a/drivers/block/zram/zram_drv.c
> > b/drivers/block/zram/zram_drv.c
> > index 2e1a70f2f4bd..bcd63ecb6ff2 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -35,6 +35,7 @@
> > #include <linux/part_stat.h>
> > #include <linux/kernel_read_file.h>
> >
> > +#include "kcompressd.h"
> > #include "zram_drv.h"
> >
> > static DEFINE_IDR(zram_index_idr);
> > @@ -2240,6 +2241,15 @@ static void zram_bio_write(struct zram
> > *zram, struct bio *bio)
> > bio_endio(bio);
> > }
> >
> > +#if IS_ENABLED(CONFIG_KCOMPRESSD)
> > +static void zram_bio_write_callback(void *mem, struct bio *bio)
> > +{
> > + struct zram *zram = (struct zram *)mem;
> > +
> > + zram_bio_write(zram, bio);
> > +}
> > +#endif
> > +
> > /*
> > * Handler function for all zram I/O requests.
> > */
> > @@ -2252,6 +2262,10 @@ static void zram_submit_bio(struct bio *bio)
> > zram_bio_read(zram, bio);
> > break;
> > case REQ_OP_WRITE:
> > +#if IS_ENABLED(CONFIG_KCOMPRESSD)
> > + if (kcompressd_enabled() &&
> > !schedule_bio_write(zram, bio, zram_bio_write_callback))
> > + break;
> > +#endif
> > zram_bio_write(zram, bio);
> > break;
> > case REQ_OP_DISCARD:
> > @@ -2535,9 +2549,11 @@ static int zram_add(void)
> > #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
> > .max_write_zeroes_sectors = UINT_MAX,
> > #endif
> > - .features =
> > BLK_FEAT_STABLE_WRITES |
> > -
> > BLK_FEAT_READ_SYNCHRONOUS |
> > -
> > BLK_FEAT_WRITE_SYNCHRONOUS,
> > + .features =
> > BLK_FEAT_STABLE_WRITES
> > + |
> > BLK_FEAT_READ_SYNCHRONOUS
> > +#if !IS_ENABLED(CONFIG_KCOMPRESSD)
> > + |
> > BLK_FEAT_WRITE_SYNCHRONOUS,
> > +#endif
> > };
> > struct zram *zram;
> > int ret, device_id;
> > --
> > 2.45.2
> >
>
> Thanks
> Barry
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-10 13:26 ` Qun-wei Lin (林群崴)
@ 2025-03-11 7:05 ` Barry Song
2025-03-11 7:25 ` Barry Song
2025-03-11 14:33 ` Qun-wei Lin (林群崴)
0 siblings, 2 replies; 39+ messages in thread
From: Barry Song @ 2025-03-11 7:05 UTC (permalink / raw)
To: Qun-wei Lin (林群崴)
Cc: Andrew Yang (楊智強),
Casper Li (李中榮),
chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, dave.jiang, schatzberg.dan,
Chinwen Chang (張錦文),
viro, ryan.roberts, minchan, axboe, linux-block, kasong, nvdimm,
vishal.l.verma, matthias.bgg, linux-arm-kernel, ying.huang,
senozhatsky, dan.j.williams
On Tue, Mar 11, 2025 at 2:26 AM Qun-wei Lin (林群崴)
<Qun-wei.Lin@mediatek.com> wrote:
>
> On Sat, 2025-03-08 at 08:41 +1300, Barry Song wrote:
> >
> > External email : Please do not click links or open attachments until
> > you have verified the sender or the content.
> >
> >
> > On Sat, Mar 8, 2025 at 1:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com>
> > wrote:
> > >
> > > Introduced Kcompressd to offload zram page compression, improving
> > > system efficiency by handling compression separately from memory
> > > reclaiming. Added necessary configurations and dependencies.
> > >
> > > Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> > > ---
> > > drivers/block/zram/Kconfig | 11 ++
> > > drivers/block/zram/Makefile | 3 +-
> > > drivers/block/zram/kcompressd.c | 340
> > > ++++++++++++++++++++++++++++++++
> > > drivers/block/zram/kcompressd.h | 25 +++
> > > drivers/block/zram/zram_drv.c | 22 ++-
> > > 5 files changed, 397 insertions(+), 4 deletions(-)
> > > create mode 100644 drivers/block/zram/kcompressd.c
> > > create mode 100644 drivers/block/zram/kcompressd.h
> > >
> > > diff --git a/drivers/block/zram/Kconfig
> > > b/drivers/block/zram/Kconfig
> > > index 402b7b175863..f0a1b574f770 100644
> > > --- a/drivers/block/zram/Kconfig
> > > +++ b/drivers/block/zram/Kconfig
> > > @@ -145,3 +145,14 @@ config ZRAM_MULTI_COMP
> > > re-compress pages using a potentially slower but more
> > > effective
> > > compression algorithm. Note, that IDLE page recompression
> > > requires ZRAM_TRACK_ENTRY_ACTIME.
> > > +
> > > +config KCOMPRESSD
> > > + tristate "Kcompressd: Accelerated zram compression"
> > > + depends on ZRAM
> > > + help
> > > + Kcompressd creates multiple daemons to accelerate the
> > > compression of pages
> > > + in zram, offloading this time-consuming task from the
> > > zram driver.
> > > +
> > > + This approach improves system efficiency by handling page
> > > compression separately,
> > > + which was originally done by kswapd or direct reclaim.
> >
> > For direct reclaim, we were previously able to compress using
> > multiple CPUs
> > with multi-threading.
> > After your patch, it seems that only a single thread/CPU is used for
> > compression
> > so it won't necessarily improve direct reclaim performance?
> >
>
> > Our patch only splits the context of kswapd. When direct reclaim
> > occurs, kcompressd is bypassed, so direct reclaim remains unchanged:
> > each thread that needs memory reclaims it directly.
Qun-wei, I’m getting a bit confused. Looking at the code in page_io.c,
you always call swap_writepage_bdev_async() no matter if it is kswapd
or direct reclaim:
- else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
+ else if (data_race(sis->flags & SWP_WRITE_SYNCHRONOUS_IO))
swap_writepage_bdev_sync(folio, wbc, sis);
else
swap_writepage_bdev_async(folio, wbc, sis);
In zram, I notice you are bypassing kcompressd by:
+ if (!nr_kcompressd || !current_is_kswapd())
+ return -EBUSY;
How will this work if no one is calling __end_swap_bio_write(&bio),
which is present in swap_writepage_bdev_sync()?
Am I missing something? Or is it done by zram_bio_write()?
On the other hand, zram is a generic block device, and coupling its
code with kswapd/direct reclaim somehow violates layering
principles :-)
>
> > Even for kswapd, we used to have multiple threads like [kswapd0],
> > [kswapd1],
> > and [kswapd2] for different nodes. Now, are we also limited to just
> > one thread?
>
> We only considered a single kswapd here and didn't account for multiple
> instances. Since I use kfifo to collect the bios, if there are multiple
> kswapds, we need to add a lock to protect the bio queue. I can revise
> this in the 2nd version, or do you have any other suggested approaches?
I'm wondering if we can move the code to vmscan/page_io instead
of zram. If we're using a sync I/O swap device or have enabled zswap,
we could run reclamation in this separate thread, which should also be
NUMA-aware.
I would definitely be interested in prototyping it when I have the time.
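Very roughly, I would expect one helper thread per memory node,
created on that node the same way the kswapd instances are. An
untested sketch (kcompressd_fn and kcompressd_threads are made-up
names for illustration):

#include <linux/kthread.h>
#include <linux/nodemask.h>

static int kcompressd_fn(void *data);	/* worker loop, details omitted */
static struct task_struct *kcompressd_threads[MAX_NUMNODES];

static int __init kcompressd_per_node_init(void)
{
	int nid;

	for_each_node_state(nid, N_MEMORY) {
		struct task_struct *t;

		t = kthread_create_on_node(kcompressd_fn, NULL, nid,
					   "kcompressd%d", nid);
		if (IS_ERR(t))
			return PTR_ERR(t);
		kcompressd_threads[nid] = t;
		wake_up_process(t);
	}
	return 0;
}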
>
> > I also wonder if this could be handled at the vmscan level instead of
> > the zram
> > level. then it might potentially help other sync devices or even
> > zswap later.
> >
> > But I agree that for phones, modifying zram seems like an easier
> > starting
> > point. However, relying on a single thread isn't always the best
> > approach.
> >
> >
> > > +
> > > diff --git a/drivers/block/zram/Makefile
> > > b/drivers/block/zram/Makefile
> > > index 0fdefd576691..23baa5dfceb9 100644
> > > --- a/drivers/block/zram/Makefile
> > > +++ b/drivers/block/zram/Makefile
> > > @@ -9,4 +9,5 @@ zram-$(CONFIG_ZRAM_BACKEND_ZSTD) +=
> > > backend_zstd.o
> > > zram-$(CONFIG_ZRAM_BACKEND_DEFLATE) += backend_deflate.o
> > > zram-$(CONFIG_ZRAM_BACKEND_842) += backend_842.o
> > >
> > > -obj-$(CONFIG_ZRAM) += zram.o
> > > +obj-$(CONFIG_ZRAM) += zram.o
> > > +obj-$(CONFIG_KCOMPRESSD) += kcompressd.o
> > > diff --git a/drivers/block/zram/kcompressd.c
> > > b/drivers/block/zram/kcompressd.c
> > > new file mode 100644
> > > index 000000000000..195b7e386869
> > > --- /dev/null
> > > +++ b/drivers/block/zram/kcompressd.c
> > > @@ -0,0 +1,340 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +/*
> > > + * Copyright (C) 2024 MediaTek Inc.
> > > + */
> > > +
> > > +#include <linux/module.h>
> > > +#include <linux/kernel.h>
> > > +#include <linux/bio.h>
> > > +#include <linux/bitops.h>
> > > +#include <linux/freezer.h>
> > > +#include <linux/kernel.h>
> > > +#include <linux/psi.h>
> > > +#include <linux/kfifo.h>
> > > +#include <linux/swap.h>
> > > +#include <linux/delay.h>
> > > +
> > > +#include "kcompressd.h"
> > > +
> > > +#define INIT_QUEUE_SIZE 4096
> > > +#define DEFAULT_NR_KCOMPRESSD 4
> > > +
> > > +static atomic_t enable_kcompressd;
> > > +static unsigned int nr_kcompressd;
> > > +static unsigned int queue_size_per_kcompressd;
> > > +static struct kcompress *kcompress;
> > > +
> > > +enum run_state {
> > > + KCOMPRESSD_NOT_STARTED = 0,
> > > + KCOMPRESSD_RUNNING,
> > > + KCOMPRESSD_SLEEPING,
> > > +};
> > > +
> > > +struct kcompressd_para {
> > > + wait_queue_head_t *kcompressd_wait;
> > > + struct kfifo *write_fifo;
> > > + atomic_t *running;
> > > +};
> > > +
> > > +static struct kcompressd_para *kcompressd_para;
> > > +static BLOCKING_NOTIFIER_HEAD(kcompressd_notifier_list);
> > > +
> > > +struct write_work {
> > > + void *mem;
> > > + struct bio *bio;
> > > + compress_callback cb;
> > > +};
> > > +
> > > +int kcompressd_enabled(void)
> > > +{
> > > + return likely(atomic_read(&enable_kcompressd));
> > > +}
> > > +EXPORT_SYMBOL(kcompressd_enabled);
> > > +
> > > +static void kcompressd_try_to_sleep(struct kcompressd_para *p)
> > > +{
> > > + DEFINE_WAIT(wait);
> > > +
> > > + if (!kfifo_is_empty(p->write_fifo))
> > > + return;
> > > +
> > > + if (freezing(current) || kthread_should_stop())
> > > + return;
> > > +
> > > + atomic_set(p->running, KCOMPRESSD_SLEEPING);
> > > + prepare_to_wait(p->kcompressd_wait, &wait,
> > > TASK_INTERRUPTIBLE);
> > > +
> > > + /*
> > > + * After a short sleep, check if it was a premature sleep.
> > > If not, then
> > > + * go fully to sleep until explicitly woken up.
> > > + */
> > > + if (!kthread_should_stop() && kfifo_is_empty(p-
> > > >write_fifo))
> > > + schedule();
> > > +
> > > + finish_wait(p->kcompressd_wait, &wait);
> > > + atomic_set(p->running, KCOMPRESSD_RUNNING);
> > > +}
> > > +
> > > +static int kcompressd(void *para)
> > > +{
> > > + struct task_struct *tsk = current;
> > > + struct kcompressd_para *p = (struct kcompressd_para *)para;
> > > +
> > > + tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
> > > + set_freezable();
> > > +
> > > + while (!kthread_should_stop()) {
> > > + bool ret;
> > > +
> > > + kcompressd_try_to_sleep(p);
> > > + ret = try_to_freeze();
> > > + if (kthread_should_stop())
> > > + break;
> > > +
> > > + if (ret)
> > > + continue;
> > > +
> > > + while (!kfifo_is_empty(p->write_fifo)) {
> > > + struct write_work entry;
> > > +
> > > + if (sizeof(struct write_work) ==
> > > kfifo_out(p->write_fifo,
> > > + &entry,
> > > sizeof(struct write_work))) {
> > > + entry.cb(entry.mem, entry.bio);
> > > + bio_put(entry.bio);
> > > + }
> > > + }
> > > +
> > > + }
> > > +
> > > + tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
> > > + atomic_set(p->running, KCOMPRESSD_NOT_STARTED);
> > > + return 0;
> > > +}
> > > +
> > > +static int init_write_queue(void)
> > > +{
> > > + int i;
> > > + unsigned int queue_len = queue_size_per_kcompressd *
> > > sizeof(struct write_work);
> > > +
> > > + for (i = 0; i < nr_kcompressd; i++) {
> > > + if (kfifo_alloc(&kcompress[i].write_fifo,
> > > + queue_len, GFP_KERNEL)) {
> > > + pr_err("Failed to alloc kfifo %d\n", i);
> > > + return -ENOMEM;
> > > + }
> > > + }
> > > + return 0;
> > > +}
> > > +
> > > +static void clean_bio_queue(int idx)
> > > +{
> > > + struct write_work entry;
> > > +
> > > + while (sizeof(struct write_work) ==
> > > kfifo_out(&kcompress[idx].write_fifo,
> > > + &entry, sizeof(struct write_work)))
> > > {
> > > + bio_put(entry.bio);
> > > + entry.cb(entry.mem, entry.bio);
> > > + }
> > > + kfifo_free(&kcompress[idx].write_fifo);
> > > +}
> > > +
> > > +static int kcompress_update(void)
> > > +{
> > > + int i;
> > > + int ret;
> > > +
> > > + kcompress = kvmalloc_array(nr_kcompressd, sizeof(struct
> > > kcompress), GFP_KERNEL);
> > > + if (!kcompress)
> > > + return -ENOMEM;
> > > +
> > > + kcompressd_para = kvmalloc_array(nr_kcompressd,
> > > sizeof(struct kcompressd_para), GFP_KERNEL);
> > > + if (!kcompressd_para)
> > > + return -ENOMEM;
> > > +
> > > + ret = init_write_queue();
> > > + if (ret) {
> > > + pr_err("Initialization of writing to FIFOs
> > > failed!!\n");
> > > + return ret;
> > > + }
> > > +
> > > + for (i = 0; i < nr_kcompressd; i++) {
> > > + init_waitqueue_head(&kcompress[i].kcompressd_wait);
> > > + kcompressd_para[i].kcompressd_wait =
> > > &kcompress[i].kcompressd_wait;
> > > + kcompressd_para[i].write_fifo =
> > > &kcompress[i].write_fifo;
> > > + kcompressd_para[i].running = &kcompress[i].running;
> > > + }
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static void stop_all_kcompressd_thread(void)
> > > +{
> > > + int i;
> > > +
> > > + for (i = 0; i < nr_kcompressd; i++) {
> > > + kthread_stop(kcompress[i].kcompressd);
> > > + kcompress[i].kcompressd = NULL;
> > > + clean_bio_queue(i);
> > > + }
> > > +}
> > > +
> > > +static int do_nr_kcompressd_handler(const char *val,
> > > + const struct kernel_param *kp)
> > > +{
> > > + int ret;
> > > +
> > > + atomic_set(&enable_kcompressd, false);
> > > +
> > > + stop_all_kcompressd_thread();
> > > +
> > > + ret = param_set_int(val, kp);
> > > + if (!ret) {
> > > + pr_err("Invalid number of kcompressd.\n");
> > > + return -EINVAL;
> > > + }
> > > +
> > > + ret = init_write_queue();
> > > + if (ret) {
> > > + pr_err("Initialization of writing to FIFOs
> > > failed!!\n");
> > > + return ret;
> > > + }
> > > +
> > > + atomic_set(&enable_kcompressd, true);
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static const struct kernel_param_ops
> > > param_ops_change_nr_kcompressd = {
> > > + .set = &do_nr_kcompressd_handler,
> > > + .get = &param_get_uint,
> > > + .free = NULL,
> > > +};
> > > +
> > > +module_param_cb(nr_kcompressd, &param_ops_change_nr_kcompressd,
> > > + &nr_kcompressd, 0644);
> > > +MODULE_PARM_DESC(nr_kcompressd, "Number of pre-created daemon for
> > > page compression");
> > > +
> > > +static int do_queue_size_per_kcompressd_handler(const char *val,
> > > + const struct kernel_param *kp)
> > > +{
> > > + int ret;
> > > +
> > > + atomic_set(&enable_kcompressd, false);
> > > +
> > > + stop_all_kcompressd_thread();
> > > +
> > > + ret = param_set_int(val, kp);
> > > + if (!ret) {
> > > + pr_err("Invalid queue size for kcompressd.\n");
> > > + return -EINVAL;
> > > + }
> > > +
> > > + ret = init_write_queue();
> > > + if (ret) {
> > > + pr_err("Initialization of writing to FIFOs
> > > failed!!\n");
> > > + return ret;
> > > + }
> > > +
> > > + pr_info("Queue size for kcompressd was changed: %d\n",
> > > queue_size_per_kcompressd);
> > > +
> > > + atomic_set(&enable_kcompressd, true);
> > > + return 0;
> > > +}
> > > +
> > > +static const struct kernel_param_ops
> > > param_ops_change_queue_size_per_kcompressd = {
> > > + .set = &do_queue_size_per_kcompressd_handler,
> > > + .get = &param_get_uint,
> > > + .free = NULL,
> > > +};
> > > +
> > > +module_param_cb(queue_size_per_kcompressd,
> > > &param_ops_change_queue_size_per_kcompressd,
> > > + &queue_size_per_kcompressd, 0644);
> > > +MODULE_PARM_DESC(queue_size_per_kcompressd,
> > > + "Size of queue for kcompressd");
> > > +
> > > +int schedule_bio_write(void *mem, struct bio *bio,
> > > compress_callback cb)
> > > +{
> > > + int i;
> > > + bool submit_success = false;
> > > + size_t sz_work = sizeof(struct write_work);
> > > +
> > > + struct write_work entry = {
> > > + .mem = mem,
> > > + .bio = bio,
> > > + .cb = cb
> > > + };
> > > +
> > > + if (unlikely(!atomic_read(&enable_kcompressd)))
> > > + return -EBUSY;
> > > +
> > > + if (!nr_kcompressd || !current_is_kswapd())
> > > + return -EBUSY;
> > > +
> > > + bio_get(bio);
> > > +
> > > + for (i = 0; i < nr_kcompressd; i++) {
> > > + submit_success =
> > > + (kfifo_avail(&kcompress[i].write_fifo) >=
> > > sz_work) &&
> > > + (sz_work ==
> > > kfifo_in(&kcompress[i].write_fifo, &entry, sz_work));
> > > +
> > > + if (submit_success) {
> > > + switch (atomic_read(&kcompress[i].running))
> > > {
> > > + case KCOMPRESSD_NOT_STARTED:
> > > + atomic_set(&kcompress[i].running,
> > > KCOMPRESSD_RUNNING);
> > > + kcompress[i].kcompressd =
> > > kthread_run(kcompressd,
> > > +
> > > &kcompressd_para[i], "kcompressd:%d", i);
> > > + if
> > > (IS_ERR(kcompress[i].kcompressd)) {
> > > +
> > > atomic_set(&kcompress[i].running, KCOMPRESSD_NOT_STARTED);
> > > + pr_warn("Failed to start
> > > kcompressd:%d\n", i);
> > > + clean_bio_queue(i);
> > > + }
> > > + break;
> > > + case KCOMPRESSD_RUNNING:
> > > + break;
> > > + case KCOMPRESSD_SLEEPING:
> > > +
> > > wake_up_interruptible(&kcompress[i].kcompressd_wait);
> > > + break;
> > > + }
> > > + return 0;
> > > + }
> > > + }
> > > +
> > > + bio_put(bio);
> > > + return -EBUSY;
> > > +}
> > > +EXPORT_SYMBOL(schedule_bio_write);
> > > +
> > > +static int __init kcompressd_init(void)
> > > +{
> > > + int ret;
> > > +
> > > + nr_kcompressd = DEFAULT_NR_KCOMPRESSD;
> > > + queue_size_per_kcompressd = INIT_QUEUE_SIZE;
> > > +
> > > + ret = kcompress_update();
> > > + if (ret) {
> > > + pr_err("Init kcompressd failed!\n");
> > > + return ret;
> > > + }
> > > +
> > > + atomic_set(&enable_kcompressd, true);
> > > + blocking_notifier_call_chain(&kcompressd_notifier_list, 0,
> > > NULL);
> > > + return 0;
> > > +}
> > > +
> > > +static void __exit kcompressd_exit(void)
> > > +{
> > > + atomic_set(&enable_kcompressd, false);
> > > + stop_all_kcompressd_thread();
> > > +
> > > + kvfree(kcompress);
> > > + kvfree(kcompressd_para);
> > > +}
> > > +
> > > +module_init(kcompressd_init);
> > > +module_exit(kcompressd_exit);
> > > +
> > > +MODULE_LICENSE("Dual BSD/GPL");
> > > +MODULE_AUTHOR("Qun-Wei Lin <qun-wei.lin@mediatek.com>");
> > > +MODULE_DESCRIPTION("Separate the page compression from the memory
> > > reclaiming");
> > > +
> > > diff --git a/drivers/block/zram/kcompressd.h
> > > b/drivers/block/zram/kcompressd.h
> > > new file mode 100644
> > > index 000000000000..2fe0b424a7af
> > > --- /dev/null
> > > +++ b/drivers/block/zram/kcompressd.h
> > > @@ -0,0 +1,25 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +/*
> > > + * Copyright (C) 2024 MediaTek Inc.
> > > + */
> > > +
> > > +#ifndef _KCOMPRESSD_H_
> > > +#define _KCOMPRESSD_H_
> > > +
> > > +#include <linux/rwsem.h>
> > > +#include <linux/kfifo.h>
> > > +#include <linux/atomic.h>
> > > +
> > > +typedef void (*compress_callback)(void *mem, struct bio *bio);
> > > +
> > > +struct kcompress {
> > > + struct task_struct *kcompressd;
> > > + wait_queue_head_t kcompressd_wait;
> > > + struct kfifo write_fifo;
> > > + atomic_t running;
> > > +};
> > > +
> > > +int kcompressd_enabled(void);
> > > +int schedule_bio_write(void *mem, struct bio *bio,
> > > compress_callback cb);
> > > +#endif
> > > +
> > > diff --git a/drivers/block/zram/zram_drv.c
> > > b/drivers/block/zram/zram_drv.c
> > > index 2e1a70f2f4bd..bcd63ecb6ff2 100644
> > > --- a/drivers/block/zram/zram_drv.c
> > > +++ b/drivers/block/zram/zram_drv.c
> > > @@ -35,6 +35,7 @@
> > > #include <linux/part_stat.h>
> > > #include <linux/kernel_read_file.h>
> > >
> > > +#include "kcompressd.h"
> > > #include "zram_drv.h"
> > >
> > > static DEFINE_IDR(zram_index_idr);
> > > @@ -2240,6 +2241,15 @@ static void zram_bio_write(struct zram
> > > *zram, struct bio *bio)
> > > bio_endio(bio);
> > > }
> > >
> > > +#if IS_ENABLED(CONFIG_KCOMPRESSD)
> > > +static void zram_bio_write_callback(void *mem, struct bio *bio)
> > > +{
> > > + struct zram *zram = (struct zram *)mem;
> > > +
> > > + zram_bio_write(zram, bio);
> > > +}
> > > +#endif
> > > +
> > > /*
> > > * Handler function for all zram I/O requests.
> > > */
> > > @@ -2252,6 +2262,10 @@ static void zram_submit_bio(struct bio *bio)
> > > zram_bio_read(zram, bio);
> > > break;
> > > case REQ_OP_WRITE:
> > > +#if IS_ENABLED(CONFIG_KCOMPRESSD)
> > > + if (kcompressd_enabled() &&
> > > !schedule_bio_write(zram, bio, zram_bio_write_callback))
> > > + break;
> > > +#endif
> > > zram_bio_write(zram, bio);
> > > break;
> > > case REQ_OP_DISCARD:
> > > @@ -2535,9 +2549,11 @@ static int zram_add(void)
> > > #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
> > > .max_write_zeroes_sectors = UINT_MAX,
> > > #endif
> > > - .features =
> > > BLK_FEAT_STABLE_WRITES |
> > > -
> > > BLK_FEAT_READ_SYNCHRONOUS |
> > > -
> > > BLK_FEAT_WRITE_SYNCHRONOUS,
> > > + .features =
> > > BLK_FEAT_STABLE_WRITES
> > > + |
> > > BLK_FEAT_READ_SYNCHRONOUS
> > > +#if !IS_ENABLED(CONFIG_KCOMPRESSD)
> > > + |
> > > BLK_FEAT_WRITE_SYNCHRONOUS,
> > > +#endif
> > > };
> > > struct zram *zram;
> > > int ret, device_id;
> > > --
> > > 2.45.2
> > >
> >
Thanks
Barry
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-11 7:05 ` Barry Song
@ 2025-03-11 7:25 ` Barry Song
2025-03-11 14:33 ` Qun-wei Lin (林群崴)
1 sibling, 0 replies; 39+ messages in thread
From: Barry Song @ 2025-03-11 7:25 UTC (permalink / raw)
To: Qun-wei Lin (林群崴)
Cc: Andrew Yang (楊智強),
Casper Li (李中榮),
chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, dave.jiang, schatzberg.dan,
Chinwen Chang (張錦文),
viro, ryan.roberts, minchan, axboe, linux-block, kasong, nvdimm,
vishal.l.verma, matthias.bgg, linux-arm-kernel, ying.huang,
senozhatsky, dan.j.williams
On Tue, Mar 11, 2025 at 8:05 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Tue, Mar 11, 2025 at 2:26 AM Qun-wei Lin (林群崴)
> <Qun-wei.Lin@mediatek.com> wrote:
> >
> > On Sat, 2025-03-08 at 08:41 +1300, Barry Song wrote:
> > >
> > > External email : Please do not click links or open attachments until
> > > you have verified the sender or the content.
> > >
> > >
> > > On Sat, Mar 8, 2025 at 1:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com>
> > > wrote:
> > > >
> > > > Introduced Kcompressd to offload zram page compression, improving
> > > > system efficiency by handling compression separately from memory
> > > > reclaiming. Added necessary configurations and dependencies.
> > > >
> > > > Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> > > > ---
> > > > drivers/block/zram/Kconfig | 11 ++
> > > > drivers/block/zram/Makefile | 3 +-
> > > > drivers/block/zram/kcompressd.c | 340
> > > > ++++++++++++++++++++++++++++++++
> > > > drivers/block/zram/kcompressd.h | 25 +++
> > > > drivers/block/zram/zram_drv.c | 22 ++-
> > > > 5 files changed, 397 insertions(+), 4 deletions(-)
> > > > create mode 100644 drivers/block/zram/kcompressd.c
> > > > create mode 100644 drivers/block/zram/kcompressd.h
> > > >
> > > > diff --git a/drivers/block/zram/Kconfig
> > > > b/drivers/block/zram/Kconfig
> > > > index 402b7b175863..f0a1b574f770 100644
> > > > --- a/drivers/block/zram/Kconfig
> > > > +++ b/drivers/block/zram/Kconfig
> > > > @@ -145,3 +145,14 @@ config ZRAM_MULTI_COMP
> > > > re-compress pages using a potentially slower but more
> > > > effective
> > > > compression algorithm. Note, that IDLE page recompression
> > > > requires ZRAM_TRACK_ENTRY_ACTIME.
> > > > +
> > > > +config KCOMPRESSD
> > > > + tristate "Kcompressd: Accelerated zram compression"
> > > > + depends on ZRAM
> > > > + help
> > > > + Kcompressd creates multiple daemons to accelerate the
> > > > compression of pages
> > > > + in zram, offloading this time-consuming task from the
> > > > zram driver.
> > > > +
> > > > + This approach improves system efficiency by handling page
> > > > compression separately,
> > > > + which was originally done by kswapd or direct reclaim.
> > >
> > > For direct reclaim, we were previously able to compress using
> > > multiple CPUs
> > > with multi-threading.
> > > After your patch, it seems that only a single thread/CPU is used for
> > > compression
> > > so it won't necessarily improve direct reclaim performance?
> > >
> >
> > Our patch only splits the context of kswapd. When direct reclaim
> > occurs, kcompressd is bypassed, so direct reclaim remains unchanged:
> > each thread that needs memory reclaims it directly.
>
> Qun-wei, I’m getting a bit confused. Looking at the code in page_io.c,
> you always call swap_writepage_bdev_async() no matter if it is kswapd
> or direct reclaim:
>
> - else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
> + else if (data_race(sis->flags & SWP_WRITE_SYNCHRONOUS_IO))
> swap_writepage_bdev_sync(folio, wbc, sis);
> else
> swap_writepage_bdev_async(folio, wbc, sis);
>
> In zram, I notice you are bypassing kcompressd by:
>
> + if (!nr_kcompressd || !current_is_kswapd())
> + return -EBUSY;
>
> How will this work if no one is calling __end_swap_bio_write(&bio),
> which is present in swap_writepage_bdev_sync()?
> Am I missing something? Or is it done by zram_bio_write()?
>
> On the other hand, zram is a generic block device, and coupling its
> code with kswapd/direct reclaim somehow violates layering
> principles :-)
>
> >
> > > Even for kswapd, we used to have multiple threads like [kswapd0],
> > > [kswapd1],
> > > and [kswapd2] for different nodes. Now, are we also limited to just
> > > one thread?
> >
> > We only considered a single kswapd here and didn't account for multiple
> > instances. Since I use kfifo to collect the bios, if there are multiple
> > kswapds, we need to add a lock to protect the bio queue. I can revise
> > this in the 2nd version, or do you have any other suggested approaches?
>
> I'm wondering if we can move the code to vmscan/page_io instead
> of zram. If we're using a sync I/O swap device or have enabled zswap,
> we could run reclamation in this separate thread, which should also be
Sorry for the typo:
s/reclamation/__swap_writepage/g
> NUMA-aware.
>
> I would definitely be interested in prototyping it when I have the time.
>
Thanks
Barry
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression
2025-03-11 7:05 ` Barry Song
2025-03-11 7:25 ` Barry Song
@ 2025-03-11 14:33 ` Qun-wei Lin (林群崴)
1 sibling, 0 replies; 39+ messages in thread
From: Qun-wei Lin (林群崴) @ 2025-03-11 14:33 UTC (permalink / raw)
To: 21cnbao
Cc: Andrew Yang (楊智強),
dan.j.williams, chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, dave.jiang, schatzberg.dan,
Chinwen Chang (張錦文),
Casper Li (李中榮),
ryan.roberts, minchan, viro, linux-block, kasong, axboe,
vishal.l.verma, nvdimm, matthias.bgg, ying.huang, senozhatsky,
linux-arm-kernel
On Tue, 2025-03-11 at 20:05 +1300, Barry Song wrote:
>
> External email : Please do not click links or open attachments until
> you have verified the sender or the content.
>
>
> On Tue, Mar 11, 2025 at 2:26 AM Qun-wei Lin (林群崴)
> <Qun-wei.Lin@mediatek.com> wrote:
> >
> > On Sat, 2025-03-08 at 08:41 +1300, Barry Song wrote:
> > >
> > > External email : Please do not click links or open attachments
> > > until
> > > you have verified the sender or the content.
> > >
> > >
> > > On Sat, Mar 8, 2025 at 1:02 AM Qun-Wei Lin
> > > <qun-wei.lin@mediatek.com>
> > > wrote:
> > > >
> > > > Introduced Kcompressd to offload zram page compression,
> > > > improving
> > > > system efficiency by handling compression separately from
> > > > memory
> > > > reclaiming. Added necessary configurations and dependencies.
> > > >
> > > > Signed-off-by: Qun-Wei Lin <qun-wei.lin@mediatek.com>
> > > > ---
> > > > drivers/block/zram/Kconfig | 11 ++
> > > > drivers/block/zram/Makefile | 3 +-
> > > > drivers/block/zram/kcompressd.c | 340
> > > > ++++++++++++++++++++++++++++++++
> > > > drivers/block/zram/kcompressd.h | 25 +++
> > > > drivers/block/zram/zram_drv.c | 22 ++-
> > > > 5 files changed, 397 insertions(+), 4 deletions(-)
> > > > create mode 100644 drivers/block/zram/kcompressd.c
> > > > create mode 100644 drivers/block/zram/kcompressd.h
> > > >
> > > > diff --git a/drivers/block/zram/Kconfig
> > > > b/drivers/block/zram/Kconfig
> > > > index 402b7b175863..f0a1b574f770 100644
> > > > --- a/drivers/block/zram/Kconfig
> > > > +++ b/drivers/block/zram/Kconfig
> > > > @@ -145,3 +145,14 @@ config ZRAM_MULTI_COMP
> > > > re-compress pages using a potentially slower but more
> > > > effective
> > > > compression algorithm. Note, that IDLE page
> > > > recompression
> > > > requires ZRAM_TRACK_ENTRY_ACTIME.
> > > > +
> > > > +config KCOMPRESSD
> > > > + tristate "Kcompressd: Accelerated zram compression"
> > > > + depends on ZRAM
> > > > + help
> > > > + Kcompressd creates multiple daemons to accelerate the
> > > > compression of pages
> > > > + in zram, offloading this time-consuming task from the
> > > > zram driver.
> > > > +
> > > > + This approach improves system efficiency by handling
> > > > page
> > > > compression separately,
> > > > + which was originally done by kswapd or direct
> > > > reclaim.
> > >
> > > For direct reclaim, we were previously able to compress using
> > > multiple CPUs
> > > with multi-threading.
> > > After your patch, it seems that only a single thread/CPU is used
> > > for
> > > compression
> > > so it won't necessarily improve direct reclaim performance?
> > >
> >
> > Our patch only splits the context of kswapd. When direct reclaim
> > occurs, kcompressd is bypassed, so direct reclaim remains unchanged:
> > each thread that needs memory reclaims it directly.
>
> Qun-wei, I’m getting a bit confused. Looking at the code in
> page_io.c,
> you always call swap_writepage_bdev_async() no matter if it is kswapd
> or direct reclaim:
>
> - else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
> + else if (data_race(sis->flags & SWP_WRITE_SYNCHRONOUS_IO))
> swap_writepage_bdev_sync(folio, wbc, sis);
> else
> swap_writepage_bdev_async(folio, wbc, sis);
>
> In zram, I notice you are bypassing kcompressd by:
>
> + if (!nr_kcompressd || !current_is_kswapd())
> + return -EBUSY;
>
> How will this work if no one is calling __end_swap_bio_write(&bio),
> which is present in swap_writepage_bdev_sync()?
> Am I missing something? Or is it done by zram_bio_write()?
>
In swap_writepage_bdev_async(), 'bio->bi_end_io = end_swap_bio_write'
is set before the bio is submitted. So yes, the bio_endio(bio) at the
end of zram_bio_write() will invoke it and complete the write.
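Abridged, the completion chain is (exact code differs between kernel
versions):

swap_writepage_bdev_async()		/* mm/page_io.c */
    bio->bi_end_io = end_swap_bio_write;
    submit_bio(bio);			/* routed to zram_submit_bio() */

zram_bio_write()			/* drivers/block/zram/zram_drv.c */
    /* compress and store each bio segment */
    bio_endio(bio);			/* invokes bio->bi_end_io, i.e.
					   end_swap_bio_write() */

So even when zram_bio_write() runs later from a kcompressd thread,
bio_endio() still fires end_swap_bio_write() and the folio writeback
completes as usual.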
> On the other hand, zram is a generic block device, and coupling its
> code with kswapd/direct reclaim somehow violates layering
> principles :-)
OK, so it seems that moving kcompressd to the vmscan level might
resolve this issue, right?
> >
> > > Even for kswapd, we used to have multiple threads like [kswapd0],
> > > [kswapd1],
> > > and [kswapd2] for different nodes. Now, are we also limited to
> > > just
> > > one thread?
> >
> > We only considered a single kswapd here and didn't account for
> > multiple
> > instances. Since I use kfifo to collect the bios, if there are
> > multiple
> > kswapds, we need to add a lock to protect the bio queue. I can
> > revise
> > this in the 2nd version, or do you have any other suggested
> > approaches?
>
> I'm wondering if we can move the code to vmscan/page_io instead
> of zram. If we're using a sync I/O swap device or have enabled zswap,
> we could run reclamation in this separate thread, which should also
> be
> NUMA-aware.
>
> I would definitely be interested in prototyping it when I have the
> time.
Thank you so much!
>
> >
> > > I also wonder if this could be handled at the vmscan level
> > > instead of
> > > the zram
> > > level. then it might potentially help other sync devices or even
> > > zswap later.
> > >
> > > But I agree that for phones, modifying zram seems like an easier
> > > starting
> > > point. However, relying on a single thread isn't always the best
> > > approach.
> > >
> > >
> > > > +
> > > > diff --git a/drivers/block/zram/Makefile
> > > > b/drivers/block/zram/Makefile
> > > > index 0fdefd576691..23baa5dfceb9 100644
> > > > --- a/drivers/block/zram/Makefile
> > > > +++ b/drivers/block/zram/Makefile
> > > > @@ -9,4 +9,5 @@ zram-$(CONFIG_ZRAM_BACKEND_ZSTD) +=
> > > > backend_zstd.o
> > > > zram-$(CONFIG_ZRAM_BACKEND_DEFLATE) += backend_deflate.o
> > > > zram-$(CONFIG_ZRAM_BACKEND_842) +=
> > > > backend_842.o
> > > >
> > > > -obj-$(CONFIG_ZRAM) += zram.o
> > > > +obj-$(CONFIG_ZRAM) += zram.o
> > > > +obj-$(CONFIG_KCOMPRESSD) += kcompressd.o
> > > > diff --git a/drivers/block/zram/kcompressd.c
> > > > b/drivers/block/zram/kcompressd.c
> > > > new file mode 100644
> > > > index 000000000000..195b7e386869
> > > > --- /dev/null
> > > > +++ b/drivers/block/zram/kcompressd.c
> > > > @@ -0,0 +1,340 @@
> > > > +// SPDX-License-Identifier: GPL-2.0
> > > > +/*
> > > > + * Copyright (C) 2024 MediaTek Inc.
> > > > + */
> > > > +
> > > > +#include <linux/module.h>
> > > > +#include <linux/kernel.h>
> > > > +#include <linux/bio.h>
> > > > +#include <linux/bitops.h>
> > > > +#include <linux/freezer.h>
> > > > +#include <linux/kernel.h>
> > > > +#include <linux/psi.h>
> > > > +#include <linux/kfifo.h>
> > > > +#include <linux/swap.h>
> > > > +#include <linux/delay.h>
> > > > +
> > > > +#include "kcompressd.h"
> > > > +
> > > > +#define INIT_QUEUE_SIZE 4096
> > > > +#define DEFAULT_NR_KCOMPRESSD 4
> > > > +
> > > > +static atomic_t enable_kcompressd;
> > > > +static unsigned int nr_kcompressd;
> > > > +static unsigned int queue_size_per_kcompressd;
> > > > +static struct kcompress *kcompress;
> > > > +
> > > > +enum run_state {
> > > > + KCOMPRESSD_NOT_STARTED = 0,
> > > > + KCOMPRESSD_RUNNING,
> > > > + KCOMPRESSD_SLEEPING,
> > > > +};
> > > > +
> > > > +struct kcompressd_para {
> > > > + wait_queue_head_t *kcompressd_wait;
> > > > + struct kfifo *write_fifo;
> > > > + atomic_t *running;
> > > > +};
> > > > +
> > > > +static struct kcompressd_para *kcompressd_para;
> > > > +static BLOCKING_NOTIFIER_HEAD(kcompressd_notifier_list);
> > > > +
> > > > +struct write_work {
> > > > + void *mem;
> > > > + struct bio *bio;
> > > > + compress_callback cb;
> > > > +};
> > > > +
> > > > +int kcompressd_enabled(void)
> > > > +{
> > > > + return likely(atomic_read(&enable_kcompressd));
> > > > +}
> > > > +EXPORT_SYMBOL(kcompressd_enabled);
> > > > +
> > > > +static void kcompressd_try_to_sleep(struct kcompressd_para *p)
> > > > +{
> > > > + DEFINE_WAIT(wait);
> > > > +
> > > > + if (!kfifo_is_empty(p->write_fifo))
> > > > + return;
> > > > +
> > > > + if (freezing(current) || kthread_should_stop())
> > > > + return;
> > > > +
> > > > + atomic_set(p->running, KCOMPRESSD_SLEEPING);
> > > > + prepare_to_wait(p->kcompressd_wait, &wait,
> > > > TASK_INTERRUPTIBLE);
> > > > +
> > > > + /*
> > > > + * After a short sleep, check if it was a premature
> > > > sleep.
> > > > If not, then
> > > > + * go fully to sleep until explicitly woken up.
> > > > + */
> > > > + if (!kthread_should_stop() && kfifo_is_empty(p-
> > > > > write_fifo))
> > > > + schedule();
> > > > +
> > > > + finish_wait(p->kcompressd_wait, &wait);
> > > > + atomic_set(p->running, KCOMPRESSD_RUNNING);
> > > > +}
> > > > +
> > > > +static int kcompressd(void *para)
> > > > +{
> > > > + struct task_struct *tsk = current;
> > > > + struct kcompressd_para *p = (struct kcompressd_para
> > > > *)para;
> > > > +
> > > > + tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
> > > > + set_freezable();
> > > > +
> > > > + while (!kthread_should_stop()) {
> > > > + bool ret;
> > > > +
> > > > + kcompressd_try_to_sleep(p);
> > > > + ret = try_to_freeze();
> > > > + if (kthread_should_stop())
> > > > + break;
> > > > +
> > > > + if (ret)
> > > > + continue;
> > > > +
> > > > + while (!kfifo_is_empty(p->write_fifo)) {
> > > > + struct write_work entry;
> > > > +
> > > > + if (sizeof(struct write_work) ==
> > > > kfifo_out(p->write_fifo,
> > > > + &entry,
> > > > sizeof(struct write_work))) {
> > > > + entry.cb(entry.mem, entry.bio);
> > > > + bio_put(entry.bio);
> > > > + }
> > > > + }
> > > > +
> > > > + }
> > > > +
> > > > + tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
> > > > + atomic_set(p->running, KCOMPRESSD_NOT_STARTED);
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static int init_write_queue(void)
> > > > +{
> > > > + int i;
> > > > + unsigned int queue_len = queue_size_per_kcompressd *
> > > > sizeof(struct write_work);
> > > > +
> > > > + for (i = 0; i < nr_kcompressd; i++) {
> > > > + if (kfifo_alloc(&kcompress[i].write_fifo,
> > > > + queue_len, GFP_KERNEL))
> > > > {
> > > > + pr_err("Failed to alloc kfifo %d\n",
> > > > i);
> > > > + return -ENOMEM;
> > > > + }
> > > > + }
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static void clean_bio_queue(int idx)
> > > > +{
> > > > + struct write_work entry;
> > > > +
> > > > + while (sizeof(struct write_work) ==
> > > > kfifo_out(&kcompress[idx].write_fifo,
> > > > + &entry, sizeof(struct
> > > > write_work)))
> > > > {
> > > > + bio_put(entry.bio);
> > > > + entry.cb(entry.mem, entry.bio);
> > > > + }
> > > > + kfifo_free(&kcompress[idx].write_fifo);
> > > > +}
> > > > +
> > > > +static int kcompress_update(void)
> > > > +{
> > > > + int i;
> > > > + int ret;
> > > > +
> > > > + kcompress = kvmalloc_array(nr_kcompressd, sizeof(struct
> > > > kcompress), GFP_KERNEL);
> > > > + if (!kcompress)
> > > > + return -ENOMEM;
> > > > +
> > > > + kcompressd_para = kvmalloc_array(nr_kcompressd,
> > > > sizeof(struct kcompressd_para), GFP_KERNEL);
> > > > + if (!kcompressd_para)
> > > > + return -ENOMEM;
> > > > +
> > > > + ret = init_write_queue();
> > > > + if (ret) {
> > > > + pr_err("Initialization of writing to FIFOs
> > > > failed!!\n");
> > > > + return ret;
> > > > + }
> > > > +
> > > > + for (i = 0; i < nr_kcompressd; i++) {
> > > > +
> > > > init_waitqueue_head(&kcompress[i].kcompressd_wait);
> > > > + kcompressd_para[i].kcompressd_wait =
> > > > &kcompress[i].kcompressd_wait;
> > > > + kcompressd_para[i].write_fifo =
> > > > &kcompress[i].write_fifo;
> > > > + kcompressd_para[i].running =
> > > > &kcompress[i].running;
> > > > + }
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static void stop_all_kcompressd_thread(void)
> > > > +{
> > > > + int i;
> > > > +
> > > > + for (i = 0; i < nr_kcompressd; i++) {
> > > > + kthread_stop(kcompress[i].kcompressd);
> > > > + kcompress[i].kcompressd = NULL;
> > > > + clean_bio_queue(i);
> > > > + }
> > > > +}
> > > > +
> > > > +static int do_nr_kcompressd_handler(const char *val,
> > > > + const struct kernel_param *kp)
> > > > +{
> > > > + int ret;
> > > > +
> > > > + atomic_set(&enable_kcompressd, false);
> > > > +
> > > > + stop_all_kcompressd_thread();
> > > > +
> > > > + ret = param_set_int(val, kp);
> > > > + if (!ret) {
> > > > + pr_err("Invalid number of kcompressd.\n");
> > > > + return -EINVAL;
> > > > + }
> > > > +
> > > > + ret = init_write_queue();
> > > > + if (ret) {
> > > > + pr_err("Initialization of writing to FIFOs
> > > > failed!!\n");
> > > > + return ret;
> > > > + }
> > > > +
> > > > + atomic_set(&enable_kcompressd, true);
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static const struct kernel_param_ops
> > > > param_ops_change_nr_kcompressd = {
> > > > + .set = &do_nr_kcompressd_handler,
> > > > + .get = &param_get_uint,
> > > > + .free = NULL,
> > > > +};
> > > > +
> > > > +module_param_cb(nr_kcompressd,
> > > > &param_ops_change_nr_kcompressd,
> > > > + &nr_kcompressd, 0644);
> > > > +MODULE_PARM_DESC(nr_kcompressd, "Number of pre-created daemon
> > > > for
> > > > page compression");
> > > > +
> > > > +static int do_queue_size_per_kcompressd_handler(const char
> > > > *val,
> > > > + const struct kernel_param *kp)
> > > > +{
> > > > + int ret;
> > > > +
> > > > + atomic_set(&enable_kcompressd, false);
> > > > +
> > > > + stop_all_kcompressd_thread();
> > > > +
> > > > + ret = param_set_int(val, kp);
> > > > + if (!ret) {
> > > > + pr_err("Invalid queue size for kcompressd.\n");
> > > > + return -EINVAL;
> > > > + }
> > > > +
> > > > + ret = init_write_queue();
> > > > + if (ret) {
> > > > + pr_err("Initialization of writing to FIFOs
> > > > failed!!\n");
> > > > + return ret;
> > > > + }
> > > > +
> > > > + pr_info("Queue size for kcompressd was changed: %d\n",
> > > > queue_size_per_kcompressd);
> > > > +
> > > > + atomic_set(&enable_kcompressd, true);
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static const struct kernel_param_ops
> > > > param_ops_change_queue_size_per_kcompressd = {
> > > > + .set = &do_queue_size_per_kcompressd_handler,
> > > > + .get = &param_get_uint,
> > > > + .free = NULL,
> > > > +};
> > > > +
> > > > +module_param_cb(queue_size_per_kcompressd,
> > > > &param_ops_change_queue_size_per_kcompressd,
> > > > + &queue_size_per_kcompressd, 0644);
> > > > +MODULE_PARM_DESC(queue_size_per_kcompressd,
> > > > + "Size of queue for kcompressd");
> > > > +
> > > > +int schedule_bio_write(void *mem, struct bio *bio,
> > > > compress_callback cb)
> > > > +{
> > > > + int i;
> > > > + bool submit_success = false;
> > > > + size_t sz_work = sizeof(struct write_work);
> > > > +
> > > > + struct write_work entry = {
> > > > + .mem = mem,
> > > > + .bio = bio,
> > > > + .cb = cb
> > > > + };
> > > > +
> > > > + if (unlikely(!atomic_read(&enable_kcompressd)))
> > > > + return -EBUSY;
> > > > +
> > > > + if (!nr_kcompressd || !current_is_kswapd())
> > > > + return -EBUSY;
> > > > +
> > > > + bio_get(bio);
> > > > +
> > > > + for (i = 0; i < nr_kcompressd; i++) {
> > > > + submit_success =
> > > > + (kfifo_avail(&kcompress[i].write_fifo)
> > > > >=
> > > > sz_work) &&
> > > > + (sz_work ==
> > > > kfifo_in(&kcompress[i].write_fifo, &entry, sz_work));
> > > > +
> > > > + if (submit_success) {
> > > > + switch
> > > > (atomic_read(&kcompress[i].running))
> > > > {
> > > > + case KCOMPRESSD_NOT_STARTED:
> > > > +
> > > > atomic_set(&kcompress[i].running,
> > > > KCOMPRESSD_RUNNING);
> > > > + kcompress[i].kcompressd =
> > > > kthread_run(kcompressd,
> > > > +
> > > > &kcompressd_para[i], "kcompressd:%d", i);
> > > > + if
> > > > (IS_ERR(kcompress[i].kcompressd)) {
> > > > +
> > > > atomic_set(&kcompress[i].running, KCOMPRESSD_NOT_STARTED);
> > > > + pr_warn("Failed to
> > > > start
> > > > kcompressd:%d\n", i);
> > > > + clean_bio_queue(i);
> > > > + }
> > > > + break;
> > > > + case KCOMPRESSD_RUNNING:
> > > > + break;
> > > > + case KCOMPRESSD_SLEEPING:
> > > > +
> > > > wake_up_interruptible(&kcompress[i].kcompressd_wait);
> > > > + break;
> > > > + }
> > > > + return 0;
> > > > + }
> > > > + }
> > > > +
> > > > + bio_put(bio);
> > > > + return -EBUSY;
> > > > +}
> > > > +EXPORT_SYMBOL(schedule_bio_write);
> > > > +
> > > > +static int __init kcompressd_init(void)
> > > > +{
> > > > + int ret;
> > > > +
> > > > + nr_kcompressd = DEFAULT_NR_KCOMPRESSD;
> > > > + queue_size_per_kcompressd = INIT_QUEUE_SIZE;
> > > > +
> > > > + ret = kcompress_update();
> > > > + if (ret) {
> > > > + pr_err("Init kcompressd failed!\n");
> > > > + return ret;
> > > > + }
> > > > +
> > > > + atomic_set(&enable_kcompressd, true);
> > > > + blocking_notifier_call_chain(&kcompressd_notifier_list,
> > > > 0,
> > > > NULL);
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static void __exit kcompressd_exit(void)
> > > > +{
> > > > + atomic_set(&enable_kcompressd, false);
> > > > + stop_all_kcompressd_thread();
> > > > +
> > > > + kvfree(kcompress);
> > > > + kvfree(kcompressd_para);
> > > > +}
> > > > +
> > > > +module_init(kcompressd_init);
> > > > +module_exit(kcompressd_exit);
> > > > +
> > > > +MODULE_LICENSE("Dual BSD/GPL");
> > > > +MODULE_AUTHOR("Qun-Wei Lin <qun-wei.lin@mediatek.com>");
> > > > +MODULE_DESCRIPTION("Separate the page compression from the
> > > > memory
> > > > reclaiming");
> > > > +
> > > > diff --git a/drivers/block/zram/kcompressd.h
> > > > b/drivers/block/zram/kcompressd.h
> > > > new file mode 100644
> > > > index 000000000000..2fe0b424a7af
> > > > --- /dev/null
> > > > +++ b/drivers/block/zram/kcompressd.h
> > > > @@ -0,0 +1,25 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > +/*
> > > > + * Copyright (C) 2024 MediaTek Inc.
> > > > + */
> > > > +
> > > > +#ifndef _KCOMPRESSD_H_
> > > > +#define _KCOMPRESSD_H_
> > > > +
> > > > +#include <linux/rwsem.h>
> > > > +#include <linux/kfifo.h>
> > > > +#include <linux/atomic.h>
> > > > +
> > > > +typedef void (*compress_callback)(void *mem, struct bio *bio);
> > > > +
> > > > +struct kcompress {
> > > > + struct task_struct *kcompressd;
> > > > + wait_queue_head_t kcompressd_wait;
> > > > + struct kfifo write_fifo;
> > > > + atomic_t running;
> > > > +};
> > > > +
> > > > +int kcompressd_enabled(void);
> > > > +int schedule_bio_write(void *mem, struct bio *bio,
> > > > compress_callback cb);
> > > > +#endif
> > > > +
> > > > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > > > index 2e1a70f2f4bd..bcd63ecb6ff2 100644
> > > > --- a/drivers/block/zram/zram_drv.c
> > > > +++ b/drivers/block/zram/zram_drv.c
> > > > @@ -35,6 +35,7 @@
> > > > #include <linux/part_stat.h>
> > > > #include <linux/kernel_read_file.h>
> > > >
> > > > +#include "kcompressd.h"
> > > > #include "zram_drv.h"
> > > >
> > > > static DEFINE_IDR(zram_index_idr);
> > > > @@ -2240,6 +2241,15 @@ static void zram_bio_write(struct zram *zram, struct bio *bio)
> > > > bio_endio(bio);
> > > > }
> > > >
> > > > +#if IS_ENABLED(CONFIG_KCOMPRESSD)
> > > > +static void zram_bio_write_callback(void *mem, struct bio *bio)
> > > > +{
> > > > +	struct zram *zram = (struct zram *)mem;
> > > > +
> > > > +	zram_bio_write(zram, bio);
> > > > +}
> > > > +#endif
> > > > +
> > > > /*
> > > > * Handler function for all zram I/O requests.
> > > > */
> > > > @@ -2252,6 +2262,10 @@ static void zram_submit_bio(struct bio *bio)
> > > > zram_bio_read(zram, bio);
> > > > break;
> > > > case REQ_OP_WRITE:
> > > > +#if IS_ENABLED(CONFIG_KCOMPRESSD)
> > > > +		if (kcompressd_enabled() &&
> > > > +		    !schedule_bio_write(zram, bio, zram_bio_write_callback))
> > > > +			break;
> > > > +#endif
> > > > zram_bio_write(zram, bio);
> > > > break;
> > > > case REQ_OP_DISCARD:
> > > > @@ -2535,9 +2549,11 @@ static int zram_add(void)
> > > > #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
> > > > .max_write_zeroes_sectors = UINT_MAX,
> > > > #endif
> > > > -		.features = BLK_FEAT_STABLE_WRITES |
> > > > -			    BLK_FEAT_READ_SYNCHRONOUS |
> > > > -			    BLK_FEAT_WRITE_SYNCHRONOUS,
> > > > +		.features = BLK_FEAT_STABLE_WRITES
> > > > +			    | BLK_FEAT_READ_SYNCHRONOUS
> > > > +#if !IS_ENABLED(CONFIG_KCOMPRESSD)
> > > > +			    | BLK_FEAT_WRITE_SYNCHRONOUS,
> > > > +#endif
> > > > };
> > > > struct zram *zram;
> > > > int ret, device_id;
> > > > --
> > > > 2.45.2
> > > >
> > >
>
> Thanks
> Barry
Best Regards,
Qun-wei
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-07 12:01 [PATCH 0/2] Improve Zram by separating compression context from kswapd Qun-Wei Lin
2025-03-07 12:01 ` [PATCH 1/2] mm: Split BLK_FEAT_SYNCHRONOUS and SWP_SYNCHRONOUS_IO into separate read and write flags Qun-Wei Lin
2025-03-07 12:01 ` [PATCH 2/2] kcompressd: Add Kcompressd for accelerated zram compression Qun-Wei Lin
@ 2025-03-07 19:34 ` Barry Song
2025-03-10 13:21 ` Qun-wei Lin (林群崴)
2025-03-07 23:03 ` Nhat Pham
2025-03-12 18:11 ` Minchan Kim
4 siblings, 1 reply; 39+ messages in thread
From: Barry Song @ 2025-03-07 19:34 UTC (permalink / raw)
To: Qun-Wei Lin
Cc: Jens Axboe, Minchan Kim, Sergey Senozhatsky, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg, Al Viro,
linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu
On Sat, Mar 8, 2025 at 1:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
>
> This patch series introduces a new mechanism called kcompressd to
> improve the efficiency of memory reclaiming in the operating system. The
> main goal is to separate the tasks of page scanning and page compression
> into distinct processes or threads, thereby reducing the load on the
> kswapd thread and enhancing overall system performance under high memory
> pressure conditions.
>
> Problem:
> In the current system, the kswapd thread is responsible for both
> scanning the LRU pages and compressing pages into the ZRAM. This
> combined responsibility can lead to significant performance bottlenecks,
> especially under high memory pressure. The kswapd thread becomes a
> single point of contention, causing delays in memory reclaiming and
> overall system performance degradation.
>
> Target:
> The target of this invention is to improve the efficiency of memory
> reclaiming. By separating the tasks of page scanning and page
> compression into distinct processes or threads, the system can handle
> memory pressure more effectively.
Sounds great. However, we also have a time window where folios under
writeback are kept, whereas previously, writeback was done synchronously
without your patch. This may temporarily increase memory usage until the
kept folios are re-scanned.
So, you’ve observed that folio_rotate_reclaimable() runs shortly after the
async thread completes compression? And that the kept folios are soon
re-scanned?
>
> Patch 1:
> - Introduces 2 new feature flags, BLK_FEAT_READ_SYNCHRONOUS and
> SWP_READ_SYNCHRONOUS_IO.
>
> Patch 2:
> - Implemented the core functionality of Kcompressd and made necessary
> modifications to the zram driver to support it.
>
> In our handheld devices, we found that applying this mechanism under high
> memory pressure scenarios can increase the rate of pgsteal_anon per second
> by over 260% compared to the situation with only kswapd.
Sounds really great.
What compression algorithm is being used? I assume that after switching to a
different compression algorithm, the benefits will change significantly. For
example, Zstd might not show as much improvement.
How was the CPU usage ratio between page scan/unmap and compression
observed before applying this patch?
>
> Qun-Wei Lin (2):
> mm: Split BLK_FEAT_SYNCHRONOUS and SWP_SYNCHRONOUS_IO into separate
> read and write flags
> kcompressd: Add Kcompressd for accelerated zram compression
>
> drivers/block/brd.c | 3 +-
> drivers/block/zram/Kconfig | 11 ++
> drivers/block/zram/Makefile | 3 +-
> drivers/block/zram/kcompressd.c | 340 ++++++++++++++++++++++++++++++++
> drivers/block/zram/kcompressd.h | 25 +++
> drivers/block/zram/zram_drv.c | 21 +-
> drivers/nvdimm/btt.c | 3 +-
> drivers/nvdimm/pmem.c | 5 +-
> include/linux/blkdev.h | 24 ++-
> include/linux/swap.h | 31 +--
> mm/memory.c | 4 +-
> mm/page_io.c | 6 +-
> mm/swapfile.c | 7 +-
> 13 files changed, 446 insertions(+), 37 deletions(-)
> create mode 100644 drivers/block/zram/kcompressd.c
> create mode 100644 drivers/block/zram/kcompressd.h
>
> --
> 2.45.2
>
Thanks
Barry
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-07 19:34 ` [PATCH 0/2] Improve Zram by separating compression context from kswapd Barry Song
@ 2025-03-10 13:21 ` Qun-wei Lin (林群崴)
0 siblings, 0 replies; 39+ messages in thread
From: Qun-wei Lin (林群崴) @ 2025-03-10 13:21 UTC (permalink / raw)
To: 21cnbao
Cc: Andrew Yang (楊智強),
Casper Li (李中榮),
chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, dave.jiang, schatzberg.dan,
Chinwen Chang (張錦文),
viro, ryan.roberts, minchan, axboe, linux-block, kasong, nvdimm,
vishal.l.verma, matthias.bgg, linux-arm-kernel, ying.huang,
senozhatsky, dan.j.williams
On Sat, 2025-03-08 at 08:34 +1300, Barry Song wrote:
>
> On Sat, Mar 8, 2025 at 1:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com>
> wrote:
> >
> > This patch series introduces a new mechanism called kcompressd to
> > improve the efficiency of memory reclaiming in the operating system. The
> > main goal is to separate the tasks of page scanning and page compression
> > into distinct processes or threads, thereby reducing the load on the
> > kswapd thread and enhancing overall system performance under high memory
> > pressure conditions.
> >
> > Problem:
> > In the current system, the kswapd thread is responsible for both
> > scanning the LRU pages and compressing pages into the ZRAM. This
> > combined responsibility can lead to significant performance bottlenecks,
> > especially under high memory pressure. The kswapd thread becomes a
> > single point of contention, causing delays in memory reclaiming and
> > overall system performance degradation.
> >
> > Target:
> > The target of this invention is to improve the efficiency of memory
> > reclaiming. By separating the tasks of page scanning and page
> > compression into distinct processes or threads, the system can handle
> > memory pressure more effectively.
>
> Sounds great. However, we also have a time window where folios under
> writeback are kept, whereas previously, writeback was done synchronously
> without your patch. This may temporarily increase memory usage until the
> kept folios are re-scanned.
>
> So, you’ve observed that folio_rotate_reclaimable() runs shortly after
> the async thread completes compression? And that the kept folios are
> soon re-scanned?
>
Yes, these folios may need to be re-scanned, so
folio_rotate_reclaimable() will be run. This can be observed from the
increase in pgrotated in /proc/vmstat.
> >
> > Patch 1:
> > - Introduces 2 new feature flags, BLK_FEAT_READ_SYNCHRONOUS and
> > SWP_READ_SYNCHRONOUS_IO.
> >
> > Patch 2:
> > - Implemented the core functionality of Kcompressd and made necessary
> > modifications to the zram driver to support it.
> >
> > In our handheld devices, we found that applying this mechanism under high
> > memory pressure scenarios can increase the rate of pgsteal_anon per second
> > by over 260% compared to the situation with only kswapd.
>
> Sounds really great.
>
> What compression algorithm is being used? I assume that after switching
> to a different compression algorithm, the benefits will change
> significantly. For example, Zstd might not show as much improvement.
> How was the CPU usage ratio between page scan/unmap and compression
> observed before applying this patch?
>
The original tests were based on LZ4.
We have observed that the CPU time spent on scanning the LRU versus
compressing folios is split roughly 3:7.
We also tried ZSTD as the zram backend, but the number of anonymous
folios reclaimed per second did not differ significantly from LZ4 (the
difference between algorithms was far smaller than the gain from parallel
processing). Even with ZSTD, we were still able to reach around 800,000
pgsteal_anon per second using kcompressd.
> >
> > Qun-Wei Lin (2):
> > mm: Split BLK_FEAT_SYNCHRONOUS and SWP_SYNCHRONOUS_IO into
> > separate
> > read and write flags
> > kcompressd: Add Kcompressd for accelerated zram compression
> >
> > drivers/block/brd.c | 3 +-
> > drivers/block/zram/Kconfig | 11 ++
> > drivers/block/zram/Makefile | 3 +-
> > drivers/block/zram/kcompressd.c | 340
> > ++++++++++++++++++++++++++++++++
> > drivers/block/zram/kcompressd.h | 25 +++
> > drivers/block/zram/zram_drv.c | 21 +-
> > drivers/nvdimm/btt.c | 3 +-
> > drivers/nvdimm/pmem.c | 5 +-
> > include/linux/blkdev.h | 24 ++-
> > include/linux/swap.h | 31 +--
> > mm/memory.c | 4 +-
> > mm/page_io.c | 6 +-
> > mm/swapfile.c | 7 +-
> > 13 files changed, 446 insertions(+), 37 deletions(-)
> > create mode 100644 drivers/block/zram/kcompressd.c
> > create mode 100644 drivers/block/zram/kcompressd.h
> >
> > --
> > 2.45.2
> >
>
> Thanks
> Barry
Best Regards,
Qun-wei
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-07 12:01 [PATCH 0/2] Improve Zram by separating compression context from kswapd Qun-Wei Lin
` (2 preceding siblings ...)
2025-03-07 19:34 ` [PATCH 0/2] Improve Zram by separating compression context from kswapd Barry Song
@ 2025-03-07 23:03 ` Nhat Pham
2025-03-08 5:41 ` Barry Song
2025-03-12 18:11 ` Minchan Kim
4 siblings, 1 reply; 39+ messages in thread
From: Nhat Pham @ 2025-03-07 23:03 UTC (permalink / raw)
To: Qun-Wei Lin
Cc: Jens Axboe, Minchan Kim, Sergey Senozhatsky, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg,
Barry Song, Al Viro, linux-kernel, linux-block, nvdimm, linux-mm,
linux-arm-kernel, linux-mediatek, Casper Li, Chinwen Chang,
Andrew Yang, James Hsu
On Fri, Mar 7, 2025 at 4:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
>
> This patch series introduces a new mechanism called kcompressd to
> improve the efficiency of memory reclaiming in the operating system. The
> main goal is to separate the tasks of page scanning and page compression
> into distinct processes or threads, thereby reducing the load on the
> kswapd thread and enhancing overall system performance under high memory
> pressure conditions.
Please excuse my ignorance, but from your cover letter I still don't
quite get what the problem is here. And how would decoupling compression
and scanning help?
>
> Problem:
> In the current system, the kswapd thread is responsible for both
> scanning the LRU pages and compressing pages into the ZRAM. This
> combined responsibility can lead to significant performance bottlenecks,
What bottleneck are we talking about? Is one stage slower than the other?
> especially under high memory pressure. The kswapd thread becomes a
> single point of contention, causing delays in memory reclaiming and
> overall system performance degradation.
>
> Target:
> The target of this invention is to improve the efficiency of memory
> reclaiming. By separating the tasks of page scanning and page
> compression into distinct processes or threads, the system can handle
> memory pressure more effectively.
I'm not a zram maintainer, so I'm definitely not trying to stop this
patch. But whatever problem zram is facing will likely occur with
zswap too, so I'd like to learn more :)
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-07 23:03 ` Nhat Pham
@ 2025-03-08 5:41 ` Barry Song
2025-03-10 13:22 ` Qun-wei Lin (林群崴)
2025-03-11 4:58 ` Sergey Senozhatsky
0 siblings, 2 replies; 39+ messages in thread
From: Barry Song @ 2025-03-08 5:41 UTC (permalink / raw)
To: Nhat Pham
Cc: Qun-Wei Lin, Jens Axboe, Minchan Kim, Sergey Senozhatsky,
Vishal Verma, Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg, Al Viro,
linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu
On Sat, Mar 8, 2025 at 12:03 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Fri, Mar 7, 2025 at 4:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
> >
> > This patch series introduces a new mechanism called kcompressd to
> > improve the efficiency of memory reclaiming in the operating system. The
> > main goal is to separate the tasks of page scanning and page compression
> > into distinct processes or threads, thereby reducing the load on the
> > kswapd thread and enhancing overall system performance under high memory
> > pressure conditions.
>
> Please excuse my ignorance, but from your cover letter I still don't
> quite get what the problem is here. And how would decoupling compression
> and scanning help?
My understanding is as follows:
When kswapd attempts to reclaim M anonymous folios and N file folios,
the process involves the following steps:
* t1: Time to scan and unmap anonymous folios
* t2: Time to compress anonymous folios
* t3: Time to reclaim file folios
Currently, these steps are executed sequentially, meaning the total time
required to reclaim M + N folios is t1 + t2 + t3.
However, Qun-Wei's patch enables t1 + t3 and t2 to run in parallel,
reducing the total time to max(t1 + t3, t2). This likely improves the
reclamation speed, potentially reducing allocation stalls.
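As a purely illustrative example with made-up numbers: if t1 = 3s, t2 = 7s,
and t3 = 2s, the sequential path costs t1 + t2 + t3 = 12s, while the
parallel path costs max(t1 + t3, t2) = max(5s, 7s) = 7s, bounded by
whichever of the two sides is slower.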
I don’t have concrete data on this. Does Qun-Wei have detailed
performance data?
>
> >
> > Problem:
> > In the current system, the kswapd thread is responsible for both
> > scanning the LRU pages and compressing pages into the ZRAM. This
> > combined responsibility can lead to significant performance bottlenecks,
>
> What bottleneck are we talking about? Is one stage slower than the other?
>
> > especially under high memory pressure. The kswapd thread becomes a
> > single point of contention, causing delays in memory reclaiming and
> > overall system performance degradation.
> >
> > Target:
> > The target of this invention is to improve the efficiency of memory
> > reclaiming. By separating the tasks of page scanning and page
> > compression into distinct processes or threads, the system can handle
> > memory pressure more effectively.
>
> I'm not a zram maintainer, so I'm definitely not trying to stop this
> patch. But whatever problem zram is facing will likely occur with
> zswap too, so I'd like to learn more :)
Right, this is likely something that could be addressed more generally
for zswap and zram.
Thanks
Barry
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-08 5:41 ` Barry Song
@ 2025-03-10 13:22 ` Qun-wei Lin (林群崴)
2025-03-10 16:58 ` Nhat Pham
2025-03-11 4:58 ` Sergey Senozhatsky
1 sibling, 1 reply; 39+ messages in thread
From: Qun-wei Lin (林群崴) @ 2025-03-10 13:22 UTC (permalink / raw)
To: 21cnbao, nphamcs
Cc: Andrew Yang (楊智強),
Casper Li (李中榮),
chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, dave.jiang, schatzberg.dan,
Chinwen Chang (張錦文),
viro, ryan.roberts, minchan, axboe, linux-block, kasong, nvdimm,
vishal.l.verma, matthias.bgg, linux-arm-kernel, ying.huang,
senozhatsky, dan.j.williams
On Sat, 2025-03-08 at 18:41 +1300, Barry Song wrote:
>
> On Sat, Mar 8, 2025 at 12:03 PM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Fri, Mar 7, 2025 at 4:02 AM Qun-Wei Lin
> > <qun-wei.lin@mediatek.com> wrote:
> > >
> > > This patch series introduces a new mechanism called kcompressd to
> > > improve the efficiency of memory reclaiming in the operating system. The
> > > main goal is to separate the tasks of page scanning and page compression
> > > into distinct processes or threads, thereby reducing the load on the
> > > kswapd thread and enhancing overall system performance under high memory
> > > pressure conditions.
> >
> > Please excuse my ignorance, but from your cover letter I still don't
> > quite get what the problem is here. And how would decoupling compression
> > and scanning help?
>
> My understanding is as follows:
>
> When kswapd attempts to reclaim M anonymous folios and N file folios,
> the process involves the following steps:
>
> * t1: Time to scan and unmap anonymous folios
> * t2: Time to compress anonymous folios
> * t3: Time to reclaim file folios
>
> Currently, these steps are executed sequentially, meaning the total time
> required to reclaim M + N folios is t1 + t2 + t3.
>
> However, Qun-Wei's patch enables t1 + t3 and t2 to run in parallel,
> reducing the total time to max(t1 + t3, t2). This likely improves the
> reclamation speed, potentially reducing allocation stalls.
>
> I don’t have concrete data on this. Does Qun-Wei have detailed
> performance data?
>
Thank you for your explanation. Compared to the original single kswapd,
we expect t1 to have a slight increase in re-scan time. However, since
our kcompressd can focus on compression tasks and we can have multiple
kcompressd instances (kcompressd0, kcompressd1, ...) running in
parallel, we anticipate that the number of times a folio needs to be
re-scanned will stay small.
In our experiments, we fixed the CPU and DRAM at a certain frequency.
We created a high memory pressure environment using a memory eater and
recorded the increase in pgsteal_anon per second, which was around
300,000. Then we applied our patch and measured again; pgsteal_anon/s
increased to over 800,000.
> >
> > >
> > > Problem:
> > > In the current system, the kswapd thread is responsible for both
> > > scanning the LRU pages and compressing pages into the ZRAM. This
> > > combined responsibility can lead to significant performance
> > > bottlenecks,
> >
> > What bottleneck are we talking about? Is one stage slower than the
> > other?
> >
> > > especially under high memory pressure. The kswapd thread becomes
> > > a
> > > single point of contention, causing delays in memory reclaiming
> > > and
> > > overall system performance degradation.
> > >
> > > Target:
> > > The target of this invention is to improve the efficiency of
> > > memory
> > > reclaiming. By separating the tasks of page scanning and page
> > > compression into distinct processes or threads, the system can
> > > handle
> > > memory pressure more effectively.
> >
> > I'm not a zram maintainer, so I'm definitely not trying to stop
> > this
> > patch. But whatever problem zram is facing will likely occur with
> > zswap too, so I'd like to learn more :)
>
> Right, this is likely something that could be addressed more
> generally
> for zswap and zram.
>
Yes, we also hope to extend this to other swap devices, but currently,
we have only modified zram. We are not very familiar with zswap and
would like to ask if anyone has any suggestions for modifications?
> Thanks
> Barry
Best Regards,
Qun-wei
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-10 13:22 ` Qun-wei Lin (林群崴)
@ 2025-03-10 16:58 ` Nhat Pham
2025-03-10 17:30 ` Nhat Pham
0 siblings, 1 reply; 39+ messages in thread
From: Nhat Pham @ 2025-03-10 16:58 UTC (permalink / raw)
To: Qun-wei Lin (林群崴)
Cc: 21cnbao, Andrew Yang (楊智強),
Casper Li (李中榮),
chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, dave.jiang, schatzberg.dan,
Chinwen Chang (張錦文),
viro, ryan.roberts, minchan, axboe, linux-block, kasong, nvdimm,
vishal.l.verma, matthias.bgg, linux-arm-kernel, senozhatsky,
dan.j.williams
On Mon, Mar 10, 2025 at 6:22 AM Qun-wei Lin (林群崴)
<Qun-wei.Lin@mediatek.com> wrote:
>
>
> Thank you for your explanation. Compared to the original single kswapd,
> we expect t1 to have a slight increase in re-scan time. However, since
> our kcompressd can focus on compression tasks and we can have multiple
> kcompressd instances (kcompressd0, kcompressd1, ...) running in
> parallel, we anticipate that the number of times a folio needs to be
> re-scanned will stay small.
>
> In our experiments, we fixed the CPU and DRAM at a certain frequency.
> We created a high memory pressure environment using a memory eater and
> recorded the increase in pgsteal_anon per second, which was around
> 300,000. Then we applied our patch and measured again; pgsteal_anon/s
> increased to over 800,000.
>
> > >
> > > >
> > > > Problem:
> > > > In the current system, the kswapd thread is responsible for both
> > > > scanning the LRU pages and compressing pages into the ZRAM. This
> > > > combined responsibility can lead to significant performance bottlenecks,
> > >
> > > What bottleneck are we talking about? Is one stage slower than the
> > > other?
> > >
> > > > especially under high memory pressure. The kswapd thread becomes a
> > > > single point of contention, causing delays in memory reclaiming and
> > > > overall system performance degradation.
> > > >
> > > > Target:
> > > > The target of this invention is to improve the efficiency of memory
> > > > reclaiming. By separating the tasks of page scanning and page
> > > > compression into distinct processes or threads, the system can handle
> > > > memory pressure more effectively.
> > >
> > > I'm not a zram maintainer, so I'm definitely not trying to stop this
> > > patch. But whatever problem zram is facing will likely occur with
> > > zswap too, so I'd like to learn more :)
> >
> > Right, this is likely something that could be addressed more generally
> > for zswap and zram.
> >
>
> Yes, we also hope to extend this to other swap devices, but currently,
> we have only modified zram. We are not very familiar with zswap and
> would like to ask if anyone has any suggestions for modifications?
>
My understanding is right now schedule_bio_write is the work
submission API, right? We can make it generic, having it accept a
callback and a generic untyped pointer which can be cast into a
backend-specific context struct. For zram it would contain struct zram
and the bio. For zswap, depending on at which point you want to
begin offloading the work - it could simply be just the folio itself
if we offload early, or a more complicated scheme.
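A minimal sketch of what I mean, assuming the patch's kfifo queueing
stays as-is; kcompressd_submit() and struct kcompressd_work below are
made-up names for illustration, not an existing API:

typedef void (*kcompressd_cb)(void *ctx);

struct kcompressd_work {
	kcompressd_cb cb;	/* backend-specific completion callback */
	void *ctx;		/* opaque context, cast by the backend */
};

/*
 * zram would pass a ctx carrying { struct zram *, struct bio * };
 * zswap could pass the folio itself if it offloads early.
 */
int kcompressd_submit(kcompressd_cb cb, void *ctx);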
> > Thanks
> > Barry
>
> Best Regards,
> Qun-wei
>
>
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-10 16:58 ` Nhat Pham
@ 2025-03-10 17:30 ` Nhat Pham
0 siblings, 0 replies; 39+ messages in thread
From: Nhat Pham @ 2025-03-10 17:30 UTC (permalink / raw)
To: Qun-wei Lin (林群崴)
Cc: 21cnbao, Andrew Yang (楊智強),
Casper Li (李中榮),
chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, dave.jiang, schatzberg.dan,
Chinwen Chang (張錦文),
viro, ryan.roberts, minchan, axboe, linux-block, kasong, nvdimm,
vishal.l.verma, matthias.bgg, linux-arm-kernel, senozhatsky,
dan.j.williams
On Mon, Mar 10, 2025 at 9:58 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Mon, Mar 10, 2025 at 6:22 AM Qun-wei Lin (林群崴)
> <Qun-wei.Lin@mediatek.com> wrote:
> >
> >
> > Thank you for your explanation. Compared to the original single kswapd,
> > we expect t1 to have a slight increase in re-scan time. However, since
> > our kcompressd can focus on compression tasks and we can have multiple
> > kcompressd instances (kcompressd0, kcompressd1, ...) running in
> > parallel, we anticipate that the number of times a folio needs to be
> > re-scanned will stay small.
> >
> > In our experiments, we fixed the CPU and DRAM at a certain frequency.
> > We created a high memory pressure environment using a memory eater and
> > recorded the increase in pgsteal_anon per second, which was around
> > 300,000. Then we applied our patch and measured again; pgsteal_anon/s
> > increased to over 800,000.
> >
> > > >
> > > > >
> > > > > Problem:
> > > > > In the current system, the kswapd thread is responsible for both
> > > > > scanning the LRU pages and compressing pages into the ZRAM. This
> > > > > combined responsibility can lead to significant performance bottlenecks,
> > > >
> > > > What bottleneck are we talking about? Is one stage slower than the
> > > > other?
> > > >
> > > > > especially under high memory pressure. The kswapd thread becomes a
> > > > > single point of contention, causing delays in memory reclaiming and
> > > > > overall system performance degradation.
> > > > >
> > > > > Target:
> > > > > The target of this invention is to improve the efficiency of memory
> > > > > reclaiming. By separating the tasks of page scanning and page
> > > > > compression into distinct processes or threads, the system can handle
> > > > > memory pressure more effectively.
> > > >
> > > > I'm not a zram maintainer, so I'm definitely not trying to stop this
> > > > patch. But whatever problem zram is facing will likely occur with
> > > > zswap too, so I'd like to learn more :)
> > >
> > > Right, this is likely something that could be addressed more generally
> > > for zswap and zram.
> > >
> >
> > Yes, we also hope to extend this to other swap devices, but currently,
> > we have only modified zram. We are not very familiar with zswap and
> > would like to ask if anyone has any suggestions for modifications?
> >
>
> My understanding is right now schedule_bio_write is the work
> submission API right? We can make it generic, having it accept a
> callback and a generic untyped pointer which can be casted into a
> backend-specific context struct. For zram it would contain struct zram
> and the bio. For zswap, depending on at which point do you want to
> begin offloading the work - it could simply be just the folio itself
> if we offload early, or a more complicated scheme.
To expand a bit - zswap_store() is where all the logic lives. It's
fairly straightforward: check zswap cgroup limits, acquire the
zswap pool (a combination of compression algorithm and backend memory
allocator, which is just zsmalloc now), perform compression, then ask
for a slot from zsmalloc and store it there.
You can probably just offload the whole thing here, or perform some
steps of the sequence before offloading the rest :) One slight
complication: don't forget to fall back to disk swapping - unlike
zram, zswap is originally designed as a "cache" for underlying swap
files on disk, which we can fall back to if the compression attempt
fails. Everything should be fairly straightforward though :)
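A rough sketch only, reusing the kcompressd_submit() sketched in my
previous mail (zswap_store() is real, but the callback and the disk
fallback helper below are hypothetical, and the real store path has
more steps and locking than shown):

static void zswap_compress_cb(void *ctx)
{
	struct folio *folio = ctx;

	/* runs in the compression thread; fall back to the underlying
	 * swapfile if the compressed store fails */
	if (!zswap_store(folio))
		zswap_store_fallback(folio);	/* hypothetical */
}

/* reclaim side: hand the folio off instead of compressing inline */
if (kcompressd_submit(zswap_compress_cb, folio) == 0)
	return;		/* queued; reclaim moves on to the next folio */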
>
>
>
> > > Thanks
> > > Barry
> >
> > Best Regards,
> > Qun-wei
> >
> >
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-08 5:41 ` Barry Song
2025-03-10 13:22 ` Qun-wei Lin (林群崴)
@ 2025-03-11 4:58 ` Sergey Senozhatsky
2025-03-11 9:33 ` Barry Song
1 sibling, 1 reply; 39+ messages in thread
From: Sergey Senozhatsky @ 2025-03-11 4:58 UTC (permalink / raw)
To: Qun-Wei Lin
Cc: Nhat Pham, Barry Song, Jens Axboe, Minchan Kim,
Sergey Senozhatsky, Vishal Verma, Dan Williams, Dave Jiang,
Ira Weiny, Andrew Morton, Matthias Brugger,
AngeloGioacchino Del Regno, Chris Li, Ryan Roberts, Huang, Ying,
Kairui Song, Dan Schatzberg, Al Viro, linux-kernel, linux-block,
nvdimm, linux-mm, linux-arm-kernel, linux-mediatek, Casper Li,
Chinwen Chang, Andrew Yang, James Hsu
On (25/03/08 18:41), Barry Song wrote:
> On Sat, Mar 8, 2025 at 12:03 PM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Fri, Mar 7, 2025 at 4:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
> > >
> > > This patch series introduces a new mechanism called kcompressd to
> > > improve the efficiency of memory reclaiming in the operating system. The
> > > main goal is to separate the tasks of page scanning and page compression
> > > into distinct processes or threads, thereby reducing the load on the
> > > kswapd thread and enhancing overall system performance under high memory
> > > pressure conditions.
> >
> > Please excuse my ignorance, but from your cover letter I still don't
> > quite get what the problem is here. And how would decoupling compression
> > and scanning help?
>
> My understanding is as follows:
>
> When kswapd attempts to reclaim M anonymous folios and N file folios,
> the process involves the following steps:
>
> * t1: Time to scan and unmap anonymous folios
> * t2: Time to compress anonymous folios
> * t3: Time to reclaim file folios
>
> Currently, these steps are executed sequentially, meaning the total time
> required to reclaim M + N folios is t1 + t2 + t3.
>
> However, Qun-Wei's patch enables t1 + t3 and t2 to run in parallel,
> reducing the total time to max(t1 + t3, t2). This likely improves the
> reclamation speed, potentially reducing allocation stalls.
If compression kthread-s can run (have CPUs to be scheduled on).
This looks a bit like a bottleneck. Is there anything that
guarantees forward progress? Also, if compression kthreads
constantly preempt kswapd, then it might not be worth it to
have compression kthreads, I assume?
If we have a pagefault and need to map a page that is still in
the compression queue (not compressed and stored in zram yet, e.g.
due to scheduling latency + slow compression algorithm) then what
happens?
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-11 4:58 ` Sergey Senozhatsky
@ 2025-03-11 9:33 ` Barry Song
2025-03-11 14:12 ` Qun-wei Lin (林群崴)
0 siblings, 1 reply; 39+ messages in thread
From: Barry Song @ 2025-03-11 9:33 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Qun-Wei Lin, Nhat Pham, Jens Axboe, Minchan Kim, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg, Al Viro,
linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu
On Tue, Mar 11, 2025 at 5:58 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> On (25/03/08 18:41), Barry Song wrote:
> > On Sat, Mar 8, 2025 at 12:03 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Fri, Mar 7, 2025 at 4:02 AM Qun-Wei Lin <qun-wei.lin@mediatek.com> wrote:
> > > >
> > > > This patch series introduces a new mechanism called kcompressd to
> > > > improve the efficiency of memory reclaiming in the operating system. The
> > > > main goal is to separate the tasks of page scanning and page compression
> > > > into distinct processes or threads, thereby reducing the load on the
> > > > kswapd thread and enhancing overall system performance under high memory
> > > > pressure conditions.
> > >
> > > Please excuse my ignorance, but from your cover letter I still don't
> > > quite get what the problem is here. And how would decoupling compression
> > > and scanning help?
> >
> > My understanding is as follows:
> >
> > When kswapd attempts to reclaim M anonymous folios and N file folios,
> > the process involves the following steps:
> >
> > * t1: Time to scan and unmap anonymous folios
> > * t2: Time to compress anonymous folios
> > * t3: Time to reclaim file folios
> >
> > Currently, these steps are executed sequentially, meaning the total time
> > required to reclaim M + N folios is t1 + t2 + t3.
> >
> > However, Qun-Wei's patch enables t1 + t3 and t2 to run in parallel,
> > reducing the total time to max(t1 + t3, t2). This likely improves the
> > reclamation speed, potentially reducing allocation stalls.
>
> If compression kthread-s can run (have CPUs to be scheduled on).
> This looks a bit like a bottleneck. Is there anything that
> guarantees forward progress? Also, if compression kthreads
> constantly preempt kswapd, then it might not be worth it to
> have compression kthreads, I assume?
Thanks for your critical insights, all of which are valuable.
Qun-Wei is likely working on an Android case where the CPU is
relatively idle in many scenarios (though there are certainly cases
where all CPUs are busy), but free memory is quite limited.
We may soon see benefits for these types of use cases. I expect
Android might have the opportunity to adopt it before it's fully
ready upstream.
If the workload keeps all CPUs busy, I suppose this async thread
won’t help, but at least we might find a way to mitigate regression.
We likely need to collect more data on various scenarios—when
CPUs are relatively idle and when all CPUs are busy—and
determine the proper approach based on the data, which we
currently lack :-)
>
> If we have a pagefault and need to map a page that is still in
> the compression queue (not compressed and stored in zram yet, e.g.
> due to scheduling latency + slow compression algorithm) then what
> happens?
This happens even without the patch, right? Right now we have
4 steps:
1. add_to_swap: The folio is added to the swapcache.
2. try_to_unmap: PTEs are converted to swap entries.
3. pageout: The folio is written back.
4. Swapcache is cleared.
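In the reclaim path these correspond roughly to (heavily condensed,
only the function names are real):

/*
 * shrink_folio_list(), roughly:
 *   1. add_to_swap()       -> folio enters the swapcache
 *   2. try_to_unmap()      -> PTEs become swap entries
 *   3. pageout()           -> swap_writepage() -> compression/IO
 *   4. __remove_mapping()  -> swapcache entry dropped after writeback
 * A refault between 2 and 4 hits the swapcache and remaps the folio.
 */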
If a swap-in occurs between 2 and 4, doesn't that mean
we've already encountered the case where we hit
the swapcache for a folio undergoing compression?
It seems we might have an opportunity to terminate
compression if the request is still in the queue and
compression hasn’t started for a folio yet? That seems
quite difficult to do, though.
Thanks
Barry
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-11 9:33 ` Barry Song
@ 2025-03-11 14:12 ` Qun-wei Lin (林群崴)
2025-03-12 5:19 ` Sergey Senozhatsky
0 siblings, 1 reply; 39+ messages in thread
From: Qun-wei Lin (林群崴) @ 2025-03-11 14:12 UTC (permalink / raw)
To: 21cnbao, senozhatsky
Cc: Chinwen Chang (張錦文),
Andrew Yang (楊智強),
Casper Li (李中榮),
nphamcs, chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, dave.jiang, vishal.l.verma, schatzberg.dan,
viro, ryan.roberts, minchan, axboe, linux-block, kasong, nvdimm,
linux-arm-kernel, matthias.bgg, ying.huang, dan.j.williams
On Tue, 2025-03-11 at 22:33 +1300, Barry Song wrote:
>
> On Tue, Mar 11, 2025 at 5:58 PM Sergey Senozhatsky
> <senozhatsky@chromium.org> wrote:
> >
> > On (25/03/08 18:41), Barry Song wrote:
> > > On Sat, Mar 8, 2025 at 12:03 PM Nhat Pham <nphamcs@gmail.com>
> > > wrote:
> > > >
> > > > On Fri, Mar 7, 2025 at 4:02 AM Qun-Wei Lin
> > > > <qun-wei.lin@mediatek.com> wrote:
> > > > >
> > > > > This patch series introduces a new mechanism called kcompressd to
> > > > > improve the efficiency of memory reclaiming in the operating system.
> > > > > The main goal is to separate the tasks of page scanning and page
> > > > > compression into distinct processes or threads, thereby reducing the
> > > > > load on the kswapd thread and enhancing overall system performance
> > > > > under high memory pressure conditions.
> > > >
> > > > Please excuse my ignorance, but from your cover letter I still don't
> > > > quite get what the problem is here. And how would decoupling
> > > > compression and scanning help?
> > >
> > > My understanding is as follows:
> > >
> > > When kswapd attempts to reclaim M anonymous folios and N file folios,
> > > the process involves the following steps:
> > >
> > > * t1: Time to scan and unmap anonymous folios
> > > * t2: Time to compress anonymous folios
> > > * t3: Time to reclaim file folios
> > >
> > > Currently, these steps are executed sequentially, meaning the total
> > > time required to reclaim M + N folios is t1 + t2 + t3.
> > >
> > > However, Qun-Wei's patch enables t1 + t3 and t2 to run in parallel,
> > > reducing the total time to max(t1 + t3, t2). This likely improves the
> > > reclamation speed, potentially reducing allocation stalls.
> >
> > If compression kthread-s can run (have CPUs to be scheduled on).
> > This looks a bit like a bottleneck. Is there anything that
> > guarantees forward progress? Also, if compression kthreads
> > constantly preempt kswapd, then it might not be worth it to
> > have compression kthreads, I assume?
>
> Thanks for your critical insights, all of which are valuable.
>
> Qun-Wei is likely working on an Android case where the CPU is
> relatively idle in many scenarios (though there are certainly cases
> where all CPUs are busy), but free memory is quite limited.
> We may soon see benefits for these types of use cases. I expect
> Android might have the opportunity to adopt it before it's fully
> ready upstream.
>
> If the workload keeps all CPUs busy, I suppose this async thread
> won’t help, but at least we might find a way to mitigate regression.
>
> We likely need to collect more data on various scenarios—when
> CPUs are relatively idle and when all CPUs are busy—and
> determine the proper approach based on the data, which we
> currently lack :-)
>
Thanks for the explanation!
> >
> > If we have a pagefault and need to map a page that is still in
> > the compression queue (not compressed and stored in zram yet, e.g.
> > due to scheduling latency + slow compression algorithm) then what
> > happens?
>
> This happens even without the patch, right? Right now we have
> 4 steps:
> 1. add_to_swap: The folio is added to the swapcache.
> 2. try_to_unmap: PTEs are converted to swap entries.
> 3. pageout: The folio is written back.
> 4. Swapcache is cleared.
>
> If a swap-in occurs between 2 and 4, doesn't that mean
> we've already encountered the case where we hit
> the swapcache for a folio undergoing compression?
>
> It seems we might have an opportunity to terminate
> compression if the request is still in the queue and
> compression hasn’t started for a folio yet? That seems
> quite difficult to do, though.
As Barry explained, these folios that are being compressed are in the
swapcache. If a refault occurs during the compression process, its
correctness is already guaranteed by the swap subsystem (similar to
other asynchronous swap devices).
Indeed, terminating a folio that is already in the queue waiting for
compression is a challenging task. Will this require some modifications
to the current architecture of the swap subsystem?
>
> Thanks
> Barry
Best Regards,
Qun-wei
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-11 14:12 ` Qun-wei Lin (林群崴)
@ 2025-03-12 5:19 ` Sergey Senozhatsky
0 siblings, 0 replies; 39+ messages in thread
From: Sergey Senozhatsky @ 2025-03-12 5:19 UTC (permalink / raw)
To: Qun-wei Lin (林群崴)
Cc: 21cnbao, senozhatsky, Chinwen Chang (張錦文),
Andrew Yang (楊智強),
Casper Li (李中榮),
nphamcs, chrisl, James Hsu (徐慶薰),
AngeloGioacchino Del Regno, akpm, linux-kernel, linux-mediatek,
ira.weiny, linux-mm, dave.jiang, vishal.l.verma, schatzberg.dan,
viro, ryan.roberts, minchan, axboe, linux-block, kasong, nvdimm,
linux-arm-kernel, matthias.bgg, ying.huang, dan.j.williams
On (25/03/11 14:12), Qun-wei Lin (林群崴) wrote:
> > > If compression kthread-s can run (have CPUs to be scheduled on).
> > > This looks a bit like a bottleneck. Is there anything that
> > > guarantees forward progress? Also, if compression kthreads
> > > constantly preempt kswapd, then it might not be worth it to
> > > have compression kthreads, I assume?
> >
> > Thanks for your critical insights, all of which are valuable.
> >
> > Qun-Wei is likely working on an Android case where the CPU is
> > relatively idle in many scenarios (though there are certainly cases
> > where all CPUs are busy), but free memory is quite limited.
> > We may soon see benefits for these types of use cases. I expect
> > Android might have the opportunity to adopt it before it's fully
> > ready upstream.
> >
> > If the workload keeps all CPUs busy, I suppose this async thread
> > won’t help, but at least we might find a way to mitigate regression.
> >
> > We likely need to collect more data on various scenarios—when
> > CPUs are relatively idle and when all CPUs are busy—and
> > determine the proper approach based on the data, which we
> > currently lack :-)
Right. The scan/unmap side can move very fast (a rabbit) while the
compressor can move rather slowly (a tortoise). There is some
benefit in the fact that kswapd does compression directly, I'd
presume.
Another thing to consider, perhaps, is that not every page is
necessarily required to go through the compressor queue and stay
there until the woken-up compressor finally picks it up just to
realize that the page is filled with 0xff (or any other pattern).
At least on the zram side such pages are not compressed; they are stored
as an 8-byte pattern in the zram meta table (w/o using any zsmalloc
memory).
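For reference, a simplified version of that check (modeled loosely on
zram's same-filled test, not the exact kernel code) could filter such
pages before they are ever queued:

static bool page_same_filled(void *mem, unsigned long *element)
{
	unsigned long *data = mem;
	unsigned long val = data[0];
	int i;

	for (i = 1; i < PAGE_SIZE / sizeof(*data); i++)
		if (data[i] != val)
			return false;

	*element = val;	/* the 8-byte pattern kept in the meta table */
	return true;
}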
> > > If we have a pagefault and need to map a page that is still in
> > > the compression queue (not compressed and stored in zram yet, e.g.
> > > due to scheduling latency + slow compression algorithm) then what
> > > happens?
> >
> > This happens even without the patch, right? Right now we have
> > 4 steps:
> > 1. add_to_swap: The folio is added to the swapcache.
> > 2. try_to_unmap: PTEs are converted to swap entries.
> > 3. pageout: The folio is written back.
> > 4. Swapcache is cleared.
> >
> > If a swap-in occurs between 2 and 4, doesn't that mean
> > we've already encountered the case where we hit
> > the swapcache for a folio undergoing compression?
> >
> > It seems we might have an opportunity to terminate
> > compression if the request is still in the queue and
> > compression hasn’t started for a folio yet? That seems
> > quite difficult to do, though.
>
> As Barry explained, these folios that are being compressed are in the
> swapcache. If a refault occurs during the compression process, its
> correctness is already guaranteed by the swap subsystem (similar to
> other asynchronous swap devices).
Right. I was just thinking that there is now a wake_up between
scan/unmap and compress. Not sure how much trouble this can cause.
> Indeed, terminating a folio that is already in the queue waiting for
> compression is a challenging task. Will this require some modifications
> to the current architecture of the swap subsystem?
Yeah, I'll leave it mm folks to decide :)
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-07 12:01 [PATCH 0/2] Improve Zram by separating compression context from kswapd Qun-Wei Lin
` (3 preceding siblings ...)
2025-03-07 23:03 ` Nhat Pham
@ 2025-03-12 18:11 ` Minchan Kim
2025-03-13 3:09 ` Sergey Senozhatsky
4 siblings, 1 reply; 39+ messages in thread
From: Minchan Kim @ 2025-03-12 18:11 UTC (permalink / raw)
To: Qun-Wei Lin
Cc: Jens Axboe, Sergey Senozhatsky, Vishal Verma, Dan Williams,
Dave Jiang, Ira Weiny, Andrew Morton, Matthias Brugger,
AngeloGioacchino Del Regno, Chris Li, Ryan Roberts, Huang, Ying,
Kairui Song, Dan Schatzberg, Barry Song, Al Viro, linux-kernel,
linux-block, nvdimm, linux-mm, linux-arm-kernel, linux-mediatek,
Casper Li, Chinwen Chang, Andrew Yang, James Hsu
Hi Qun-Wei
On Fri, Mar 07, 2025 at 08:01:02PM +0800, Qun-Wei Lin wrote:
> This patch series introduces a new mechanism called kcompressd to
> improve the efficiency of memory reclaiming in the operating system. The
> main goal is to separate the tasks of page scanning and page compression
> into distinct processes or threads, thereby reducing the load on the
> kswapd thread and enhancing overall system performance under high memory
> pressure conditions.
>
> Problem:
> In the current system, the kswapd thread is responsible for both
> scanning the LRU pages and compressing pages into the ZRAM. This
> combined responsibility can lead to significant performance bottlenecks,
> especially under high memory pressure. The kswapd thread becomes a
> single point of contention, causing delays in memory reclaiming and
> overall system performance degradation.
Isn't it a general problem if the backend for swap is slow (but
synchronous)? I think zram needs to support asynchronous IO (it can
introduce multiple threads to compress batched pages) and not declare
itself a synchronous device in that case.
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-12 18:11 ` Minchan Kim
@ 2025-03-13 3:09 ` Sergey Senozhatsky
2025-03-13 3:45 ` Barry Song
2025-03-13 3:52 ` Barry Song
0 siblings, 2 replies; 39+ messages in thread
From: Sergey Senozhatsky @ 2025-03-13 3:09 UTC (permalink / raw)
To: Minchan Kim
Cc: Qun-Wei Lin, Jens Axboe, Sergey Senozhatsky, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg,
Barry Song, Al Viro, linux-kernel, linux-block, nvdimm, linux-mm,
linux-arm-kernel, linux-mediatek, Casper Li, Chinwen Chang,
Andrew Yang, James Hsu
On (25/03/12 11:11), Minchan Kim wrote:
> On Fri, Mar 07, 2025 at 08:01:02PM +0800, Qun-Wei Lin wrote:
> > This patch series introduces a new mechanism called kcompressd to
> > improve the efficiency of memory reclaiming in the operating system. The
> > main goal is to separate the tasks of page scanning and page compression
> > into distinct processes or threads, thereby reducing the load on the
> > kswapd thread and enhancing overall system performance under high memory
> > pressure conditions.
> >
> > Problem:
> > In the current system, the kswapd thread is responsible for both
> > scanning the LRU pages and compressing pages into the ZRAM. This
> > combined responsibility can lead to significant performance bottlenecks,
> > especially under high memory pressure. The kswapd thread becomes a
> > single point of contention, causing delays in memory reclaiming and
> > overall system performance degradation.
>
> Isn't it a general problem if the backend for swap is slow (but
> synchronous)? I think zram needs to support asynchronous IO (it can
> introduce multiple threads to compress batched pages) and not declare
> itself a synchronous device in that case.
The current conclusion is that kcompressd will sit above zram,
because zram is not the only compressing swap backend we have.
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-13 3:09 ` Sergey Senozhatsky
@ 2025-03-13 3:45 ` Barry Song
2025-03-13 16:07 ` Minchan Kim
2025-03-13 3:52 ` Barry Song
1 sibling, 1 reply; 39+ messages in thread
From: Barry Song @ 2025-03-13 3:45 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Minchan Kim, Qun-Wei Lin, Jens Axboe, Vishal Verma, Dan Williams,
Dave Jiang, Ira Weiny, Andrew Morton, Matthias Brugger,
AngeloGioacchino Del Regno, Chris Li, Ryan Roberts, Huang, Ying,
Kairui Song, Dan Schatzberg, Al Viro, linux-kernel, linux-block,
nvdimm, linux-mm, linux-arm-kernel, linux-mediatek, Casper Li,
Chinwen Chang, Andrew Yang, James Hsu
On Thu, Mar 13, 2025 at 4:09 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> On (25/03/12 11:11), Minchan Kim wrote:
> > On Fri, Mar 07, 2025 at 08:01:02PM +0800, Qun-Wei Lin wrote:
> > > This patch series introduces a new mechanism called kcompressd to
> > > improve the efficiency of memory reclaiming in the operating system. The
> > > main goal is to separate the tasks of page scanning and page compression
> > > into distinct processes or threads, thereby reducing the load on the
> > > kswapd thread and enhancing overall system performance under high memory
> > > pressure conditions.
> > >
> > > Problem:
> > > In the current system, the kswapd thread is responsible for both
> > > scanning the LRU pages and compressing pages into the ZRAM. This
> > > combined responsibility can lead to significant performance bottlenecks,
> > > especially under high memory pressure. The kswapd thread becomes a
> > > single point of contention, causing delays in memory reclaiming and
> > > overall system performance degradation.
> >
> > > Isn't it a general problem if the backend for swap is slow (but
> > > synchronous)? I think zram needs to support asynchronous IO (it can
> > > introduce multiple threads to compress batched pages) and not declare
> > > itself a synchronous device in that case.
>
> The current conclusion is that kcompressd will sit above zram,
> because zram is not the only compressing swap backend we have.
Also, it is not good to hack zram to be aware of whether the caller is
kswapd, direct reclaim, or proactive reclaim, or whether it is a block
device with a mounted filesystem.
So I am thinking of something like the below:
page_io.c
if (sync_device or zswap_enabled())
schedule swap_writepage to a separate per-node thread
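As a sketch of that idea (swap_writepage() is real; is_sync_swap_device()
and queue_swapout() are made-up helpers for the per-node thread handoff,
and zswap_enabled() just echoes the pseudocode above):

static void swap_writepage_sched(struct folio *folio,
				 struct writeback_control *wbc)
{
	if ((is_sync_swap_device(folio) || zswap_enabled()) &&
	    queue_swapout(folio_nid(folio), folio, wbc))
		return;			/* handed off to the per-node thread */

	swap_writepage(folio, wbc);	/* normal synchronous path */
}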
BTW, I ran the current patchset with one thread (not the default 4)
on phones and saw a 50%+ allocstall reduction, so the idea
looks like a good direction to go.
Thanks
Barry
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-13 3:45 ` Barry Song
@ 2025-03-13 16:07 ` Minchan Kim
2025-03-13 16:58 ` Barry Song
0 siblings, 1 reply; 39+ messages in thread
From: Minchan Kim @ 2025-03-13 16:07 UTC (permalink / raw)
To: Barry Song
Cc: Sergey Senozhatsky, Qun-Wei Lin, Jens Axboe, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg, Al Viro,
linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu
On Thu, Mar 13, 2025 at 04:45:54PM +1300, Barry Song wrote:
> On Thu, Mar 13, 2025 at 4:09 PM Sergey Senozhatsky
> <senozhatsky@chromium.org> wrote:
> >
> > On (25/03/12 11:11), Minchan Kim wrote:
> > > On Fri, Mar 07, 2025 at 08:01:02PM +0800, Qun-Wei Lin wrote:
> > > > This patch series introduces a new mechanism called kcompressd to
> > > > improve the efficiency of memory reclaiming in the operating system. The
> > > > main goal is to separate the tasks of page scanning and page compression
> > > > into distinct processes or threads, thereby reducing the load on the
> > > > kswapd thread and enhancing overall system performance under high memory
> > > > pressure conditions.
> > > >
> > > > Problem:
> > > > In the current system, the kswapd thread is responsible for both
> > > > scanning the LRU pages and compressing pages into the ZRAM. This
> > > > combined responsibility can lead to significant performance bottlenecks,
> > > > especially under high memory pressure. The kswapd thread becomes a
> > > > single point of contention, causing delays in memory reclaiming and
> > > > overall system performance degradation.
> > >
> > > Isn't it a general problem if the backend for swap is slow (but
> > > synchronous)? I think zram needs to support asynchronous IO (it can
> > > introduce multiple threads to compress batched pages) and not declare
> > > itself a synchronous device in that case.
> >
> > The current conclusion is that kcompressd will sit above zram,
> > because zram is not the only compressing swap backend we have.
Then, how does this handle the file IO case?
>
> Also, it is not good to hack zram to be aware of whether the caller is
> kswapd, direct reclaim, or proactive reclaim, or whether it is a block
> device with a mounted filesystem.
Why should zram be aware of that, instead of just introducing
queues in zram with multiple compression threads?
>
> So I am thinking of something like the below:
>
> page_io.c
>
> if (sync_device or zswap_enabled())
> schedule swap_writepage to a separate per-node thread
I am not sure it's a good idea to mix layers to solve this with one
feature. It wouldn't be only a swap problem. Such parallelism under the
device is a common technique these days, and it would help file IO cases.
Furthermore, it would open the chance for zram to try compressing
multiple pages at once.
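For instance, the compression thread could drain the fifo in batches; a
rough sketch reusing the patch's byte-oriented kfifo, where kc is the
per-thread struct kcompress, struct kcompress_work (the queued
{cb, mem, bio} entry) and the batch size are illustrative, and entries
are assumed to be queued whole:

	struct kcompress_work batch[16];
	unsigned int bytes, i, n;

	bytes = kfifo_out(&kc->write_fifo, batch, sizeof(batch));
	n = bytes / sizeof(batch[0]);
	for (i = 0; i < n; i++)
		/* candidates for a single multi-page compress call */
		batch[i].cb(batch[i].mem, batch[i].bio);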
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-13 16:07 ` Minchan Kim
@ 2025-03-13 16:58 ` Barry Song
2025-03-13 17:33 ` Minchan Kim
0 siblings, 1 reply; 39+ messages in thread
From: Barry Song @ 2025-03-13 16:58 UTC (permalink / raw)
To: Minchan Kim
Cc: Sergey Senozhatsky, Qun-Wei Lin, Jens Axboe, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg, Al Viro,
linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu
On Fri, Mar 14, 2025 at 5:07 AM Minchan Kim <minchan@kernel.org> wrote:
>
> On Thu, Mar 13, 2025 at 04:45:54PM +1300, Barry Song wrote:
> > On Thu, Mar 13, 2025 at 4:09 PM Sergey Senozhatsky
> > <senozhatsky@chromium.org> wrote:
> > >
> > > On (25/03/12 11:11), Minchan Kim wrote:
> > > > On Fri, Mar 07, 2025 at 08:01:02PM +0800, Qun-Wei Lin wrote:
> > > > > This patch series introduces a new mechanism called kcompressd to
> > > > > improve the efficiency of memory reclaiming in the operating system. The
> > > > > main goal is to separate the tasks of page scanning and page compression
> > > > > into distinct processes or threads, thereby reducing the load on the
> > > > > kswapd thread and enhancing overall system performance under high memory
> > > > > pressure conditions.
> > > > >
> > > > > Problem:
> > > > > In the current system, the kswapd thread is responsible for both
> > > > > scanning the LRU pages and compressing pages into the ZRAM. This
> > > > > combined responsibility can lead to significant performance bottlenecks,
> > > > > especially under high memory pressure. The kswapd thread becomes a
> > > > > single point of contention, causing delays in memory reclaiming and
> > > > > overall system performance degradation.
> > > >
> > > > Isn't it a general problem if the backend for swap is slow (but
> > > > synchronous)? I think zram needs to support asynchronous IO (it can
> > > > introduce multiple threads to compress batched pages) and not declare
> > > > itself a synchronous device in that case.
> > >
> > > The current conclusion is that kcompressd will sit above zram,
> > > because zram is not the only compressing swap backend we have.
>
> Then, how does this handle the file IO case?
I didn't quite catch your question :-)
>
> >
> > Also, it is not good to hack zram to be aware of whether the caller is
> > kswapd, direct reclaim, or proactive reclaim, or whether it is a block
> > device with a mounted filesystem.
>
> Why shouldn't zram be aware of that instead of just introducing
> queues in the zram with multiple compression threads?
>
My view is the opposite of yours :-)
Integrating kswapd, direct reclaim, etc., into the zram driver
would violate layering principles. zram is purely a block device
driver, and how it is used should be handled separately. Callers have
greater flexibility to determine its usage, similar to how different
I/O models exist in user space.
Currently, Qun-Wei's patch checks whether the current thread is kswapd.
If it is, compression is performed asynchronously by threads;
otherwise, it is done in the current thread. In the future, we may
have additional reclaim threads, such as for damon or
madv_pageout, etc.
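(For illustration, a minimal sketch of that caller-side dispatch;
kcompressd_queue_folio() is a hypothetical helper name, not code from
the actual patch:)

static void reclaim_compress_folio(struct folio *folio,
				   struct writeback_control *wbc)
{
	/*
	 * kswapd is already asynchronous to allocators, so it can hand
	 * the folio to a compression worker; direct reclaim needs the
	 * memory back immediately, so compress in the current thread.
	 */
	if (current_is_kswapd() && kcompressd_queue_folio(folio))
		return;			/* worker compresses and writes */

	__swap_writepage(folio, wbc);	/* synchronous fallback */
}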
> >
> > so i am thinking sth as below
> >
> > page_io.c
> >
> > if (sync_device or zswap_enabled())
> > schedule swap_writepage to a separate per-node thread
>
> I am not sure that's a good idea to mix a feature to solve different
> layers. That wouldn't be only swap problem. Such an parallelism under
> device is common technique these days and it would help file IO cases.
>
zswap and zram share the same needs, and handling this in page_io
can benefit both through common code. It is up to the callers to decide
the I/O model.
I agree that "parallelism under the device" is a common technique,
but our case is different—the device achieves parallelism with
offload hardware, whereas we rely on CPUs, which can be scarce.
These threads may also preempt CPUs that are critically needed
by other non-compression tasks, and burst power consumption
can sometimes be difficult to control.
> Furthermore, it would open the chance for zram to try compress
> multiple pages at once.
We are already in this situation when multiple callers use zram simultaneously,
such as during direct reclaim or with a mounted filesystem.
Of course, this allows multiple pages to be compressed simultaneously,
even if the user is single-threaded. However, determining when to enable
these threads and whether they will be effective is challenging, as it
depends on system load. For example, Qun-Wei's patch chose not to use
threads for direct reclaim as, I guess, it might be harmful.
Thanks
Barry
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-13 16:58 ` Barry Song
@ 2025-03-13 17:33 ` Minchan Kim
2025-03-13 20:37 ` Barry Song
0 siblings, 1 reply; 39+ messages in thread
From: Minchan Kim @ 2025-03-13 17:33 UTC (permalink / raw)
To: Barry Song
Cc: Sergey Senozhatsky, Qun-Wei Lin, Jens Axboe, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg, Al Viro,
linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu
On Fri, Mar 14, 2025 at 05:58:00AM +1300, Barry Song wrote:
> On Fri, Mar 14, 2025 at 5:07 AM Minchan Kim <minchan@kernel.org> wrote:
> >
> > On Thu, Mar 13, 2025 at 04:45:54PM +1300, Barry Song wrote:
> > > On Thu, Mar 13, 2025 at 4:09 PM Sergey Senozhatsky
> > > <senozhatsky@chromium.org> wrote:
> > > >
> > > > On (25/03/12 11:11), Minchan Kim wrote:
> > > > > On Fri, Mar 07, 2025 at 08:01:02PM +0800, Qun-Wei Lin wrote:
> > > > > > This patch series introduces a new mechanism called kcompressd to
> > > > > > improve the efficiency of memory reclaiming in the operating system. The
> > > > > > main goal is to separate the tasks of page scanning and page compression
> > > > > > into distinct processes or threads, thereby reducing the load on the
> > > > > > kswapd thread and enhancing overall system performance under high memory
> > > > > > pressure conditions.
> > > > > >
> > > > > > Problem:
> > > > > > In the current system, the kswapd thread is responsible for both
> > > > > > scanning the LRU pages and compressing pages into the ZRAM. This
> > > > > > combined responsibility can lead to significant performance bottlenecks,
> > > > > > especially under high memory pressure. The kswapd thread becomes a
> > > > > > single point of contention, causing delays in memory reclaiming and
> > > > > > overall system performance degradation.
> > > > >
> > > > > Isn't it general problem if backend for swap is slow(but synchronous)?
> > > > > I think zram need to support asynchrnous IO(can do introduce multiple
> > > > > threads to compress batched pages) and doesn't declare it's
> > > > > synchrnous device for the case.
> > > >
> > > > The current conclusion is that kcompressd will sit above zram,
> > > > because zram is not the only compressing swap backend we have.
> >
> > Then, how handles the file IO case?
>
> I didn't quite catch your question :-)
Sorry for not being clear.
What I meant was that zram is also used as filesystem backend storage, not
only as a swap backend. Multiple simultaneous compressions can help that
case, too.
>
> >
> > >
> > > also. it is not good to hack zram to be aware of if it is kswapd
> > > , direct reclaim , proactive reclaim and block device with
> > > mounted filesystem.
> >
> > Why shouldn't zram be aware of that instead of just introducing
> > queues in the zram with multiple compression threads?
> >
>
> My view is the opposite of yours :-)
>
> Integrating kswapd, direct reclaim, etc., into the zram driver
> would violate layering principles. zram is purely a block device
That's my question. What's the reason zram needs to know about
kswapd, direct reclaim, and so on? I didn't understand your input.
> driver, and how it is used should be handled separately. Callers have
> greater flexibility to determine its usage, similar to how different
> I/O models exist in user space.
>
> Currently, Qun-Wei's patch checks whether the current thread is kswapd.
> If it is, compression is performed asynchronously by threads;
> otherwise, it is done in the current thread. In the future, we may
Okay, then why should we do that instead of following the normal
asynchronous disk storage model? The VM just submits the IO request and
sometimes applies congestion control. Why is extra logic needed for this case?
> have additional reclaim threads, such as for damon or
> madv_pageout, etc.
>
> > >
> > > so i am thinking sth as below
> > >
> > > page_io.c
> > >
> > > if (sync_device or zswap_enabled())
> > > schedule swap_writepage to a separate per-node thread
> >
> > I am not sure that's a good idea to mix a feature to solve different
> > layers. That wouldn't be only swap problem. Such an parallelism under
> > device is common technique these days and it would help file IO cases.
> >
>
> zswap and zram share the same needs, and handling this in page_io
> can benefit both through common code. It is up to the callers to decide
> the I/O model.
>
> I agree that "parallelism under the device" is a common technique,
> but our case is different—the device achieves parallelism with
> offload hardware, whereas we rely on CPUs, which can be scarce.
> These threads may also preempt CPUs that are critically needed
> by other non-compression tasks, and burst power consumption
> can sometimes be difficult to control.
That's a general problem for shared resources in the system, and always a
trade-off domain across workload areas. Engineering folks have tried to tune
them statically/dynamically depending on system behavior, considering
what they prioritize.
>
> > Furthermore, it would open the chance for zram to try compress
> > multiple pages at once.
>
> We are already in this situation when multiple callers use zram simultaneously,
> such as during direct reclaim or with a mounted filesystem.
>
> Of course, this allows multiple pages to be compressed simultaneously,
> even if the user is single-threaded. However, determining when to enable
> these threads and whether they will be effective is challenging, as it
> depends on system load. For example, Qun-Wei's patch chose not to use
> threads for direct reclaim as, I guess, it might be harmful.
Direct reclaim is already harmful, and that's why the VM has logic to
throttle writeback and other special handling for the kswapd and direct
reclaim IO paths, which could be applied to zram, too.
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-13 17:33 ` Minchan Kim
@ 2025-03-13 20:37 ` Barry Song
0 siblings, 0 replies; 39+ messages in thread
From: Barry Song @ 2025-03-13 20:37 UTC (permalink / raw)
To: Minchan Kim
Cc: Sergey Senozhatsky, Qun-Wei Lin, Jens Axboe, Vishal Verma,
Dan Williams, Dave Jiang, Ira Weiny, Andrew Morton,
Matthias Brugger, AngeloGioacchino Del Regno, Chris Li,
Ryan Roberts, Huang, Ying, Kairui Song, Dan Schatzberg, Al Viro,
linux-kernel, linux-block, nvdimm, linux-mm, linux-arm-kernel,
linux-mediatek, Casper Li, Chinwen Chang, Andrew Yang, James Hsu
On Fri, Mar 14, 2025 at 6:33 AM Minchan Kim <minchan@kernel.org> wrote:
>
> On Fri, Mar 14, 2025 at 05:58:00AM +1300, Barry Song wrote:
> > On Fri, Mar 14, 2025 at 5:07 AM Minchan Kim <minchan@kernel.org> wrote:
> > >
> > > On Thu, Mar 13, 2025 at 04:45:54PM +1300, Barry Song wrote:
> > > > On Thu, Mar 13, 2025 at 4:09 PM Sergey Senozhatsky
> > > > <senozhatsky@chromium.org> wrote:
> > > > >
> > > > > On (25/03/12 11:11), Minchan Kim wrote:
> > > > > > On Fri, Mar 07, 2025 at 08:01:02PM +0800, Qun-Wei Lin wrote:
> > > > > > > This patch series introduces a new mechanism called kcompressd to
> > > > > > > improve the efficiency of memory reclaiming in the operating system. The
> > > > > > > main goal is to separate the tasks of page scanning and page compression
> > > > > > > into distinct processes or threads, thereby reducing the load on the
> > > > > > > kswapd thread and enhancing overall system performance under high memory
> > > > > > > pressure conditions.
> > > > > > >
> > > > > > > Problem:
> > > > > > > In the current system, the kswapd thread is responsible for both
> > > > > > > scanning the LRU pages and compressing pages into the ZRAM. This
> > > > > > > combined responsibility can lead to significant performance bottlenecks,
> > > > > > > especially under high memory pressure. The kswapd thread becomes a
> > > > > > > single point of contention, causing delays in memory reclaiming and
> > > > > > > overall system performance degradation.
> > > > > >
> > > > > > Isn't it general problem if backend for swap is slow(but synchronous)?
> > > > > > I think zram need to support asynchrnous IO(can do introduce multiple
> > > > > > threads to compress batched pages) and doesn't declare it's
> > > > > > synchrnous device for the case.
> > > > >
> > > > > The current conclusion is that kcompressd will sit above zram,
> > > > > because zram is not the only compressing swap backend we have.
> > >
> > > Then, how handles the file IO case?
> >
> > I didn't quite catch your question :-)
>
> Sorry for not clear.
>
> What I meant was zram is also used for fs backend storage, not only
> for swapbackend. The multiple simultaneous compression can help the case,
> too.
I agree that multiple asynchronous threads might transparently improve
userspace read/write performance when there is only one writer thread or
very few of them. However, it's unclear how genuine that requirement is.
On the other hand, in such cases userspace can always optimize read/write
bandwidth itself, for example by using aio_write() or similar methods, if
it really cares about bandwidth.
Once the user has multiple threads (close to the number of CPU cores),
asynchronous multi-threading won't offer any benefit and will only result
in increased context switching. I guess that is caused by the fundamental
difference between zram and other real devices with hardware offloads:
zram always relies on the CPU and operates synchronously (no offload, no
interrupt from the HW to notify the completion of compression).
>
> >
> > >
> > > >
> > > > also. it is not good to hack zram to be aware of if it is kswapd
> > > > , direct reclaim , proactive reclaim and block device with
> > > > mounted filesystem.
> > >
> > > Why shouldn't zram be aware of that instead of just introducing
> > > queues in the zram with multiple compression threads?
> > >
> >
> > My view is the opposite of yours :-)
> >
> > Integrating kswapd, direct reclaim, etc., into the zram driver
> > would violate layering principles. zram is purely a block device
>
> That's the my question. What's the reason zram need to know about
> kswapd, direct_reclaim and so on? I didn't understand your input.
Qun-Wei's patch 2/2 contains the following code within the zram driver:
+int schedule_bio_write(void *mem, struct bio *bio, compress_callback cb)
+{
+ ...
+
+ if (!nr_kcompressd || !current_is_kswapd())
+ return -EBUSY;
+
+}
It's clear that Qun-Wei decided to disable asynchronous threading unless
the user is kswapd. Qun-Wei might be able to provide more insight on this
decision.
My guess is:
1. Determining the optimal number of threads is challenging due to varying
CPU topologies and software workloads. For example, if there are 8 threads
writing to zram, the default 4 threads might be slower than using all 8 threads
synchronously. For servers, we could have hundreds of CPUs.
On the other hand, if there is only one thread writing to zram, using 4 threads
might interfere with other workloads too much and cause the phone to heat up
quickly.
2. kswapd is the user that truly benefits from asynchronous threads. Since
it handles asynchronous memory reclamation, speeding up its process
reduces the likelihood of entering slowpath / direct reclamation. This is
where it has the greatest potential to make a positive impact.
>
> > driver, and how it is used should be handled separately. Callers have
> > greater flexibility to determine its usage, similar to how different
> > I/O models exist in user space.
> >
> > Currently, Qun-Wei's patch checks whether the current thread is kswapd.
> > If it is, compression is performed asynchronously by threads;
> > otherwise, it is done in the current thread. In the future, we may
>
> Okay, then, why should we do that without following normal asynchrnous
> disk storage? VM justs put the IO request and sometimes congestion
> control. Why is other logic needed for the case?
It seems there is still some uncertainty about why current_is_kswapd()
is necessary, so let's get input from Qun-Wei as well.
Despite all the discussions, one important point remains: zswap might
also need this asynchronous thread. For months, Yosry and Nhat have
been urging the zram and zswap teams to collaborate on those shared
requirements. Having one per-node thread for each kswapd could be the
low-hanging fruit for both zswap and zram.
Additionally, I don't see how the prototype I proposed here [1] would conflict
with potential future optimizations in zram, particularly those aimed at
improving filesystem read/write performance through multiple asynchronous
threads, if that is indeed a valid requirement.
[1] https://lore.kernel.org/lkml/20250313093005.13998-1-21cnbao@gmail.com/
>
> > have additional reclaim threads, such as for damon or
> > madv_pageout, etc.
> >
> > > >
> > > > so i am thinking sth as below
> > > >
> > > > page_io.c
> > > >
> > > > if (sync_device or zswap_enabled())
> > > > schedule swap_writepage to a separate per-node thread
> > >
> > > I am not sure that's a good idea to mix a feature to solve different
> > > layers. That wouldn't be only swap problem. Such an parallelism under
> > > device is common technique these days and it would help file IO cases.
> > >
> >
> > zswap and zram share the same needs, and handling this in page_io
> > can benefit both through common code. It is up to the callers to decide
> > the I/O model.
> >
> > I agree that "parallelism under the device" is a common technique,
> > but our case is different—the device achieves parallelism with
> > offload hardware, whereas we rely on CPUs, which can be scarce.
> > These threads may also preempt CPUs that are critically needed
> > by other non-compression tasks, and burst power consumption
> > can sometimes be difficult to control.
>
> That's general problem for common resources in the system and always
> trace-off domain in the workload areas. Eng folks has tried to tune
> them statically/dynamically depending on system behavior considering
> what they priorites.
Right, but we haven't yet taken on the task of tuning multi-threaded zram.
>
> >
> > > Furthermore, it would open the chance for zram to try compress
> > > multiple pages at once.
> >
> > We are already in this situation when multiple callers use zram simultaneously,
> > such as during direct reclaim or with a mounted filesystem.
> >
> > Of course, this allows multiple pages to be compressed simultaneously,
> > even if the user is single-threaded. However, determining when to enable
> > these threads and whether they will be effective is challenging, as it
> > depends on system load. For example, Qun-Wei's patch chose not to use
> > threads for direct reclaim as, I guess, it might be harmful.
>
> Direct reclaim is already harmful and that's why VM has the logic
> to throttle writeback or other special logics for kswapd or direct
> reclaim path for th IO, which could be applied into the zram, too.
I'm not entirely sure the existing congestion control or writeback
throttling can automatically tune itself effectively for non-offload
resources. For offload hardware, the number of queues and the bandwidth
remain stable, but CPU capacity fluctuates with changes in system workload.
Thanks
Barry
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-13 3:09 ` Sergey Senozhatsky
2025-03-13 3:45 ` Barry Song
@ 2025-03-13 3:52 ` Barry Song
2025-03-13 9:30 ` Barry Song
1 sibling, 1 reply; 39+ messages in thread
From: Barry Song @ 2025-03-13 3:52 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Minchan Kim, Qun-Wei Lin, Jens Axboe, Vishal Verma, Dan Williams,
Dave Jiang, Ira Weiny, Andrew Morton, Matthias Brugger,
AngeloGioacchino Del Regno, Chris Li, Ryan Roberts, Huang, Ying,
Kairui Song, Dan Schatzberg, Al Viro, linux-kernel, linux-block,
nvdimm, linux-mm, linux-arm-kernel, linux-mediatek, Casper Li,
Chinwen Chang, Andrew Yang, James Hsu
On Thu, Mar 13, 2025 at 4:09 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> On (25/03/12 11:11), Minchan Kim wrote:
> > On Fri, Mar 07, 2025 at 08:01:02PM +0800, Qun-Wei Lin wrote:
> > > This patch series introduces a new mechanism called kcompressd to
> > > improve the efficiency of memory reclaiming in the operating system. The
> > > main goal is to separate the tasks of page scanning and page compression
> > > into distinct processes or threads, thereby reducing the load on the
> > > kswapd thread and enhancing overall system performance under high memory
> > > pressure conditions.
> > >
> > > Problem:
> > > In the current system, the kswapd thread is responsible for both
> > > scanning the LRU pages and compressing pages into the ZRAM. This
> > > combined responsibility can lead to significant performance bottlenecks,
> > > especially under high memory pressure. The kswapd thread becomes a
> > > single point of contention, causing delays in memory reclaiming and
> > > overall system performance degradation.
> >
> > Isn't it general problem if backend for swap is slow(but synchronous)?
> > I think zram need to support asynchrnous IO(can do introduce multiple
> > threads to compress batched pages) and doesn't declare it's
> > synchrnous device for the case.
>
> The current conclusion is that kcompressd will sit above zram,
> because zram is not the only compressing swap backend we have.
Also, it is not good to hack zram to be aware of whether it is serving
kswapd, direct reclaim, proactive reclaim, or a block device with a
mounted filesystem.
So I am thinking of something like the below:
page_io.c
if (sync_device or zswap_enabled())
schedule swap_writepage to a separate per-node thread
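(For illustration, a minimal sketch of that dispatch;
swap_schedule_async_write() is a hypothetical helper, while
zswap_is_enabled(), SWP_SYNCHRONOUS_IO and __swap_writepage() are the
real kernel symbols:)

static int swap_writepage_sketch(struct folio *folio,
				 struct writeback_control *wbc)
{
	struct swap_info_struct *sis = swp_swap_info(folio->swap);

	/* "sync_device or zswap_enabled()" from the pseudocode above */
	if ((data_race(sis->flags & SWP_SYNCHRONOUS_IO) || zswap_is_enabled()) &&
	    swap_schedule_async_write(folio))
		return 0;	/* a per-node thread compresses and writes */

	__swap_writepage(folio, wbc);	/* existing synchronous path */
	return 0;
}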
BTW, I ran the current patchset with one thread (not the default 4)
on phones and saw a 50%+ reduction in allocstall, so the idea
looks like a good direction to go.
Thanks
Barry
* Re: [PATCH 0/2] Improve Zram by separating compression context from kswapd
2025-03-13 3:52 ` Barry Song
@ 2025-03-13 9:30 ` Barry Song
0 siblings, 0 replies; 39+ messages in thread
From: Barry Song @ 2025-03-13 9:30 UTC (permalink / raw)
To: minchan, qun-wei.lin, senozhatsky, nphamcs
Cc: 21cnbao, akpm, andrew.yang, angelogioacchino.delregno, axboe,
casper.li, chinwen.chang, chrisl, dan.j.williams, dave.jiang,
ira.weiny, james.hsu, kasong, linux-arm-kernel, linux-block,
linux-kernel, linux-mediatek, linux-mm, matthias.bgg, nvdimm,
ryan.roberts, schatzberg.dan, viro, vishal.l.verma, ying.huang
On Thu, Mar 13, 2025 at 4:52 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Thu, Mar 13, 2025 at 4:09 PM Sergey Senozhatsky
> <senozhatsky@chromium.org> wrote:
> >
> > On (25/03/12 11:11), Minchan Kim wrote:
> > > On Fri, Mar 07, 2025 at 08:01:02PM +0800, Qun-Wei Lin wrote:
> > > > This patch series introduces a new mechanism called kcompressd to
> > > > improve the efficiency of memory reclaiming in the operating system. The
> > > > main goal is to separate the tasks of page scanning and page compression
> > > > into distinct processes or threads, thereby reducing the load on the
> > > > kswapd thread and enhancing overall system performance under high memory
> > > > pressure conditions.
> > > >
> > > > Problem:
> > > > In the current system, the kswapd thread is responsible for both
> > > > scanning the LRU pages and compressing pages into the ZRAM. This
> > > > combined responsibility can lead to significant performance bottlenecks,
> > > > especially under high memory pressure. The kswapd thread becomes a
> > > > single point of contention, causing delays in memory reclaiming and
> > > > overall system performance degradation.
> > >
> > > Isn't it general problem if backend for swap is slow(but synchronous)?
> > > I think zram need to support asynchrnous IO(can do introduce multiple
> > > threads to compress batched pages) and doesn't declare it's
> > > synchrnous device for the case.
> >
> > The current conclusion is that kcompressd will sit above zram,
> > because zram is not the only compressing swap backend we have.
>
> also. it is not good to hack zram to be aware of if it is kswapd
> , direct reclaim , proactive reclaim and block device with
> mounted filesystem.
>
> so i am thinking sth as below
>
> page_io.c
>
> if (sync_device or zswap_enabled())
> schedule swap_writepage to a separate per-node thread
>
Hi Qun-wei, Nhat, Sergey and Minchan,
I managed to find some time to prototype a kcompressd that supports
both zswap and zram, though it has only been build-tested.
Hi Qun-wei,
Apologies, but I'm quite busy with other tasks and don't have time to
debug or test it. Please feel free to test it. When you submit v2, you're
welcome to keep yourself as the author of the patch, as in v1.
If you're okay with it, you can also add me as a co-developer in the
changelog. For the prototype below, I'd rather start with a per-node
thread approach. While this might not provide the greatest benefit, it
carries the least risk and helps avoid complex questions, such as how to
determine the number of threads. And we have actually observed a
significant reduction in allocstall by using a single thread to
asynchronously handle kswapd's compression, as I reported.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index dbb0ad69e17f..4f9ee2fb338d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -23,6 +23,7 @@
#include <linux/page-flags.h>
#include <linux/local_lock.h>
#include <linux/zswap.h>
+#include <linux/kfifo.h>
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
@@ -1389,6 +1390,11 @@ typedef struct pglist_data {
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
+#define KCOMPRESS_FIFO_SIZE 256
+ wait_queue_head_t kcompressd_wait;
+ struct task_struct *kcompressd;
+ struct kfifo kcompress_fifo;
+
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_highest_zoneidx;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 281802a7a10d..8cd143f59e76 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1410,6 +1410,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
pgdat_init_kcompactd(pgdat);
init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->kcompressd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
diff --git a/mm/page_io.c b/mm/page_io.c
index 4bce19df557b..7bbd14991ffb 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -233,6 +233,33 @@ static void swap_zeromap_folio_clear(struct folio *folio)
}
}
+static bool swap_sched_async_compress(struct folio *folio)
+{
+ struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ int nid = numa_node_id();
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ if (unlikely(!pgdat->kcompressd))
+ return false;
+
+ if (!current_is_kswapd())
+ return false;
+
+ if (!folio_test_anon(folio))
+ return false;
+ /*
+ * This case needs to synchronously return AOP_WRITEPAGE_ACTIVATE
+ */
+ if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio)))
+ return false;
+
+ /*
+ * Enqueue the folio pointer itself (hence &folio) and wake the
+ * per-node worker; if the fifo is full, fall back to the sync path.
+ */
+ if (zswap_is_enabled() || data_race(sis->flags & SWP_SYNCHRONOUS_IO)) {
+ if (kfifo_in(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
+ wake_up_interruptible(&pgdat->kcompressd_wait);
+ return true;
+ }
+ }
+
+ return false;
+}
+
/*
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
@@ -275,6 +302,15 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
*/
swap_zeromap_folio_clear(folio);
}
+
+ /*
+ * Compression within zswap and zram might block rmap, unmap
+ * of both file and anon pages, try to do compression async
+ * if possible
+ */
+ if (swap_sched_async_compress(folio))
+ return 0;
+
if (zswap_store(folio)) {
count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
folio_unlock(folio);
@@ -289,6 +325,38 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}
+int kcompressd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t *)p;
+ struct folio *folio;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = SWAP_CLUSTER_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ .for_reclaim = 1,
+ };
+
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(pgdat->kcompressd_wait,
+ !kfifo_is_empty(&pgdat->kcompress_fifo) ||
+ kthread_should_stop());
+
+ if (kthread_should_stop())
+ break;
+ while (!kfifo_is_empty(&pgdat->kcompress_fifo)) {
+ if (kfifo_out(&pgdat->kcompress_fifo, &folio, sizeof(folio))) {
+ if (zswap_store(folio)) {
+ count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
+ folio_unlock(folio);
+ /* stored in zswap: keep draining, do not exit the thread */
+ continue;
+ }
+ __swap_writepage(folio, &wbc);
+ }
+ }
+ }
+ return 0;
+}
+
static inline void count_swpout_vm_event(struct folio *folio)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/swap.h b/mm/swap.h
index 0abb68091b4f..38d61c6a06f1 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -21,6 +21,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
void swap_write_unplug(struct swap_iocb *sio);
int swap_writepage(struct page *page, struct writeback_control *wbc);
void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
+int kcompressd(void *p);
/* linux/mm/swap_state.c */
/* One swap address space for each 64M swap space */
@@ -198,6 +199,11 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return 0;
}
+static inline int kcompressd(void *p)
+{
+ return 0;
+}
+
#endif /* CONFIG_SWAP */
#endif /* _MM_SWAP_H */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2bc740637a6c..ba0245b74e45 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7370,6 +7370,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
void __meminit kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
+ int ret;
pgdat_kswapd_lock(pgdat);
if (!pgdat->kswapd) {
@@ -7383,7 +7384,23 @@ void __meminit kswapd_run(int nid)
} else {
wake_up_process(pgdat->kswapd);
}
+ ret = kfifo_alloc(&pgdat->kcompress_fifo,
+ KCOMPRESS_FIFO_SIZE * sizeof(struct folio *),
+ GFP_KERNEL);
+ if (ret)
+ goto out;
+ pgdat->kcompressd = kthread_create_on_node(kcompressd, pgdat, nid,
+ "kcompressd%d", nid);
+ if (IS_ERR(pgdat->kcompressd)) {
+ pr_err("Failed to start kcompressd on node %d,ret=%ld\n",
+ nid, PTR_ERR(pgdat->kcompressd));
+ pgdat->kcompressd = NULL;
+ kfifo_free(&pgdat->kcompress_fifo);
+ } else {
+ wake_up_process(pgdat->kcompressd);
+ }
}
+out:
pgdat_kswapd_unlock(pgdat);
}
@@ -7402,6 +7419,11 @@ void __meminit kswapd_stop(int nid)
kthread_stop(kswapd);
pgdat->kswapd = NULL;
}
+ if (pgdat->kcompressd) {
+ kthread_stop(pgdat->kcompressd);
+ pgdat->kcompressd = NULL;
+ kfifo_free(&pgdat->kcompress_fifo);
+ }
pgdat_kswapd_unlock(pgdat);
}
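(As an aside, a self-contained kernel-style sketch of the folio-pointer
fifo handoff used above, written with the typed kfifo API, which avoids
the raw kfifo_in()/sizeof() pitfall; the names here are illustrative,
not from the patch:)

#include <linux/kfifo.h>
#include <linux/wait.h>

/* A fifo of folio pointers; the size must be a power of two. */
static DEFINE_KFIFO(folio_fifo, struct folio *, 256);
static DECLARE_WAIT_QUEUE_HEAD(folio_fifo_wait);

static bool folio_fifo_push(struct folio *folio)
{
	/* kfifo_put() copies the pointer value into the fifo */
	if (!kfifo_put(&folio_fifo, folio))
		return false;		/* full: caller compresses inline */
	wake_up_interruptible(&folio_fifo_wait);
	return true;
}

static void folio_fifo_drain(void)
{
	struct folio *folio;

	while (kfifo_get(&folio_fifo, &folio))
		;	/* compress/store each dequeued folio here */
}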
> btw, ran the current patchset with one thread(not default 4)
> on phones and saw 50%+ allocstall reduction. so the idea
> looks like a good direction to go.
>
Thanks
Barry