[PATCH v2] zram: Implement multi-page write-back

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v2] zram: Implement multi-page write-back
       [not found] <tencent_78FC2C4FE16BA1EBAF0897DB60FCD675ED05@qq.com>
@ 2025-11-05  3:33 ` Yuwen Chen
  2025-11-05  6:48   ` [PATCH v3] " Yuwen Chen
  0 siblings, 1 reply; 21+ messages in thread
From: Yuwen Chen @ 2025-11-05  3:33 UTC (permalink / raw)
  To: ywen.chen
  Cc: richardycc, axboe, licayy, linux-kernel, minchan, senozhatsky,
	akpm, bgeffon, linux-block, linux-mm, liumartin

For block devices, sequential write performance is significantly
better than random write. Currently, zram's write-back function
only supports single-page operations, which fails to leverage
the sequential write advantage and leads to suboptimal performance.
This patch implements multi-page batch write-back for zram to
leverage sequential write performance of block devices.
After applying this patch, a large number of pages being merged
into batch write operations can be observed via the following test
code, which effectively improves write-back performance.

mount -t debugfs none /sys/kernel/debug/
echo "block:block_bio_frontmerge" >> /sys/kernel/debug/tracing/set_event
echo "block:block_bio_backmerge" >> /sys/kernel/debug/tracing/set_event
cat /sys/kernel/debug/tracing/trace_pipe &
echo "page_indexes=1-10000" > /sys/block/zram0/writeback

Signed-off-by: Yuwen Chen <ywen.chen@foxmail.com>
Reviewed-by: Fengyu Lian <licayy@outlook.com>
---
Changes in v2:
  - Rename some data structures.
  - Fix an exception caused by accessing a null pointer.
---
 drivers/block/zram/zram_drv.c | 223 ++++++++++++++++++++++++++--------
 1 file changed, 169 insertions(+), 54 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 4f2824a..d2bd091 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -751,21 +751,130 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
 	submit_bio(bio);
 }
 
-static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
-{
-	unsigned long blk_idx = 0;
-	struct page *page = NULL;
+enum {
+	/* Indicate that the request has been allocated */
+	ZRAM_WB_REQUEST_ALLOCATED = 0,
+
+	/* the request has been processed by the block device layer */
+	ZRAM_WB_REQUEST_COMPLETED,
+};
+
+struct zram_wb_request {
+	struct completion *done;
+	unsigned long blk_idx;
+	struct page *page;
 	struct zram_pp_slot *pps;
 	struct bio_vec bio_vec;
 	struct bio bio;
-	int ret = 0, err;
-	u32 index;
+	unsigned long flags;
+};
 
-	page = alloc_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
+static int zram_writeback_complete(struct zram *zram, struct zram_wb_request *req)
+{
+	u32 index = 0;
+	int err;
 
-	while ((pps = select_pp_slot(ctl))) {
+	if (!test_and_clear_bit(ZRAM_WB_REQUEST_COMPLETED, &req->flags))
+		return 0;
+
+	err = blk_status_to_errno(req->bio.bi_status);
+	if (err)
+		return err;
+
+	index = req->pps->index;
+	atomic64_inc(&zram->stats.bd_writes);
+	zram_slot_lock(zram, index);
+	/*
+	 * Same as above, we release slot lock during writeback so
+	 * slot can change under us: slot_free() or slot_free() and
+	 * reallocation (zram_write_page()). In both cases slot loses
+	 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
+	 * ZRAM_PP_SLOT on such slots until current post-processing
+	 * finishes.
+	 */
+	if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
+		goto next;
+
+	zram_free_page(zram, index);
+	zram_set_flag(zram, index, ZRAM_WB);
+	zram_set_handle(zram, index, req->blk_idx);
+	req->blk_idx = 0;
+	atomic64_inc(&zram->stats.pages_stored);
+	spin_lock(&zram->wb_limit_lock);
+	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+		zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
+	spin_unlock(&zram->wb_limit_lock);
+
+next:
+	zram_slot_unlock(zram, index);
+	release_pp_slot(zram, req->pps);
+	req->pps = NULL;
+	return 0;
+}
+
+static void zram_writeback_endio(struct bio *bio)
+{
+	struct zram_wb_request *req = bio->bi_private;
+
+	set_bit(ZRAM_WB_REQUEST_COMPLETED, &req->flags);
+	clear_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags);
+	complete(req->done);
+}
+
+static struct zram_wb_request *zram_writeback_next_request(struct zram_wb_request *pool,
+	int pool_cnt, int *cnt_off)
+{
+	struct zram_wb_request *req = NULL;
+	int i = 0;
+
+	for (i = *cnt_off; i < pool_cnt + *cnt_off; i++) {
+		req = &pool[i % pool_cnt];
+		if (!req->page)
+			continue;
+
+		if (!test_and_set_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags)) {
+			*cnt_off = (i + 1) % pool_cnt;
+			return req;
+		}
+	}
+	return NULL;
+}
+
+#define ZRAM_WB_REQ_CNT (32)
+static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
+{
+	int ret = 0, err, i = 0, off = 0;
+	int req_pool_cnt = 0;
+	struct zram_wb_request req_prealloc[2] = {0};
+	struct zram_wb_request *req = NULL, *req_pool = NULL;
+	DECLARE_COMPLETION_ONSTACK(done);
+	u32 index = 0;
+	struct blk_plug plug;
+
+	/* allocate memory for req_pool */
+	req_pool = kzalloc(sizeof(*req) * ZRAM_WB_REQ_CNT, GFP_KERNEL);
+	if (req_pool) {
+		req_pool_cnt = ZRAM_WB_REQ_CNT;
+	} else {
+		req_pool = req_prealloc;
+		req_pool_cnt = ARRAY_SIZE(req_prealloc);
+	}
+
+	for (i = 0; i < req_pool_cnt; i++) {
+		req_pool[i].done = &done;
+		req_pool[i].flags = 0;
+		req_pool[i].page = alloc_page(GFP_KERNEL);
+		if (req_pool[i].page)
+			req = &req_pool[i];
+	}
+	if (!req) {
+		ret = -ENOMEM;
+		goto out_free_req_pool;
+	}
+	set_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags);
+
+	blk_start_plug(&plug);
+	while ((req->pps = select_pp_slot(ctl))) {
 		spin_lock(&zram->wb_limit_lock);
 		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
 			spin_unlock(&zram->wb_limit_lock);
@@ -774,15 +883,15 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		}
 		spin_unlock(&zram->wb_limit_lock);
 
-		if (!blk_idx) {
-			blk_idx = alloc_block_bdev(zram);
-			if (!blk_idx) {
+		if (!req->blk_idx) {
+			req->blk_idx = alloc_block_bdev(zram);
+			if (!req->blk_idx) {
 				ret = -ENOSPC;
 				break;
 			}
 		}
 
-		index = pps->index;
+		index = req->pps->index;
 		zram_slot_lock(zram, index);
 		/*
 		 * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
@@ -792,22 +901,32 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		 */
 		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
 			goto next;
-		if (zram_read_from_zspool(zram, page, index))
+		if (zram_read_from_zspool(zram, req->page, index))
 			goto next;
 		zram_slot_unlock(zram, index);
 
-		bio_init(&bio, zram->bdev, &bio_vec, 1,
+		bio_init(&req->bio, zram->bdev, &req->bio_vec, 1,
 			 REQ_OP_WRITE | REQ_SYNC);
-		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
-		__bio_add_page(&bio, page, PAGE_SIZE, 0);
-
-		/*
-		 * XXX: A single page IO would be inefficient for write
-		 * but it would be not bad as starter.
-		 */
-		err = submit_bio_wait(&bio);
+		req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
+		req->bio.bi_end_io = zram_writeback_endio;
+		req->bio.bi_private = req;
+		__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
+
+		list_del_init(&req->pps->entry);
+		submit_bio(&req->bio);
+
+		do {
+			req = zram_writeback_next_request(req_pool, req_pool_cnt, &off);
+			if (!req) {
+				blk_finish_plug(&plug);
+				wait_for_completion_io(&done);
+				blk_start_plug(&plug);
+			}
+		} while (!req);
+		err = zram_writeback_complete(zram, req);
 		if (err) {
-			release_pp_slot(zram, pps);
+			release_pp_slot(zram, req->pps);
+			req->pps = NULL;
 			/*
 			 * BIO errors are not fatal, we continue and simply
 			 * attempt to writeback the remaining objects (pages).
@@ -817,43 +936,39 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 			 * the most recent BIO error.
 			 */
 			ret = err;
-			continue;
 		}
+		cond_resched();
+		continue;
 
-		atomic64_inc(&zram->stats.bd_writes);
-		zram_slot_lock(zram, index);
-		/*
-		 * Same as above, we release slot lock during writeback so
-		 * slot can change under us: slot_free() or slot_free() and
-		 * reallocation (zram_write_page()). In both cases slot loses
-		 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
-		 * ZRAM_PP_SLOT on such slots until current post-processing
-		 * finishes.
-		 */
-		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
-			goto next;
-
-		zram_free_page(zram, index);
-		zram_set_flag(zram, index, ZRAM_WB);
-		zram_set_handle(zram, index, blk_idx);
-		blk_idx = 0;
-		atomic64_inc(&zram->stats.pages_stored);
-		spin_lock(&zram->wb_limit_lock);
-		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
-			zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
-		spin_unlock(&zram->wb_limit_lock);
 next:
 		zram_slot_unlock(zram, index);
-		release_pp_slot(zram, pps);
-
+		release_pp_slot(zram, req->pps);
+		req->pps = NULL;
 		cond_resched();
 	}
+	blk_finish_plug(&plug);
 
-	if (blk_idx)
-		free_block_bdev(zram, blk_idx);
-	if (page)
-		__free_page(page);
+	if (req)
+		clear_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags);
+	for (i = 0; i < req_pool_cnt; i++) {
+		while (test_bit(ZRAM_WB_REQUEST_ALLOCATED, &req_pool[i].flags))
+			wait_for_completion_io(&done);
+		err = zram_writeback_complete(zram, &req_pool[i]);
+		if (err) {
+			release_pp_slot(zram, req_pool[i].pps);
+			req->pps = NULL;
+			ret = err;
+		}
+
+		if (req_pool[i].blk_idx)
+			free_block_bdev(zram, req_pool[i].blk_idx);
+		if (req_pool[i].page)
+			__free_page(req_pool[i].page);
+	}
 
+out_free_req_pool:
+	if (req_pool != req_prealloc)
+		kfree(req_pool);
 	return ret;
 }
 
-- 
2.34.1



^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v3] zram: Implement multi-page write-back
  2025-11-05  3:33 ` [PATCH v2] zram: Implement multi-page write-back Yuwen Chen
@ 2025-11-05  6:48   ` Yuwen Chen
  2025-11-05 15:25     ` Jens Axboe
  0 siblings, 1 reply; 21+ messages in thread
From: Yuwen Chen @ 2025-11-05  6:48 UTC (permalink / raw)
  To: ywen.chen
  Cc: akpm, axboe, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, senozhatsky

For block devices, sequential write performance is significantly
better than random write. Currently, zram's write-back function
only supports single-page operations, which fails to leverage
the sequential write advantage and leads to suboptimal performance.
This patch implements multi-page batch write-back for zram to
leverage sequential write performance of block devices.
After applying this patch, a large number of pages being merged
into batch write operations can be observed via the following test
code, which effectively improves write-back performance.

mount -t debugfs none /sys/kernel/debug/
echo "block:block_bio_frontmerge" >> /sys/kernel/debug/tracing/set_event
echo "block:block_bio_backmerge" >> /sys/kernel/debug/tracing/set_event
cat /sys/kernel/debug/tracing/trace_pipe &
echo "page_indexes=1-10000" > /sys/block/zram0/writeback

Signed-off-by: Yuwen Chen <ywen.chen@foxmail.com>
Reviewed-by: Fengyu Lian <licayy@outlook.com>
---
Changes in v3:
  - Postpone the page allocation.
Changes in v2:
  - Rename some data structures.
  - Fix an exception caused by accessing a null pointer.
---
 drivers/block/zram/zram_drv.c | 224 ++++++++++++++++++++++++++--------
 1 file changed, 170 insertions(+), 54 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 4f2824a..ce8fc3c 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -751,21 +751,131 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
 	submit_bio(bio);
 }
 
-static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
-{
-	unsigned long blk_idx = 0;
-	struct page *page = NULL;
+enum {
+	/* Indicate that the request has been allocated */
+	ZRAM_WB_REQUEST_ALLOCATED = 0,
+
+	/* the request has been processed by the block device layer */
+	ZRAM_WB_REQUEST_COMPLETED,
+};
+
+struct zram_wb_request {
+	struct completion *done;
+	unsigned long blk_idx;
+	struct page *page;
 	struct zram_pp_slot *pps;
 	struct bio_vec bio_vec;
 	struct bio bio;
-	int ret = 0, err;
-	u32 index;
+	unsigned long flags;
+};
 
-	page = alloc_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
+static int zram_writeback_complete(struct zram *zram, struct zram_wb_request *req)
+{
+	u32 index = 0;
+	int err;
 
-	while ((pps = select_pp_slot(ctl))) {
+	if (!test_and_clear_bit(ZRAM_WB_REQUEST_COMPLETED, &req->flags))
+		return 0;
+
+	err = blk_status_to_errno(req->bio.bi_status);
+	if (err)
+		return err;
+
+	index = req->pps->index;
+	atomic64_inc(&zram->stats.bd_writes);
+	zram_slot_lock(zram, index);
+	/*
+	 * Same as above, we release slot lock during writeback so
+	 * slot can change under us: slot_free() or slot_free() and
+	 * reallocation (zram_write_page()). In both cases slot loses
+	 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
+	 * ZRAM_PP_SLOT on such slots until current post-processing
+	 * finishes.
+	 */
+	if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
+		goto next;
+
+	zram_free_page(zram, index);
+	zram_set_flag(zram, index, ZRAM_WB);
+	zram_set_handle(zram, index, req->blk_idx);
+	req->blk_idx = 0;
+	atomic64_inc(&zram->stats.pages_stored);
+	spin_lock(&zram->wb_limit_lock);
+	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+		zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
+	spin_unlock(&zram->wb_limit_lock);
+
+next:
+	zram_slot_unlock(zram, index);
+	release_pp_slot(zram, req->pps);
+	req->pps = NULL;
+	return 0;
+}
+
+static void zram_writeback_endio(struct bio *bio)
+{
+	struct zram_wb_request *req = bio->bi_private;
+
+	set_bit(ZRAM_WB_REQUEST_COMPLETED, &req->flags);
+	clear_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags);
+	complete(req->done);
+}
+
+static struct zram_wb_request *zram_writeback_next_request(struct zram_wb_request *pool,
+	int pool_cnt, int *cnt_off)
+{
+	struct zram_wb_request *req = NULL;
+	int i = 0;
+
+	for (i = *cnt_off; i < pool_cnt + *cnt_off; i++) {
+		req = &pool[i % pool_cnt];
+		if (!req->page) {
+			/* This memory should be freed by the caller. */
+			req->page = alloc_page(GFP_KERNEL);
+			if (!req->page)
+				continue;
+		}
+
+		if (!test_and_set_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags)) {
+			*cnt_off = (i + 1) % pool_cnt;
+			return req;
+		}
+	}
+	return NULL;
+}
+
+#define ZRAM_WB_REQ_CNT (32)
+static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
+{
+	int ret = 0, err, i = 0, cnt_off = 0;
+	int req_pool_cnt = 0;
+	struct zram_wb_request req_prealloc[2] = {0};
+	struct zram_wb_request *req = NULL, *req_pool = NULL;
+	DECLARE_COMPLETION_ONSTACK(done);
+	u32 index = 0;
+	struct blk_plug plug;
+
+	/* allocate memory for req_pool */
+	req_pool = kzalloc(sizeof(*req) * ZRAM_WB_REQ_CNT, GFP_KERNEL);
+	if (req_pool) {
+		req_pool_cnt = ZRAM_WB_REQ_CNT;
+	} else {
+		req_pool = req_prealloc;
+		req_pool_cnt = ARRAY_SIZE(req_prealloc);
+	}
+
+	for (i = 0; i < req_pool_cnt; i++) {
+		req_pool[i].done = &done;
+		req_pool[i].flags = 0;
+	}
+	req = zram_writeback_next_request(req_pool, req_pool_cnt, &cnt_off);
+	if (!req) {
+		ret = -ENOMEM;
+		goto out_free_req_pool;
+	}
+
+	blk_start_plug(&plug);
+	while ((req->pps = select_pp_slot(ctl))) {
 		spin_lock(&zram->wb_limit_lock);
 		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
 			spin_unlock(&zram->wb_limit_lock);
@@ -774,15 +884,15 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		}
 		spin_unlock(&zram->wb_limit_lock);
 
-		if (!blk_idx) {
-			blk_idx = alloc_block_bdev(zram);
-			if (!blk_idx) {
+		if (!req->blk_idx) {
+			req->blk_idx = alloc_block_bdev(zram);
+			if (!req->blk_idx) {
 				ret = -ENOSPC;
 				break;
 			}
 		}
 
-		index = pps->index;
+		index = req->pps->index;
 		zram_slot_lock(zram, index);
 		/*
 		 * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
@@ -792,22 +902,32 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		 */
 		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
 			goto next;
-		if (zram_read_from_zspool(zram, page, index))
+		if (zram_read_from_zspool(zram, req->page, index))
 			goto next;
 		zram_slot_unlock(zram, index);
 
-		bio_init(&bio, zram->bdev, &bio_vec, 1,
+		bio_init(&req->bio, zram->bdev, &req->bio_vec, 1,
 			 REQ_OP_WRITE | REQ_SYNC);
-		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
-		__bio_add_page(&bio, page, PAGE_SIZE, 0);
-
-		/*
-		 * XXX: A single page IO would be inefficient for write
-		 * but it would be not bad as starter.
-		 */
-		err = submit_bio_wait(&bio);
+		req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
+		req->bio.bi_end_io = zram_writeback_endio;
+		req->bio.bi_private = req;
+		__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
+
+		list_del_init(&req->pps->entry);
+		submit_bio(&req->bio);
+
+		do {
+			req = zram_writeback_next_request(req_pool, req_pool_cnt, &cnt_off);
+			if (!req) {
+				blk_finish_plug(&plug);
+				wait_for_completion_io(&done);
+				blk_start_plug(&plug);
+			}
+		} while (!req);
+		err = zram_writeback_complete(zram, req);
 		if (err) {
-			release_pp_slot(zram, pps);
+			release_pp_slot(zram, req->pps);
+			req->pps = NULL;
 			/*
 			 * BIO errors are not fatal, we continue and simply
 			 * attempt to writeback the remaining objects (pages).
@@ -817,43 +937,39 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 			 * the most recent BIO error.
 			 */
 			ret = err;
-			continue;
 		}
+		cond_resched();
+		continue;
 
-		atomic64_inc(&zram->stats.bd_writes);
-		zram_slot_lock(zram, index);
-		/*
-		 * Same as above, we release slot lock during writeback so
-		 * slot can change under us: slot_free() or slot_free() and
-		 * reallocation (zram_write_page()). In both cases slot loses
-		 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
-		 * ZRAM_PP_SLOT on such slots until current post-processing
-		 * finishes.
-		 */
-		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
-			goto next;
-
-		zram_free_page(zram, index);
-		zram_set_flag(zram, index, ZRAM_WB);
-		zram_set_handle(zram, index, blk_idx);
-		blk_idx = 0;
-		atomic64_inc(&zram->stats.pages_stored);
-		spin_lock(&zram->wb_limit_lock);
-		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
-			zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
-		spin_unlock(&zram->wb_limit_lock);
 next:
 		zram_slot_unlock(zram, index);
-		release_pp_slot(zram, pps);
-
+		release_pp_slot(zram, req->pps);
+		req->pps = NULL;
 		cond_resched();
 	}
+	blk_finish_plug(&plug);
 
-	if (blk_idx)
-		free_block_bdev(zram, blk_idx);
-	if (page)
-		__free_page(page);
+	if (req)
+		clear_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags);
+	for (i = 0; i < req_pool_cnt; i++) {
+		while (test_bit(ZRAM_WB_REQUEST_ALLOCATED, &req_pool[i].flags))
+			wait_for_completion_io(&done);
+		err = zram_writeback_complete(zram, &req_pool[i]);
+		if (err) {
+			release_pp_slot(zram, req_pool[i].pps);
+			req->pps = NULL;
+			ret = err;
+		}
+
+		if (req_pool[i].blk_idx)
+			free_block_bdev(zram, req_pool[i].blk_idx);
+		if (req_pool[i].page)
+			__free_page(req_pool[i].page);
+	}
 
+out_free_req_pool:
+	if (req_pool != req_prealloc)
+		kfree(req_pool);
 	return ret;
 }
 
-- 
2.34.1



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] zram: Implement multi-page write-back
  2025-11-05  6:48   ` [PATCH v3] " Yuwen Chen
@ 2025-11-05 15:25     ` Jens Axboe
  2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
  2025-11-06  2:28       ` [PATCH v3] " Yuwen Chen
  0 siblings, 2 replies; 21+ messages in thread
From: Jens Axboe @ 2025-11-05 15:25 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: akpm, bgeffon, licayy, linux-block, linux-kernel, linux-mm,
	liumartin, minchan, richardycc, senozhatsky

On 11/4/25 11:48 PM, Yuwen Chen wrote:
> For block devices, sequential write performance is significantly
> better than random write. Currently, zram's write-back function
> only supports single-page operations, which fails to leverage
> the sequential write advantage and leads to suboptimal performance.
> This patch implements multi-page batch write-back for zram to
> leverage sequential write performance of block devices.
> After applying this patch, a large number of pages being merged
> into batch write operations can be observed via the following test
> code, which effectively improves write-back performance.
> 
> mount -t debugfs none /sys/kernel/debug/
> echo "block:block_bio_frontmerge" >> /sys/kernel/debug/tracing/set_event
> echo "block:block_bio_backmerge" >> /sys/kernel/debug/tracing/set_event
> cat /sys/kernel/debug/tracing/trace_pipe &
> echo "page_indexes=1-10000" > /sys/block/zram0/writeback
> 
> Signed-off-by: Yuwen Chen <ywen.chen@foxmail.com>
> Reviewed-by: Fengyu Lian <licayy@outlook.com>
> ---
> Changes in v3:
>   - Postpone the page allocation.
> Changes in v2:
>   - Rename some data structures.
>   - Fix an exception caused by accessing a null pointer.

Please either finish the patch before sending it out, or take your
time before posting again. Sending 3 versions in one day will just
make people ignore you.

This commit message is in dire need of some actual performance
results. This is a change for better performance, no? If so, you
should have some clear numbers in there describing where it's
better, and where it's worse (if appropriate).

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v4] zram: Implement multi-page write-back
  2025-11-05 15:25     ` Jens Axboe
@ 2025-11-06  1:49       ` Yuwen Chen
  2025-11-10  4:49         ` Sergey Senozhatsky
                           ` (7 more replies)
  2025-11-06  2:28       ` [PATCH v3] " Yuwen Chen
  1 sibling, 8 replies; 21+ messages in thread
From: Yuwen Chen @ 2025-11-06  1:49 UTC (permalink / raw)
  To: axboe
  Cc: akpm, bgeffon, licayy, linux-block, linux-kernel, linux-mm,
	liumartin, minchan, richardycc, senozhatsky, ywen.chen

For block devices, sequential write performance is significantly
better than random write. Currently, zram's write-back function
only supports single-page operations, which fails to leverage
the sequential write advantage and leads to suboptimal performance.

This patch implements multi-page batch write-back for zram to
leverage sequential write performance of block devices.

After applying this patch, a large number of pages being merged
into batch write operations can be observed via the following test
code, which effectively improves write-back performance.

We used the following instructions to conduct a performance test
on the write-back function of zram in the QEMU environment.
$ echo "/dev/sdb" > /sys/block/zram0/backing_dev
$ echo "1024000000" > /sys/block/zram0/disksize
$ dd if=/dev/random of=/dev/zram0
$ time echo "page_indexes=1-100000" > /sys/block/zram0/writeback

before modification:
real	0m 16.62s
user	0m 0.00s
sys	0m 5.98s

real	0m 15.38s
user	0m 0.00s
sys	0m 5.31s

real	0m 15.58s
user	0m 0.00s
sys	0m 5.49s

after modification:
real	0m 1.36s
user	0m 0.00s
sys	0m 1.13s

real	0m 1.36s
user	0m 0.00s
sys	0m 1.11s

real	0m 1.39s
user	0m 0.00s
sys	0m 1.16s

Signed-off-by: Yuwen Chen <ywen.chen@foxmail.com>
Reviewed-by: Fengyu Lian <licayy@outlook.com>
---
Changes in v4:
  - Add performance test data.
Changes in v3:
  - Postpone the page allocation.
Changes in v2:
  - Rename some data structures.
  - Fix an exception caused by accessing a null pointer.
---
 drivers/block/zram/zram_drv.c | 224 ++++++++++++++++++++++++++--------
 1 file changed, 170 insertions(+), 54 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 4f2824a..ce8fc3c 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -751,21 +751,131 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
 	submit_bio(bio);
 }
 
-static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
-{
-	unsigned long blk_idx = 0;
-	struct page *page = NULL;
+enum {
+	/* Indicate that the request has been allocated */
+	ZRAM_WB_REQUEST_ALLOCATED = 0,
+
+	/* the request has been processed by the block device layer */
+	ZRAM_WB_REQUEST_COMPLETED,
+};
+
+struct zram_wb_request {
+	struct completion *done;
+	unsigned long blk_idx;
+	struct page *page;
 	struct zram_pp_slot *pps;
 	struct bio_vec bio_vec;
 	struct bio bio;
-	int ret = 0, err;
-	u32 index;
+	unsigned long flags;
+};
 
-	page = alloc_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
+static int zram_writeback_complete(struct zram *zram, struct zram_wb_request *req)
+{
+	u32 index = 0;
+	int err;
 
-	while ((pps = select_pp_slot(ctl))) {
+	if (!test_and_clear_bit(ZRAM_WB_REQUEST_COMPLETED, &req->flags))
+		return 0;
+
+	err = blk_status_to_errno(req->bio.bi_status);
+	if (err)
+		return err;
+
+	index = req->pps->index;
+	atomic64_inc(&zram->stats.bd_writes);
+	zram_slot_lock(zram, index);
+	/*
+	 * Same as above, we release slot lock during writeback so
+	 * slot can change under us: slot_free() or slot_free() and
+	 * reallocation (zram_write_page()). In both cases slot loses
+	 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
+	 * ZRAM_PP_SLOT on such slots until current post-processing
+	 * finishes.
+	 */
+	if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
+		goto next;
+
+	zram_free_page(zram, index);
+	zram_set_flag(zram, index, ZRAM_WB);
+	zram_set_handle(zram, index, req->blk_idx);
+	req->blk_idx = 0;
+	atomic64_inc(&zram->stats.pages_stored);
+	spin_lock(&zram->wb_limit_lock);
+	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+		zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
+	spin_unlock(&zram->wb_limit_lock);
+
+next:
+	zram_slot_unlock(zram, index);
+	release_pp_slot(zram, req->pps);
+	req->pps = NULL;
+	return 0;
+}
+
+static void zram_writeback_endio(struct bio *bio)
+{
+	struct zram_wb_request *req = bio->bi_private;
+
+	set_bit(ZRAM_WB_REQUEST_COMPLETED, &req->flags);
+	clear_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags);
+	complete(req->done);
+}
+
+static struct zram_wb_request *zram_writeback_next_request(struct zram_wb_request *pool,
+	int pool_cnt, int *cnt_off)
+{
+	struct zram_wb_request *req = NULL;
+	int i = 0;
+
+	for (i = *cnt_off; i < pool_cnt + *cnt_off; i++) {
+		req = &pool[i % pool_cnt];
+		if (!req->page) {
+			/* This memory should be freed by the caller. */
+			req->page = alloc_page(GFP_KERNEL);
+			if (!req->page)
+				continue;
+		}
+
+		if (!test_and_set_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags)) {
+			*cnt_off = (i + 1) % pool_cnt;
+			return req;
+		}
+	}
+	return NULL;
+}
+
+#define ZRAM_WB_REQ_CNT (32)
+static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
+{
+	int ret = 0, err, i = 0, cnt_off = 0;
+	int req_pool_cnt = 0;
+	struct zram_wb_request req_prealloc[2] = {0};
+	struct zram_wb_request *req = NULL, *req_pool = NULL;
+	DECLARE_COMPLETION_ONSTACK(done);
+	u32 index = 0;
+	struct blk_plug plug;
+
+	/* allocate memory for req_pool */
+	req_pool = kzalloc(sizeof(*req) * ZRAM_WB_REQ_CNT, GFP_KERNEL);
+	if (req_pool) {
+		req_pool_cnt = ZRAM_WB_REQ_CNT;
+	} else {
+		req_pool = req_prealloc;
+		req_pool_cnt = ARRAY_SIZE(req_prealloc);
+	}
+
+	for (i = 0; i < req_pool_cnt; i++) {
+		req_pool[i].done = &done;
+		req_pool[i].flags = 0;
+	}
+	req = zram_writeback_next_request(req_pool, req_pool_cnt, &cnt_off);
+	if (!req) {
+		ret = -ENOMEM;
+		goto out_free_req_pool;
+	}
+
+	blk_start_plug(&plug);
+	while ((req->pps = select_pp_slot(ctl))) {
 		spin_lock(&zram->wb_limit_lock);
 		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
 			spin_unlock(&zram->wb_limit_lock);
@@ -774,15 +884,15 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		}
 		spin_unlock(&zram->wb_limit_lock);
 
-		if (!blk_idx) {
-			blk_idx = alloc_block_bdev(zram);
-			if (!blk_idx) {
+		if (!req->blk_idx) {
+			req->blk_idx = alloc_block_bdev(zram);
+			if (!req->blk_idx) {
 				ret = -ENOSPC;
 				break;
 			}
 		}
 
-		index = pps->index;
+		index = req->pps->index;
 		zram_slot_lock(zram, index);
 		/*
 		 * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
@@ -792,22 +902,32 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		 */
 		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
 			goto next;
-		if (zram_read_from_zspool(zram, page, index))
+		if (zram_read_from_zspool(zram, req->page, index))
 			goto next;
 		zram_slot_unlock(zram, index);
 
-		bio_init(&bio, zram->bdev, &bio_vec, 1,
+		bio_init(&req->bio, zram->bdev, &req->bio_vec, 1,
 			 REQ_OP_WRITE | REQ_SYNC);
-		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
-		__bio_add_page(&bio, page, PAGE_SIZE, 0);
-
-		/*
-		 * XXX: A single page IO would be inefficient for write
-		 * but it would be not bad as starter.
-		 */
-		err = submit_bio_wait(&bio);
+		req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
+		req->bio.bi_end_io = zram_writeback_endio;
+		req->bio.bi_private = req;
+		__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
+
+		list_del_init(&req->pps->entry);
+		submit_bio(&req->bio);
+
+		do {
+			req = zram_writeback_next_request(req_pool, req_pool_cnt, &cnt_off);
+			if (!req) {
+				blk_finish_plug(&plug);
+				wait_for_completion_io(&done);
+				blk_start_plug(&plug);
+			}
+		} while (!req);
+		err = zram_writeback_complete(zram, req);
 		if (err) {
-			release_pp_slot(zram, pps);
+			release_pp_slot(zram, req->pps);
+			req->pps = NULL;
 			/*
 			 * BIO errors are not fatal, we continue and simply
 			 * attempt to writeback the remaining objects (pages).
@@ -817,43 +937,39 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 			 * the most recent BIO error.
 			 */
 			ret = err;
-			continue;
 		}
+		cond_resched();
+		continue;
 
-		atomic64_inc(&zram->stats.bd_writes);
-		zram_slot_lock(zram, index);
-		/*
-		 * Same as above, we release slot lock during writeback so
-		 * slot can change under us: slot_free() or slot_free() and
-		 * reallocation (zram_write_page()). In both cases slot loses
-		 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
-		 * ZRAM_PP_SLOT on such slots until current post-processing
-		 * finishes.
-		 */
-		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
-			goto next;
-
-		zram_free_page(zram, index);
-		zram_set_flag(zram, index, ZRAM_WB);
-		zram_set_handle(zram, index, blk_idx);
-		blk_idx = 0;
-		atomic64_inc(&zram->stats.pages_stored);
-		spin_lock(&zram->wb_limit_lock);
-		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
-			zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
-		spin_unlock(&zram->wb_limit_lock);
 next:
 		zram_slot_unlock(zram, index);
-		release_pp_slot(zram, pps);
-
+		release_pp_slot(zram, req->pps);
+		req->pps = NULL;
 		cond_resched();
 	}
+	blk_finish_plug(&plug);
 
-	if (blk_idx)
-		free_block_bdev(zram, blk_idx);
-	if (page)
-		__free_page(page);
+	if (req)
+		clear_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags);
+	for (i = 0; i < req_pool_cnt; i++) {
+		while (test_bit(ZRAM_WB_REQUEST_ALLOCATED, &req_pool[i].flags))
+			wait_for_completion_io(&done);
+		err = zram_writeback_complete(zram, &req_pool[i]);
+		if (err) {
+			release_pp_slot(zram, req_pool[i].pps);
+			req->pps = NULL;
+			ret = err;
+		}
+
+		if (req_pool[i].blk_idx)
+			free_block_bdev(zram, req_pool[i].blk_idx);
+		if (req_pool[i].page)
+			__free_page(req_pool[i].page);
+	}
 
+out_free_req_pool:
+	if (req_pool != req_prealloc)
+		kfree(req_pool);
 	return ret;
 }
 
-- 
2.34.1



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
@ 2025-11-10  4:49         ` Sergey Senozhatsky
  2025-11-10  7:16           ` Yuwen Chen
  2025-11-12  5:18         ` Sergey Senozhatsky
                           ` (6 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-10  4:49 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: axboe, akpm, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, senozhatsky

On (25/11/06 09:49), Yuwen Chen wrote:
> For block devices, sequential write performance is significantly
> better than random write. Currently, zram's write-back function
> only supports single-page operations, which fails to leverage
> the sequential write advantage and leads to suboptimal performance.

As a side note:
You almost never do sequential writes to the backing device. The
thing is, e.g. when zram is used as swap, page faults happen randomly
and free up (slot-free) random page-size chunks (so random bits in
zram->bitmap become clear), which then get overwritten (zram simply
picks the first available bit from zram->bitmap) during next writeback.
There is nothing sequential about that, in systems with sufficiently
large uptime and sufficiently frequent writeback/readback events
writeback bitmap becomes sparse, which results in random IO, so your
test tests an ideal case that almost never happens in practice.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-10  4:49         ` Sergey Senozhatsky
@ 2025-11-10  7:16           ` Yuwen Chen
  2025-11-12  5:16             ` Sergey Senozhatsky
  0 siblings, 1 reply; 21+ messages in thread
From: Yuwen Chen @ 2025-11-10  7:16 UTC (permalink / raw)
  To: senozhatsky
  Cc: akpm, axboe, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, ywen.chen

On 10 Nov 2025 13:49:26 +0900, Sergey Senozhatsky wrote:
> As a side note:
> You almost never do sequential writes to the backing device. The
> thing is, e.g. when zram is used as swap, page faults happen randomly
> and free up (slot-free) random page-size chunks (so random bits in
> zram->bitmap become clear), which then get overwritten (zram simply
> picks the first available bit from zram->bitmap) during next writeback.
> There is nothing sequential about that, in systems with sufficiently
> large uptime and sufficiently frequent writeback/readback events
> writeback bitmap becomes sparse, which results in random IO, so your
> test tests an ideal case that almost never happens in practice.

Thank you very much for your reply.
As you mentioned, the current test data was measured under the condition
that all writes were sequential writes. In a normal user environment,
there are a large number of random writes. However, the multiple
concurrent submissions implemented in this submission still have performance
advantages for storage devices. I artificially created the worst - case
scenario (all writes are random writes) with the following code:

for (int i = 0; i < nr_pages; i++)
    alloc_block_bdev(zram);

for (int i = 0; i < nr_pages; i += 2)
    free_block_bdev(zram, i);

On the physical machine, the measured data is as follows:
before modification:
real	0m0.624s
user	0m0.000s
sys	0m0.347s

real	0m0.663s
user	0m0.001s
sys	0m0.354s

real	0m0.635s
user	0m0.000s
sys	0m0.335s

after modification:
real	0m0.340s
user	0m0.000s
sys	0m0.239s

real	0m0.326s
user	0m0.000s
sys	0m0.230s

real	0m0.313s
user	0m0.000s
sys	0m0.223s

The test script is as follows:
# mknod /dev/loop45 b 7 45
# losetup /dev/loop45 ./zram_writeback.img
# echo "/dev/loop45" > /sys/block/zram0/backing_dev
# echo "1024000000" > /sys/block/zram0/disksize
# dd if=/dev/random of=/dev/zram0
# time echo "page_indexes=1-100000" > /sys/block/zram0/writeback

Thank you again for your reply.



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-10  7:16           ` Yuwen Chen
@ 2025-11-12  5:16             ` Sergey Senozhatsky
  0 siblings, 0 replies; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-12  5:16 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: senozhatsky, akpm, axboe, bgeffon, licayy, linux-block,
	linux-kernel, linux-mm, liumartin, minchan, richardycc

On (25/11/10 15:16), Yuwen Chen wrote:
> On 10 Nov 2025 13:49:26 +0900, Sergey Senozhatsky wrote:
> > As a side note:
> > You almost never do sequential writes to the backing device. The
> > thing is, e.g. when zram is used as swap, page faults happen randomly
> > and free up (slot-free) random page-size chunks (so random bits in
> > zram->bitmap become clear), which then get overwritten (zram simply
> > picks the first available bit from zram->bitmap) during next writeback.
> > There is nothing sequential about that, in systems with sufficiently
> > large uptime and sufficiently frequent writeback/readback events
> > writeback bitmap becomes sparse, which results in random IO, so your
> > test tests an ideal case that almost never happens in practice.
> 
> Thank you very much for your reply.
> As you mentioned, the current test data was measured under the condition
> that all writes were sequential writes. In a normal user environment,
> there are a large number of random writes. However, the multiple
> concurrent submissions implemented in this submission still have performance
> advantages for storage devices. I artificially created the worst - case
> scenario (all writes are random writes) with the following code:
> 
> for (int i = 0; i < nr_pages; i++)
>     alloc_block_bdev(zram);
> 
> for (int i = 0; i < nr_pages; i += 2)
>     free_block_bdev(zram, i);

Well, technically, I guess that's not the worst case.  The worst case
is when writeback races with page-faults/slot-free events, which happen
on opposite sides of the writeback device and which clear ->bitmap bits
on the opposite sides, so for writeback you alternate all the time and
pick either head or tail slots (->bitmap bits).  But you don't need to
test it, it's just a note.

The thing that I'm curious about is why does it help for flash storage?
It's not a spinning disk, where seek times dominate the IO time.

> On the physical machine, the measured data is as follows:
> before modification:
> real	0m0.624s
> user	0m0.000s
> sys	0m0.347s
> 
> real	0m0.663s
> user	0m0.001s
> sys	0m0.354s
> 
> real	0m0.635s
> user	0m0.000s
> sys	0m0.335s
> 
> after modification:
> real	0m0.340s
> user	0m0.000s
> sys	0m0.239s
> 
> real	0m0.326s
> user	0m0.000s
> sys	0m0.230s
> 
> real	0m0.313s
> user	0m0.000s
> sys	0m0.223s

Thanks for testing.

My next question is: what problem do you solve with this?  I mean,
do you use it production (somewhere).  If so, do you have a rough
number of how many MiBs you writeback and how often, and what's the
performance impact of this patch.  Again, if you use it in production.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
  2025-11-10  4:49         ` Sergey Senozhatsky
@ 2025-11-12  5:18         ` Sergey Senozhatsky
  2025-11-12  6:57           ` Yuwen Chen
  2025-11-13  2:04         ` Sergey Senozhatsky
                           ` (5 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-12  5:18 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: axboe, akpm, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, senozhatsky

On (25/11/06 09:49), Yuwen Chen wrote:
[..]
> +	blk_start_plug(&plug);
> +	while ((req->pps = select_pp_slot(ctl))) {
>  		spin_lock(&zram->wb_limit_lock);
>  		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
>  			spin_unlock(&zram->wb_limit_lock);
> @@ -774,15 +884,15 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
>  		}
>  		spin_unlock(&zram->wb_limit_lock);
>  
> -		if (!blk_idx) {
> -			blk_idx = alloc_block_bdev(zram);
> -			if (!blk_idx) {
> +		if (!req->blk_idx) {
> +			req->blk_idx = alloc_block_bdev(zram);
> +			if (!req->blk_idx) {
>  				ret = -ENOSPC;
>  				break;
>  			}
>  		}
>  
> -		index = pps->index;
> +		index = req->pps->index;
>  		zram_slot_lock(zram, index);
>  		/*
>  		 * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
> @@ -792,22 +902,32 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
>  		 */
>  		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
>  			goto next;
> -		if (zram_read_from_zspool(zram, page, index))
> +		if (zram_read_from_zspool(zram, req->page, index))
>  			goto next;
>  		zram_slot_unlock(zram, index);
>  
> -		bio_init(&bio, zram->bdev, &bio_vec, 1,
> +		bio_init(&req->bio, zram->bdev, &req->bio_vec, 1,
>  			 REQ_OP_WRITE | REQ_SYNC);
> -		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
> -		__bio_add_page(&bio, page, PAGE_SIZE, 0);
> -
> -		/*
> -		 * XXX: A single page IO would be inefficient for write
> -		 * but it would be not bad as starter.
> -		 */
> -		err = submit_bio_wait(&bio);
> +		req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
> +		req->bio.bi_end_io = zram_writeback_endio;
> +		req->bio.bi_private = req;
> +		__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
> +
> +		list_del_init(&req->pps->entry);
> +		submit_bio(&req->bio);
> +
> +		do {
> +			req = zram_writeback_next_request(req_pool, req_pool_cnt, &cnt_off);
> +			if (!req) {
> +				blk_finish_plug(&plug);
> +				wait_for_completion_io(&done);
> +				blk_start_plug(&plug);
> +			}
> +		} while (!req);

Why do you do this do-while loop here?


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-12  5:18         ` Sergey Senozhatsky
@ 2025-11-12  6:57           ` Yuwen Chen
  0 siblings, 0 replies; 21+ messages in thread
From: Yuwen Chen @ 2025-11-12  6:57 UTC (permalink / raw)
  To: senozhatsky
  Cc: akpm, axboe, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, ywen.chen

On Wed, 12 Nov 2025 14:16:20 +0900, Sergey Senozhatsky wrote:
> The thing that I'm curious about is why does it help for flash storage?
> It's not a spinning disk, where seek times dominate the IO time.

1. For flash-based storage devices such as UFS and NVMe, the Command Queue
mechanism is implemented. Submitting multiple random write requests can
fully utilize the bandwidth of their buses.

2. When we submit consecutive pages separately instead of submitting them
continuously together, the write amplification problem is more likely to
occur. This is because there is an LBA (Logical Block Addressing) table in UFS.

3. Sequential writing has lower requirements for the bus bandwidth.

> My next question is: what problem do you solve with this?  I mean,
> do you use it production (somewhere).  If so, do you have a rough
> number of how many MiBs you writeback and how often, and what's the
> performance impact of this patch.  Again, if you use it in production.

We haven't deployed this commit in the product yet. We're now deploying
it on mobile phones running the Android system. Our ideas are as follows:

1. When an app switches to the background, use process_madvise to swap out
the app's anonymous pages to zram. When the system is idle, cache the app
to the external UFS through the writeback interface.

2. When the system memory is tight and the IO load is low, use the IO load to
improve the memory release speed.

On Wed, 12 Nov 2025 14:18:01 +0900, Sergey Senozhatsky wrote:
> Why do you do this do-while loop here?

When there are no free zram_wb_request structures in the req_pool,
zram_writeback_next_request will return NULL. In this case, you need
to retry once to obtain a zram_wb_request.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
  2025-11-10  4:49         ` Sergey Senozhatsky
  2025-11-12  5:18         ` Sergey Senozhatsky
@ 2025-11-13  2:04         ` Sergey Senozhatsky
  2025-11-13  5:10           ` Sergey Senozhatsky
  2025-11-13  2:11         ` Sergey Senozhatsky
                           ` (4 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-13  2:04 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: axboe, akpm, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, senozhatsky

On (25/11/06 09:49), Yuwen Chen wrote:
> +
> +#define ZRAM_WB_REQ_CNT (32)
> +

How was this number chosen?  Did you try lower/higher values?
I think we might want this to be runtime tunable via sysfs, e.g.
writeback_batch_size attr, with min value of 1.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-13  2:04         ` Sergey Senozhatsky
@ 2025-11-13  5:10           ` Sergey Senozhatsky
  0 siblings, 0 replies; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-13  5:10 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: axboe, akpm, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, Sergey Senozhatsky

On (25/11/13 11:04), Sergey Senozhatsky wrote:
> On (25/11/06 09:49), Yuwen Chen wrote:
> > +
> > +#define ZRAM_WB_REQ_CNT (32)
> > +
> 
> How was this number chosen?  Did you try lower/higher values?
> I think we might want this to be runtime tunable via sysfs, e.g.
> writeback_batch_size attr, with min value of 1.

So I think something like this should work:

---
 drivers/block/zram/zram_drv.c | 50 ++++++++++++++++++++++++++++++-----
 drivers/block/zram/zram_drv.h |  1 +
 2 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 10b6e57603a0..cf92d4e8ca9b 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -570,6 +570,44 @@ static ssize_t writeback_limit_show(struct device *dev,
 	return sysfs_emit(buf, "%llu\n", val);
 }
 
+static ssize_t writeback_batch_size_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	u32 val;
+	ssize_t ret = -EINVAL;
+
+	if (kstrtouint(buf, 10, &val))
+		return ret;
+
+	if (!val)
+		val = 1;
+
+	down_read(&zram->init_lock);
+	zram->wb_batch_size = val;
+	up_read(&zram->init_lock);
+	ret = len;
+
+	return ret;
+}
+
+static ssize_t writeback_batch_size_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	u32 val;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	spin_lock(&zram->wb_limit_lock);
+	val = zram->wb_batch_size;
+	spin_unlock(&zram->wb_limit_lock);
+	up_read(&zram->init_lock);
+
+	return sysfs_emit(buf, "%u\n", val);
+}
+
 static void reset_bdev(struct zram *zram)
 {
 	if (!zram->backing_dev)
@@ -776,10 +814,7 @@ static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
 	kfree(wb_ctl);
 }
 
-/* should be a module param */
-#define ZRAM_WB_REQ_CNT (32)
-
-static struct zram_wb_ctl *init_wb_ctl(void)
+static struct zram_wb_ctl *init_wb_ctl(struct zram *zram)
 {
 	struct zram_wb_ctl *wb_ctl;
 	int i;
@@ -793,7 +828,7 @@ static struct zram_wb_ctl *init_wb_ctl(void)
 	atomic_set(&wb_ctl->num_inflight, 0);
 	init_completion(&wb_ctl->done);
 
-	for (i = 0; i < ZRAM_WB_REQ_CNT; i++) {
+	for (i = 0; i < zram->wb_batch_size; i++) {
 		struct zram_wb_req *req;
 
 		req = kmalloc(sizeof(*req), GFP_KERNEL);
@@ -1145,7 +1180,7 @@ static ssize_t writeback_store(struct device *dev,
 		goto release_init_lock;
 	}
 
-	wb_ctl = init_wb_ctl();
+	wb_ctl = init_wb_ctl(zram);
 	if (!wb_ctl) {
 		ret = -ENOMEM;
 		goto release_init_lock;
@@ -2786,6 +2821,7 @@ static DEVICE_ATTR_RW(backing_dev);
 static DEVICE_ATTR_WO(writeback);
 static DEVICE_ATTR_RW(writeback_limit);
 static DEVICE_ATTR_RW(writeback_limit_enable);
+static DEVICE_ATTR_RW(writeback_batch_size);
 #endif
 #ifdef CONFIG_ZRAM_MULTI_COMP
 static DEVICE_ATTR_RW(recomp_algorithm);
@@ -2807,6 +2843,7 @@ static struct attribute *zram_disk_attrs[] = {
 	&dev_attr_writeback.attr,
 	&dev_attr_writeback_limit.attr,
 	&dev_attr_writeback_limit_enable.attr,
+	&dev_attr_writeback_batch_size.attr,
 #endif
 	&dev_attr_io_stat.attr,
 	&dev_attr_mm_stat.attr,
@@ -2868,6 +2905,7 @@ static int zram_add(void)
 
 	init_rwsem(&zram->init_lock);
 #ifdef CONFIG_ZRAM_WRITEBACK
+	zram->wb_batch_size = 1;
 	spin_lock_init(&zram->wb_limit_lock);
 #endif
 
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 6cee93f9c0d0..1a647f42c1a4 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -129,6 +129,7 @@ struct zram {
 	struct file *backing_dev;
 	spinlock_t wb_limit_lock;
 	bool wb_limit_enable;
+	u32 wb_batch_size;
 	u64 bd_wb_limit;
 	struct block_device *bdev;
 	unsigned long *bitmap;
-- 
2.51.2.1041.gc1ab5b90ca-goog


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
                           ` (2 preceding siblings ...)
  2025-11-13  2:04         ` Sergey Senozhatsky
@ 2025-11-13  2:11         ` Sergey Senozhatsky
  2025-11-13  2:20         ` Sergey Senozhatsky
                           ` (3 subsequent siblings)
  7 siblings, 0 replies; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-13  2:11 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: axboe, akpm, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, senozhatsky

On (25/11/06 09:49), Yuwen Chen wrote:
[..]
> +static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
> +{
[..]
> +	struct zram_wb_request req_prealloc[2] = {0};
[..]
> +	/* allocate memory for req_pool */
> +	req_pool = kzalloc(sizeof(*req) * ZRAM_WB_REQ_CNT, GFP_KERNEL);
> +	if (req_pool) {
> +		req_pool_cnt = ZRAM_WB_REQ_CNT;
> +	} else {
> +		req_pool = req_prealloc;
> +		req_pool_cnt = ARRAY_SIZE(req_prealloc);
> +	}

This looks like a preliminary optimization, I'd probably prefer
to not have req_prealloc entirely.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
                           ` (3 preceding siblings ...)
  2025-11-13  2:11         ` Sergey Senozhatsky
@ 2025-11-13  2:20         ` Sergey Senozhatsky
  2025-11-13  4:44           ` Sergey Senozhatsky
  2025-11-13  7:55           ` Yuwen Chen
  2025-11-13  5:40         ` Minchan Kim
                           ` (2 subsequent siblings)
  7 siblings, 2 replies; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-13  2:20 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: axboe, akpm, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, senozhatsky

On (25/11/06 09:49), Yuwen Chen wrote:
> +static struct zram_wb_request *zram_writeback_next_request(struct zram_wb_request *pool,
> +	int pool_cnt, int *cnt_off)
> +{
> +	struct zram_wb_request *req = NULL;
> +	int i = 0;
> +
> +	for (i = *cnt_off; i < pool_cnt + *cnt_off; i++) {
> +		req = &pool[i % pool_cnt];
> +		if (!req->page) {
> +			/* This memory should be freed by the caller. */
> +			req->page = alloc_page(GFP_KERNEL);
> +			if (!req->page)
> +				continue;
> +		}
> +
> +		if (!test_and_set_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags)) {
> +			*cnt_off = (i + 1) % pool_cnt;
> +			return req;
> +		}
> +	}
> +	return NULL;
> +}

So I wonder if things will look simpler (is this the word I'm looking
for?) if you just have two lists for requests: one list for completed/idle
requests and one list for in-flight requests (and you move requests
around accordingly).  Then you don't need to iterate the pool and check
flags, you just can check list_empty(&idle_requests) and take the first
(front) element.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-13  2:20         ` Sergey Senozhatsky
@ 2025-11-13  4:44           ` Sergey Senozhatsky
  2025-11-13  7:55           ` Yuwen Chen
  1 sibling, 0 replies; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-13  4:44 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: axboe, akpm, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, Sergey Senozhatsky

On (25/11/13 11:20), Sergey Senozhatsky wrote:
> On (25/11/06 09:49), Yuwen Chen wrote:
> > +static struct zram_wb_request *zram_writeback_next_request(struct zram_wb_request *pool,
> > +	int pool_cnt, int *cnt_off)
> > +{
> > +	struct zram_wb_request *req = NULL;
> > +	int i = 0;
> > +
> > +	for (i = *cnt_off; i < pool_cnt + *cnt_off; i++) {
> > +		req = &pool[i % pool_cnt];
> > +		if (!req->page) {
> > +			/* This memory should be freed by the caller. */
> > +			req->page = alloc_page(GFP_KERNEL);
> > +			if (!req->page)
> > +				continue;
> > +		}
> > +
> > +		if (!test_and_set_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags)) {
> > +			*cnt_off = (i + 1) % pool_cnt;
> > +			return req;
> > +		}
> > +	}
> > +	return NULL;
> > +}
> 
> So I wonder if things will look simpler (is this the word I'm looking
> for?) if you just have two lists for requests: one list for completed/idle
> requests and one list for in-flight requests (and you move requests
> around accordingly).  Then you don't need to iterate the pool and check
> flags, you just can check list_empty(&idle_requests) and take the first
> (front) element.


OK, so this is (look below) roughly what I want it to look like.
It's closer (in sense of logic and approach) to what we do for
post-processing, and I think it's easier to understand.

1. factored out a new structure zram_wb_ctl, similar to zram_pp_ctl;
2. zram_wb_ctl is initialized outside of zram_writeback_slots(),
   because this function is purely about pp-slot writeback. wb_ctl
   is passed to zram_writeback_slots() as an argument, just like pp_ctl;
3. requests move between two lists: idle and inflight requests;
4. factored out and unified the wait-for-completion logic;

TODO: writeback_batch_size device attr.

Please take a look?  Does this make sense to you, do this
work for you?

---
 drivers/block/zram/zram_drv.c | 312 ++++++++++++++++++++++++++--------
 1 file changed, 244 insertions(+), 68 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a43074657531..60c3a61c7ee8 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -734,20 +734,195 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
 	submit_bio(bio);
 }
 
-static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
-{
-	unsigned long blk_idx = 0;
-	struct page *page = NULL;
+struct zram_wb_ctl {
+	struct list_head idle_reqs;
+	struct list_head inflight_reqs;
+
+	atomic_t num_inflight;
+	struct completion done;
+	struct blk_plug plug;
+};
+
+struct zram_wb_req {
+	unsigned long blk_idx;
+	struct page *page;
 	struct zram_pp_slot *pps;
 	struct bio_vec bio_vec;
 	struct bio bio;
-	int ret = 0, err;
+
+	struct list_head entry;
+};
+
+static void release_wb_req(struct zram_wb_req *req)
+{
+	__free_page(req->page);
+	kfree(req);
+}
+
+static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
+{
+	/* We should never have inflight requests at this point */
+	WARN_ON(!list_empty(&wb_ctl->inflight_reqs));
+
+	while (!list_empty(&wb_ctl->idle_reqs)) {
+		struct zram_wb_req *req;
+
+		req = list_first_entry(&wb_ctl->idle_reqs,
+				       struct zram_wb_req, entry);
+		list_del(&req->entry);
+		release_wb_req(req);
+	}
+
+	kfree(wb_ctl);
+}
+
+/* should be a module param */
+#define ZRAM_WB_REQ_CNT (32)
+
+static struct zram_wb_ctl *init_wb_ctl(void)
+{
+	struct zram_wb_ctl *wb_ctl;
+	int i;
+
+	wb_ctl = kmalloc(sizeof(*wb_ctl), GFP_KERNEL);
+	if (!wb_ctl)
+		return NULL;
+
+	INIT_LIST_HEAD(&wb_ctl->idle_reqs);
+	INIT_LIST_HEAD(&wb_ctl->inflight_reqs);
+	atomic_set(&wb_ctl->num_inflight, 0);
+	init_completion(&wb_ctl->done);
+
+	for (i = 0; i < ZRAM_WB_REQ_CNT; i++) {
+		struct zram_wb_req *req;
+
+		req = kmalloc(sizeof(*req), GFP_KERNEL);
+		if (!req)
+			goto release_wb_ctl;
+
+		req->page = alloc_page(GFP_KERNEL);
+		if (!req->page) {
+			kfree(req);
+			goto release_wb_ctl;
+		}
+
+		INIT_LIST_HEAD(&req->entry);
+		list_add(&req->entry, &wb_ctl->idle_reqs);
+	}
+
+	return wb_ctl;
+
+release_wb_ctl:
+	release_wb_ctl(wb_ctl);
+	return NULL;
+}
+
+static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
+{
 	u32 index;
+	int err;
 
-	page = alloc_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
+	index = req->pps->index;
+	release_pp_slot(zram, req->pps);
+	req->pps = NULL;
+
+	err = blk_status_to_errno(req->bio.bi_status);
+	if (err)
+		return err;
+
+	atomic64_inc(&zram->stats.bd_writes);
+	zram_slot_lock(zram, index);
+	/*
+	 * We release slot lock during writeback so slot can change under us:
+	 * slot_free() or slot_free() and zram_write_page(). In both cases
+	 * slot loses ZRAM_PP_SLOT flag. No concurrent post-processing can
+	 * set ZRAM_PP_SLOT on such slots until current post-processing
+	 * finishes.
+	 */
+	if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
+		goto out;
+
+	zram_free_page(zram, index);
+	zram_set_flag(zram, index, ZRAM_WB);
+	zram_set_handle(zram, index, req->blk_idx);
+	atomic64_inc(&zram->stats.pages_stored);
+	spin_lock(&zram->wb_limit_lock);
+	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+		zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
+	spin_unlock(&zram->wb_limit_lock);
+
+out:
+	zram_slot_unlock(zram, index);
+	return 0;
+}
+
+static void zram_writeback_endio(struct bio *bio)
+{
+	struct zram_wb_ctl *wb_ctl = bio->bi_private;
+
+	if (atomic_dec_return(&wb_ctl->num_inflight) == 0)
+		complete(&wb_ctl->done);
+}
+
+static void zram_submit_wb_request(struct zram_wb_ctl *wb_ctl,
+				   struct zram_wb_req *req)
+{
+	atomic_inc(&wb_ctl->num_inflight);
+	list_add_tail(&req->entry, &wb_ctl->inflight_reqs);
+	submit_bio(&req->bio);
+}
+
+static struct zram_wb_req *select_idle_req(struct zram_wb_ctl *wb_ctl)
+{
+	struct zram_wb_req *req = NULL;
+
+	if (!list_empty(&wb_ctl->idle_reqs)) {
+		req = list_first_entry(&wb_ctl->idle_reqs,
+				       struct zram_wb_req, entry);
+		list_del(&req->entry);
+	}
 
+	return req;
+}
+
+static int zram_wb_wait_for_completion(struct zram *zram,
+				       struct zram_wb_ctl *wb_ctl)
+{
+	int ret = 0;
+
+	if (atomic_read(&wb_ctl->num_inflight) == 0)
+		return 0;
+
+	wait_for_completion_io(&wb_ctl->done);
+	reinit_completion(&wb_ctl->done);
+
+	while (!list_empty(&wb_ctl->inflight_reqs)) {
+		struct zram_wb_req *req;
+		int err;
+
+		req = list_first_entry(&wb_ctl->inflight_reqs,
+				       struct zram_wb_req, entry);
+		list_move(&req->entry, &wb_ctl->idle_reqs);
+
+		err = zram_writeback_complete(zram, req);
+		if (err)
+			ret = err;
+	}
+
+	return ret;
+}
+
+static int zram_writeback_slots(struct zram *zram,
+				struct zram_wb_ctl *wb_ctl,
+				struct zram_pp_ctl *ctl)
+{
+	struct zram_wb_req *req = NULL;
+	unsigned long blk_idx = 0;
+	struct zram_pp_slot *pps;
+	int ret = 0, err;
+	u32 index = 0;
+
+	blk_start_plug(&wb_ctl->plug);
 	while ((pps = select_pp_slot(ctl))) {
 		spin_lock(&zram->wb_limit_lock);
 		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
@@ -757,15 +932,34 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		}
 		spin_unlock(&zram->wb_limit_lock);
 
+		while (!req) {
+			req = select_idle_req(wb_ctl);
+			if (req)
+				break;
+
+			blk_finish_plug(&wb_ctl->plug);
+			err = zram_wb_wait_for_completion(zram, wb_ctl);
+			blk_start_plug(&wb_ctl->plug);
+			/*
+			 * BIO errors are not fatal, we continue and simply
+			 * attempt to writeback the remaining objects (pages).
+			 * At the same time we need to signal user-space that
+			 * some writes (at least one, but also could be all of
+			 * them) were not successful and we do so by returning
+			 * the most recent BIO error.
+			 */
+			if (err)
+				ret = err;
+		}
+
 		if (!blk_idx) {
 			blk_idx = alloc_block_bdev(zram);
-			if (!blk_idx) {
+			if (blk_idx) {
 				ret = -ENOSPC;
 				break;
 			}
 		}
 
-		index = pps->index;
 		zram_slot_lock(zram, index);
 		/*
 		 * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
@@ -775,67 +969,41 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		 */
 		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
 			goto next;
-		if (zram_read_from_zspool(zram, page, index))
+		if (zram_read_from_zspool(zram, req->page, index))
 			goto next;
 		zram_slot_unlock(zram, index);
 
-		bio_init(&bio, zram->bdev, &bio_vec, 1,
+		req->blk_idx = blk_idx;
+		req->pps = pps;
+		bio_init(&req->bio, zram->bdev, &req->bio_vec, 1,
 			 REQ_OP_WRITE | REQ_SYNC);
-		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
-		__bio_add_page(&bio, page, PAGE_SIZE, 0);
-
-		/*
-		 * XXX: A single page IO would be inefficient for write
-		 * but it would be not bad as starter.
-		 */
-		err = submit_bio_wait(&bio);
-		if (err) {
-			release_pp_slot(zram, pps);
-			/*
-			 * BIO errors are not fatal, we continue and simply
-			 * attempt to writeback the remaining objects (pages).
-			 * At the same time we need to signal user-space that
-			 * some writes (at least one, but also could be all of
-			 * them) were not successful and we do so by returning
-			 * the most recent BIO error.
-			 */
-			ret = err;
-			continue;
-		}
+		req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
+		req->bio.bi_end_io = zram_writeback_endio;
+		req->bio.bi_private = wb_ctl;
+		__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
 
-		atomic64_inc(&zram->stats.bd_writes);
-		zram_slot_lock(zram, index);
-		/*
-		 * Same as above, we release slot lock during writeback so
-		 * slot can change under us: slot_free() or slot_free() and
-		 * reallocation (zram_write_page()). In both cases slot loses
-		 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
-		 * ZRAM_PP_SLOT on such slots until current post-processing
-		 * finishes.
-		 */
-		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
-			goto next;
-
-		zram_free_page(zram, index);
-		zram_set_flag(zram, index, ZRAM_WB);
-		zram_set_handle(zram, index, blk_idx);
+		zram_submit_wb_request(wb_ctl, req);
 		blk_idx = 0;
-		atomic64_inc(&zram->stats.pages_stored);
-		spin_lock(&zram->wb_limit_lock);
-		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
-			zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
-		spin_unlock(&zram->wb_limit_lock);
+		req = NULL;
+		continue;
+
 next:
 		zram_slot_unlock(zram, index);
 		release_pp_slot(zram, pps);
-
 		cond_resched();
 	}
 
-	if (blk_idx)
-		free_block_bdev(zram, blk_idx);
-	if (page)
-		__free_page(page);
+	/*
+	 * Selected idle req, but never submitted it due to some error or
+	 * wb limit.
+	 */
+	if (req)
+		release_wb_req(req);
+
+	blk_finish_plug(&wb_ctl->plug);
+	err = zram_wb_wait_for_completion(zram, wb_ctl);
+	if (err)
+		ret = err;
 
 	return ret;
 }
@@ -948,7 +1116,8 @@ static ssize_t writeback_store(struct device *dev,
 	struct zram *zram = dev_to_zram(dev);
 	u64 nr_pages = zram->disksize >> PAGE_SHIFT;
 	unsigned long lo = 0, hi = nr_pages;
-	struct zram_pp_ctl *ctl = NULL;
+	struct zram_pp_ctl *pp_ctl = NULL;
+	struct zram_wb_ctl *wb_ctl = NULL;
 	char *args, *param, *val;
 	ssize_t ret = len;
 	int err, mode = 0;
@@ -970,8 +1139,14 @@ static ssize_t writeback_store(struct device *dev,
 		goto release_init_lock;
 	}
 
-	ctl = init_pp_ctl();
-	if (!ctl) {
+	pp_ctl = init_pp_ctl();
+	if (!pp_ctl) {
+		ret = -ENOMEM;
+		goto release_init_lock;
+	}
+
+	wb_ctl = init_wb_ctl();
+	if (!wb_ctl) {
 		ret = -ENOMEM;
 		goto release_init_lock;
 	}
@@ -1000,7 +1175,7 @@ static ssize_t writeback_store(struct device *dev,
 				goto release_init_lock;
 			}
 
-			scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
 			break;
 		}
 
@@ -1011,7 +1186,7 @@ static ssize_t writeback_store(struct device *dev,
 				goto release_init_lock;
 			}
 
-			scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
 			break;
 		}
 
@@ -1022,7 +1197,7 @@ static ssize_t writeback_store(struct device *dev,
 				goto release_init_lock;
 			}
 
-			scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
 			continue;
 		}
 
@@ -1033,17 +1208,18 @@ static ssize_t writeback_store(struct device *dev,
 				goto release_init_lock;
 			}
 
-			scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+			scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
 			continue;
 		}
 	}
 
-	err = zram_writeback_slots(zram, ctl);
+	err = zram_writeback_slots(zram, wb_ctl, pp_ctl);
 	if (err)
 		ret = err;
 
 release_init_lock:
-	release_pp_ctl(zram, ctl);
+	release_pp_ctl(zram, pp_ctl);
+	release_wb_ctl(wb_ctl);
 	atomic_set(&zram->pp_in_progress, 0);
 	up_read(&zram->init_lock);
 
-- 
2.51.2.1041.gc1ab5b90ca-goog



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-13  2:20         ` Sergey Senozhatsky
  2025-11-13  4:44           ` Sergey Senozhatsky
@ 2025-11-13  7:55           ` Yuwen Chen
  1 sibling, 0 replies; 21+ messages in thread
From: Yuwen Chen @ 2025-11-13  7:55 UTC (permalink / raw)
  To: senozhatsky
  Cc: akpm, axboe, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, ywen.chen

On Thu, 13 Nov 2025 11:04:04 +0900, Sergey Senozhatsky wrote:
> How was this number chosen?  Did you try lower/higher values?
> I think we might want this to be runtime tunable via sysfs, e.g.
> writeback_batch_size attr, with min value of 1.

I haven't conducted any tests on this value. I just set an empirical
value of 32 based on the submission queue length of the storage device.

As you said, providing a sys node for configuration might offer
performance advantages for mechanical hard drives.

On Thu, 13 Nov 2025 11:20:15 +0900, Sergey Senozhatsky wrote:
> So I wonder if things will look simpler (is this the word I'm looking
> for?) if you just have two lists for requests: one list for completed/idle
> requests and one list for in-flight requests (and you move requests
> around accordingly).  Then you don't need to iterate the pool and check
> flags, you just can check list_empty(&idle_requests) and take the first
> (front) element.

Yes, using two linked lists can reduce the complexity. It's just that
before I saw your submission, I couldn't find a better way to avoid
introducing locks. Thank you very much!



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
                           ` (4 preceding siblings ...)
  2025-11-13  2:20         ` Sergey Senozhatsky
@ 2025-11-13  5:40         ` Minchan Kim
  2025-11-13  6:03           ` Sergey Senozhatsky
  2025-11-13  7:37         ` Sergey Senozhatsky
  2025-11-13  7:55         ` Sergey Senozhatsky
  7 siblings, 1 reply; 21+ messages in thread
From: Minchan Kim @ 2025-11-13  5:40 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: axboe, akpm, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, richardycc, senozhatsky

Hello Yuwen,

On Thu, Nov 06, 2025 at 09:49:01AM +0800, Yuwen Chen wrote:
> For block devices, sequential write performance is significantly
> better than random write. Currently, zram's write-back function
> only supports single-page operations, which fails to leverage
> the sequential write advantage and leads to suboptimal performance.
> 
> This patch implements multi-page batch write-back for zram to
> leverage sequential write performance of block devices.
> 
> After applying this patch, a large number of pages being merged
> into batch write operations can be observed via the following test
> code, which effectively improves write-back performance.
> 
> We used the following instructions to conduct a performance test
> on the write-back function of zram in the QEMU environment.
> $ echo "/dev/sdb" > /sys/block/zram0/backing_dev
> $ echo "1024000000" > /sys/block/zram0/disksize
> $ dd if=/dev/random of=/dev/zram0
> $ time echo "page_indexes=1-100000" > /sys/block/zram0/writeback
> 
> before modification:
> real	0m 16.62s
> user	0m 0.00s
> sys	0m 5.98s
> 
> real	0m 15.38s
> user	0m 0.00s
> sys	0m 5.31s
> 
> real	0m 15.58s
> user	0m 0.00s
> sys	0m 5.49s
> 
> after modification:
> real	0m 1.36s
> user	0m 0.00s
> sys	0m 1.13s
> 
> real	0m 1.36s
> user	0m 0.00s
> sys	0m 1.11s
> 
> real	0m 1.39s
> user	0m 0.00s
> sys	0m 1.16s

That's the great improvement. Thanks.

Have you check the other effort?
https://lore.kernel.org/lkml/20250731064949.1690732-3-richardycc@google.com/

In my opinion, it's much simpler and strightforward align with current zram
writeback utility functions.

Do you have any issue with that?

> 
> Signed-off-by: Yuwen Chen <ywen.chen@foxmail.com>
> Reviewed-by: Fengyu Lian <licayy@outlook.com>
> ---
> Changes in v4:
>   - Add performance test data.
> Changes in v3:
>   - Postpone the page allocation.
> Changes in v2:
>   - Rename some data structures.
>   - Fix an exception caused by accessing a null pointer.
> ---
>  drivers/block/zram/zram_drv.c | 224 ++++++++++++++++++++++++++--------
>  1 file changed, 170 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 4f2824a..ce8fc3c 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -751,21 +751,131 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
>  	submit_bio(bio);
>  }
>  
> -static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
> -{
> -	unsigned long blk_idx = 0;
> -	struct page *page = NULL;
> +enum {
> +	/* Indicate that the request has been allocated */
> +	ZRAM_WB_REQUEST_ALLOCATED = 0,
> +
> +	/* the request has been processed by the block device layer */
> +	ZRAM_WB_REQUEST_COMPLETED,
> +};
> +
> +struct zram_wb_request {
> +	struct completion *done;
> +	unsigned long blk_idx;
> +	struct page *page;
>  	struct zram_pp_slot *pps;
>  	struct bio_vec bio_vec;
>  	struct bio bio;
> -	int ret = 0, err;
> -	u32 index;
> +	unsigned long flags;
> +};
>  
> -	page = alloc_page(GFP_KERNEL);
> -	if (!page)
> -		return -ENOMEM;
> +static int zram_writeback_complete(struct zram *zram, struct zram_wb_request *req)
> +{
> +	u32 index = 0;
> +	int err;
>  
> -	while ((pps = select_pp_slot(ctl))) {
> +	if (!test_and_clear_bit(ZRAM_WB_REQUEST_COMPLETED, &req->flags))
> +		return 0;
> +
> +	err = blk_status_to_errno(req->bio.bi_status);
> +	if (err)
> +		return err;
> +
> +	index = req->pps->index;
> +	atomic64_inc(&zram->stats.bd_writes);
> +	zram_slot_lock(zram, index);
> +	/*
> +	 * Same as above, we release slot lock during writeback so
> +	 * slot can change under us: slot_free() or slot_free() and
> +	 * reallocation (zram_write_page()). In both cases slot loses
> +	 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
> +	 * ZRAM_PP_SLOT on such slots until current post-processing
> +	 * finishes.
> +	 */
> +	if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
> +		goto next;
> +
> +	zram_free_page(zram, index);
> +	zram_set_flag(zram, index, ZRAM_WB);
> +	zram_set_handle(zram, index, req->blk_idx);
> +	req->blk_idx = 0;
> +	atomic64_inc(&zram->stats.pages_stored);
> +	spin_lock(&zram->wb_limit_lock);
> +	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
> +		zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
> +	spin_unlock(&zram->wb_limit_lock);
> +
> +next:
> +	zram_slot_unlock(zram, index);
> +	release_pp_slot(zram, req->pps);
> +	req->pps = NULL;
> +	return 0;
> +}
> +
> +static void zram_writeback_endio(struct bio *bio)
> +{
> +	struct zram_wb_request *req = bio->bi_private;
> +
> +	set_bit(ZRAM_WB_REQUEST_COMPLETED, &req->flags);
> +	clear_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags);
> +	complete(req->done);
> +}
> +
> +static struct zram_wb_request *zram_writeback_next_request(struct zram_wb_request *pool,
> +	int pool_cnt, int *cnt_off)
> +{
> +	struct zram_wb_request *req = NULL;
> +	int i = 0;
> +
> +	for (i = *cnt_off; i < pool_cnt + *cnt_off; i++) {
> +		req = &pool[i % pool_cnt];
> +		if (!req->page) {
> +			/* This memory should be freed by the caller. */
> +			req->page = alloc_page(GFP_KERNEL);
> +			if (!req->page)
> +				continue;
> +		}
> +
> +		if (!test_and_set_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags)) {
> +			*cnt_off = (i + 1) % pool_cnt;
> +			return req;
> +		}
> +	}
> +	return NULL;
> +}
> +
> +#define ZRAM_WB_REQ_CNT (32)
> +static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
> +{
> +	int ret = 0, err, i = 0, cnt_off = 0;
> +	int req_pool_cnt = 0;
> +	struct zram_wb_request req_prealloc[2] = {0};
> +	struct zram_wb_request *req = NULL, *req_pool = NULL;
> +	DECLARE_COMPLETION_ONSTACK(done);
> +	u32 index = 0;
> +	struct blk_plug plug;
> +
> +	/* allocate memory for req_pool */
> +	req_pool = kzalloc(sizeof(*req) * ZRAM_WB_REQ_CNT, GFP_KERNEL);
> +	if (req_pool) {
> +		req_pool_cnt = ZRAM_WB_REQ_CNT;
> +	} else {
> +		req_pool = req_prealloc;
> +		req_pool_cnt = ARRAY_SIZE(req_prealloc);
> +	}
> +
> +	for (i = 0; i < req_pool_cnt; i++) {
> +		req_pool[i].done = &done;
> +		req_pool[i].flags = 0;
> +	}
> +	req = zram_writeback_next_request(req_pool, req_pool_cnt, &cnt_off);
> +	if (!req) {
> +		ret = -ENOMEM;
> +		goto out_free_req_pool;
> +	}
> +
> +	blk_start_plug(&plug);
> +	while ((req->pps = select_pp_slot(ctl))) {
>  		spin_lock(&zram->wb_limit_lock);
>  		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
>  			spin_unlock(&zram->wb_limit_lock);
> @@ -774,15 +884,15 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
>  		}
>  		spin_unlock(&zram->wb_limit_lock);
>  
> -		if (!blk_idx) {
> -			blk_idx = alloc_block_bdev(zram);
> -			if (!blk_idx) {
> +		if (!req->blk_idx) {
> +			req->blk_idx = alloc_block_bdev(zram);
> +			if (!req->blk_idx) {
>  				ret = -ENOSPC;
>  				break;
>  			}
>  		}
>  
> -		index = pps->index;
> +		index = req->pps->index;
>  		zram_slot_lock(zram, index);
>  		/*
>  		 * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
> @@ -792,22 +902,32 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
>  		 */
>  		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
>  			goto next;
> -		if (zram_read_from_zspool(zram, page, index))
> +		if (zram_read_from_zspool(zram, req->page, index))
>  			goto next;
>  		zram_slot_unlock(zram, index);
>  
> -		bio_init(&bio, zram->bdev, &bio_vec, 1,
> +		bio_init(&req->bio, zram->bdev, &req->bio_vec, 1,
>  			 REQ_OP_WRITE | REQ_SYNC);
> -		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
> -		__bio_add_page(&bio, page, PAGE_SIZE, 0);
> -
> -		/*
> -		 * XXX: A single page IO would be inefficient for write
> -		 * but it would be not bad as starter.
> -		 */
> -		err = submit_bio_wait(&bio);
> +		req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
> +		req->bio.bi_end_io = zram_writeback_endio;
> +		req->bio.bi_private = req;
> +		__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
> +
> +		list_del_init(&req->pps->entry);
> +		submit_bio(&req->bio);
> +
> +		do {
> +			req = zram_writeback_next_request(req_pool, req_pool_cnt, &cnt_off);
> +			if (!req) {
> +				blk_finish_plug(&plug);
> +				wait_for_completion_io(&done);
> +				blk_start_plug(&plug);
> +			}
> +		} while (!req);
> +		err = zram_writeback_complete(zram, req);
>  		if (err) {
> -			release_pp_slot(zram, pps);
> +			release_pp_slot(zram, req->pps);
> +			req->pps = NULL;
>  			/*
>  			 * BIO errors are not fatal, we continue and simply
>  			 * attempt to writeback the remaining objects (pages).
> @@ -817,43 +937,39 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
>  			 * the most recent BIO error.
>  			 */
>  			ret = err;
> -			continue;
>  		}
> +		cond_resched();
> +		continue;
>  
> -		atomic64_inc(&zram->stats.bd_writes);
> -		zram_slot_lock(zram, index);
> -		/*
> -		 * Same as above, we release slot lock during writeback so
> -		 * slot can change under us: slot_free() or slot_free() and
> -		 * reallocation (zram_write_page()). In both cases slot loses
> -		 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
> -		 * ZRAM_PP_SLOT on such slots until current post-processing
> -		 * finishes.
> -		 */
> -		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
> -			goto next;
> -
> -		zram_free_page(zram, index);
> -		zram_set_flag(zram, index, ZRAM_WB);
> -		zram_set_handle(zram, index, blk_idx);
> -		blk_idx = 0;
> -		atomic64_inc(&zram->stats.pages_stored);
> -		spin_lock(&zram->wb_limit_lock);
> -		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
> -			zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
> -		spin_unlock(&zram->wb_limit_lock);
>  next:
>  		zram_slot_unlock(zram, index);
> -		release_pp_slot(zram, pps);
> -
> +		release_pp_slot(zram, req->pps);
> +		req->pps = NULL;
>  		cond_resched();
>  	}
> +	blk_finish_plug(&plug);
>  
> -	if (blk_idx)
> -		free_block_bdev(zram, blk_idx);
> -	if (page)
> -		__free_page(page);
> +	if (req)
> +		clear_bit(ZRAM_WB_REQUEST_ALLOCATED, &req->flags);
> +	for (i = 0; i < req_pool_cnt; i++) {
> +		while (test_bit(ZRAM_WB_REQUEST_ALLOCATED, &req_pool[i].flags))
> +			wait_for_completion_io(&done);
> +		err = zram_writeback_complete(zram, &req_pool[i]);
> +		if (err) {
> +			release_pp_slot(zram, req_pool[i].pps);
> +			req->pps = NULL;
> +			ret = err;
> +		}
> +
> +		if (req_pool[i].blk_idx)
> +			free_block_bdev(zram, req_pool[i].blk_idx);
> +		if (req_pool[i].page)
> +			__free_page(req_pool[i].page);
> +	}
>  
> +out_free_req_pool:
> +	if (req_pool != req_prealloc)
> +		kfree(req_pool);
>  	return ret;
>  }
>  
> -- 
> 2.34.1
> 


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-13  5:40         ` Minchan Kim
@ 2025-11-13  6:03           ` Sergey Senozhatsky
  2025-11-13  8:27             ` Yuwen Chen
  0 siblings, 1 reply; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-13  6:03 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Yuwen Chen, axboe, akpm, bgeffon, licayy, linux-block,
	linux-kernel, linux-mm, liumartin, richardycc, senozhatsky

On (25/11/12 21:40), Minchan Kim wrote:
> 
> In my opinion, it's much simpler and strightforward align with current zram
> writeback utility functions.

My preference is [1], which is very close to how current post-processing
is implemented in zram, w/o complexity that dedicated kthread handling
introduces and so on.

[1] https://lore.kernel.org/linux-mm/45b418277c6ae613783b9ecc714c96313ceb841d.1763013260.git.senozhatsky@chromium.org


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-13  6:03           ` Sergey Senozhatsky
@ 2025-11-13  8:27             ` Yuwen Chen
  0 siblings, 0 replies; 21+ messages in thread
From: Yuwen Chen @ 2025-11-13  8:27 UTC (permalink / raw)
  To: senozhatsky
  Cc: akpm, axboe, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, ywen.chen

On (25/11/12 21:40), Minchan Kim wrote:

> My preference is [1], which is very close to how current post-processing
> is implemented in zram, w/o complexity that dedicated kthread handling
> introduces and so on.

> [1] https://lore.kernel.org/linux-mm/45b418277c6ae613783b9ecc714c96313ceb841d.1763013260.git.senozhatsky@chromium.org

Yes, I also agree with this. Introducing threads in writeback will increase
the complexity.



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
                           ` (5 preceding siblings ...)
  2025-11-13  5:40         ` Minchan Kim
@ 2025-11-13  7:37         ` Sergey Senozhatsky
  2025-11-13  7:55         ` Sergey Senozhatsky
  7 siblings, 0 replies; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-13  7:37 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: axboe, akpm, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, senozhatsky

On (25/11/06 09:49), Yuwen Chen wrote:
[..]
> +static int zram_writeback_complete(struct zram *zram, struct zram_wb_request *req)
> +{
> +	u32 index = 0;
> +	int err;
>  
> -	while ((pps = select_pp_slot(ctl))) {
> +	if (!test_and_clear_bit(ZRAM_WB_REQUEST_COMPLETED, &req->flags))
> +		return 0;
> +
> +	err = blk_status_to_errno(req->bio.bi_status);
> +	if (err)
> +		return err;
> +
> +	index = req->pps->index;
> +	atomic64_inc(&zram->stats.bd_writes);
> +	zram_slot_lock(zram, index);
> +	/*
> +	 * Same as above, we release slot lock during writeback so
> +	 * slot can change under us: slot_free() or slot_free() and
> +	 * reallocation (zram_write_page()). In both cases slot loses
> +	 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
> +	 * ZRAM_PP_SLOT on such slots until current post-processing
> +	 * finishes.
> +	 */
> +	if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
> +		goto next;
> +
> +	zram_free_page(zram, index);
> +	zram_set_flag(zram, index, ZRAM_WB);
> +	zram_set_handle(zram, index, req->blk_idx);
> +	req->blk_idx = 0;
> +	atomic64_inc(&zram->stats.pages_stored);
> +	spin_lock(&zram->wb_limit_lock);
> +	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
> +		zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
> +	spin_unlock(&zram->wb_limit_lock);

This should be done before the submission, not after the completion.
Otherwise we can significantly overshoot the wb_limit.  And we simply
need to rollback wb_limit adjustment for failed bio requests.

Will incorporate this into next iteration of the patch.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] zram: Implement multi-page write-back
  2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
                           ` (6 preceding siblings ...)
  2025-11-13  7:37         ` Sergey Senozhatsky
@ 2025-11-13  7:55         ` Sergey Senozhatsky
  7 siblings, 0 replies; 21+ messages in thread
From: Sergey Senozhatsky @ 2025-11-13  7:55 UTC (permalink / raw)
  To: Yuwen Chen
  Cc: axboe, akpm, bgeffon, licayy, linux-block, linux-kernel,
	linux-mm, liumartin, minchan, richardycc, senozhatsky

Just a minor side not:

> +
> +static struct zram_wb_request *zram_writeback_next_request(struct zram_wb_request *pool,
> +	int pool_cnt, int *cnt_off)
> +{

This is not in accordance with the kernel coding style.

If you use the best code editor (vim/nvim) then I'd highly
recommend the linuxsty plugin [1]

[1] https://github.com/gregkh/kernel-coding-style


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] zram: Implement multi-page write-back
  2025-11-05 15:25     ` Jens Axboe
  2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
@ 2025-11-06  2:28       ` Yuwen Chen
  1 sibling, 0 replies; 21+ messages in thread
From: Yuwen Chen @ 2025-11-06  2:28 UTC (permalink / raw)
  To: axboe
  Cc: akpm, bgeffon, licayy, linux-block, linux-kernel, linux-mm,
	liumartin, minchan, richardycc, senozhatsky, ywen.chen

On 11/5/25 08:25 AM, Jens Axboe wrote:
> Please either finish the patch before sending it out, or take your
> time before posting again. Sending 3 versions in one day will just
> make people ignore you.

> This commit message is in dire need of some actual performance
> results. This is a change for better performance, no? If so, you
> should have some clear numbers in there describing where it's
> better, and where it's worse (if appropriate).

Thank you very much for your reply. The description of the patch has
been revised. This time, performance test data has been added,
showing an approximate increase of 1000%. Please help review
version v4. Thank you again!

The QEMU parameters used for this test data are as follows:
sudo qemu-system-x86_64 \
        -M pc \
        -smp 8 \
        -m 2G \
        -kernel /boot/vmlinuz-`uname -r` \
        -hda system.img \
        -hdb zram_writeback.img \
        -append "root=/dev/sda rw init=/linuxrc console=ttyAMA0 console=ttyS0" \
        -virtfs local,path=`pwd`/../,mount_tag=hostshare,security_model=passthrough \
        -nographic

The approximate configuration of the host is as follows:
Kernel: x86_64 Linux 6.8.0-50-generic
CPU   : 12th Gen Intel Core i7-12700K @ 20x 4.9GHz
MEM   : 64G
DISK  : PC801 NVMe SK hynix 1TB



^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2025-11-13  8:27 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <tencent_78FC2C4FE16BA1EBAF0897DB60FCD675ED05@qq.com>
2025-11-05  3:33 ` [PATCH v2] zram: Implement multi-page write-back Yuwen Chen
2025-11-05  6:48   ` [PATCH v3] " Yuwen Chen
2025-11-05 15:25     ` Jens Axboe
2025-11-06  1:49       ` [PATCH v4] " Yuwen Chen
2025-11-10  4:49         ` Sergey Senozhatsky
2025-11-10  7:16           ` Yuwen Chen
2025-11-12  5:16             ` Sergey Senozhatsky
2025-11-12  5:18         ` Sergey Senozhatsky
2025-11-12  6:57           ` Yuwen Chen
2025-11-13  2:04         ` Sergey Senozhatsky
2025-11-13  5:10           ` Sergey Senozhatsky
2025-11-13  2:11         ` Sergey Senozhatsky
2025-11-13  2:20         ` Sergey Senozhatsky
2025-11-13  4:44           ` Sergey Senozhatsky
2025-11-13  7:55           ` Yuwen Chen
2025-11-13  5:40         ` Minchan Kim
2025-11-13  6:03           ` Sergey Senozhatsky
2025-11-13  8:27             ` Yuwen Chen
2025-11-13  7:37         ` Sergey Senozhatsky
2025-11-13  7:55         ` Sergey Senozhatsky
2025-11-06  2:28       ` [PATCH v3] " Yuwen Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox