* [RFC PATCHv5 1/6] zram: introduce writeback bio batching
2025-11-20 15:21 [RFC PATCHv5 0/6] zram: introduce writeback bio batching Sergey Senozhatsky
@ 2025-11-20 15:21 ` Sergey Senozhatsky
2025-11-21 7:05 ` Yuwen Chen
2025-11-21 7:40 ` Hannes Reinecke
2025-11-20 15:21 ` [RFC PATCHv5 2/6] zram: add writeback batch size device attr Sergey Senozhatsky
` (5 subsequent siblings)
6 siblings, 2 replies; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-20 15:21 UTC (permalink / raw)
To: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang
Cc: Brian Geffon, Fengyu Lian, linux-kernel, linux-mm, linux-block,
Sergey Senozhatsky, Minchan Kim
Currently, zram writeback supports only a single bio writeback
operation, waiting for bio completion before post-processing
next pp-slot. This works, in general, but has certain throughput
limitations. Introduce batched (multiple) bio writeback support
to take advantage of parallel request processing and better
request scheduling.
For the time being the writeback batch size (maximum number of
in-flight bio requests) is set to 32 for all devices. A follow
up patch adds a writeback_batch_size device attribute, so the
batch size becomes run-time configurable.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Co-developed-by: Yuwen Chen <ywen.chen@foxmail.com>
Co-developed-by: Richard Chang <richardycc@google.com>
Suggested-by: Minchan Kim <minchan@google.com>
---
drivers/block/zram/zram_drv.c | 366 +++++++++++++++++++++++++++-------
1 file changed, 298 insertions(+), 68 deletions(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a43074657531..37c1416ac902 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -500,6 +500,24 @@ static ssize_t idle_store(struct device *dev,
}
#ifdef CONFIG_ZRAM_WRITEBACK
+struct zram_wb_ctl {
+ struct list_head idle_reqs;
+ struct list_head done_reqs;
+ wait_queue_head_t done_wait;
+ spinlock_t done_lock;
+ atomic_t num_inflight;
+};
+
+struct zram_wb_req {
+ unsigned long blk_idx;
+ struct page *page;
+ struct zram_pp_slot *pps;
+ struct bio_vec bio_vec;
+ struct bio bio;
+
+ struct list_head entry;
+};
+
static ssize_t writeback_limit_enable_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
@@ -734,19 +752,220 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
submit_bio(bio);
}
-static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
+static void release_wb_req(struct zram_wb_req *req)
{
- unsigned long blk_idx = 0;
- struct page *page = NULL;
- struct zram_pp_slot *pps;
- struct bio_vec bio_vec;
- struct bio bio;
+ __free_page(req->page);
+ kfree(req);
+}
+
+static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
+{
+ if (!wb_ctl)
+ return;
+
+ /* We should never have inflight requests at this point */
+ WARN_ON(atomic_read(&wb_ctl->num_inflight));
+ WARN_ON(!list_empty(&wb_ctl->done_reqs));
+
+ while (!list_empty(&wb_ctl->idle_reqs)) {
+ struct zram_wb_req *req;
+
+ req = list_first_entry(&wb_ctl->idle_reqs,
+ struct zram_wb_req, entry);
+ list_del(&req->entry);
+ release_wb_req(req);
+ }
+
+ kfree(wb_ctl);
+}
+
+/* XXX: should be a per-device sysfs attr */
+#define ZRAM_WB_REQ_CNT 32
+
+static struct zram_wb_ctl *init_wb_ctl(void)
+{
+ struct zram_wb_ctl *wb_ctl;
+ int i;
+
+ wb_ctl = kmalloc(sizeof(*wb_ctl), GFP_KERNEL);
+ if (!wb_ctl)
+ return NULL;
+
+ INIT_LIST_HEAD(&wb_ctl->idle_reqs);
+ INIT_LIST_HEAD(&wb_ctl->done_reqs);
+ atomic_set(&wb_ctl->num_inflight, 0);
+ init_waitqueue_head(&wb_ctl->done_wait);
+ spin_lock_init(&wb_ctl->done_lock);
+
+ for (i = 0; i < ZRAM_WB_REQ_CNT; i++) {
+ struct zram_wb_req *req;
+
+ /*
+ * This is fatal condition only if we couldn't allocate
+ * any requests at all. Otherwise we just work with the
+ * requests that we have successfully allocated, so that
+ * writeback can still proceed, even if there is only one
+ * request on the idle list.
+ */
+ req = kzalloc(sizeof(*req), GFP_KERNEL | __GFP_NOWARN);
+ if (!req)
+ break;
+
+ req->page = alloc_page(GFP_KERNEL | __GFP_NOWARN);
+ if (!req->page) {
+ kfree(req);
+ break;
+ }
+
+ list_add(&req->entry, &wb_ctl->idle_reqs);
+ }
+
+ /* We couldn't allocate any requests, so writeback is not possible */
+ if (list_empty(&wb_ctl->idle_reqs))
+ goto release_wb_ctl;
+
+ return wb_ctl;
+
+release_wb_ctl:
+ release_wb_ctl(wb_ctl);
+ return NULL;
+}
+
+static void zram_account_writeback_rollback(struct zram *zram)
+{
+ spin_lock(&zram->wb_limit_lock);
+ if (zram->wb_limit_enable)
+ zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12);
+ spin_unlock(&zram->wb_limit_lock);
+}
+
+static void zram_account_writeback_submit(struct zram *zram)
+{
+ spin_lock(&zram->wb_limit_lock);
+ if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+ zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
+ spin_unlock(&zram->wb_limit_lock);
+}
+
+static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
+{
+ u32 index = req->pps->index;
+ int err;
+
+ err = blk_status_to_errno(req->bio.bi_status);
+ if (err) {
+ /*
+ * Failed wb requests should not be accounted in wb_limit
+ * (if enabled).
+ */
+ zram_account_writeback_rollback(zram);
+ free_block_bdev(zram, req->blk_idx);
+ return err;
+ }
+
+ atomic64_inc(&zram->stats.bd_writes);
+ zram_slot_lock(zram, index);
+ /*
+ * We release slot lock during writeback so slot can change under us:
+ * slot_free() or slot_free() and zram_write_page(). In both cases
+ * slot loses ZRAM_PP_SLOT flag. No concurrent post-processing can
+ * set ZRAM_PP_SLOT on such slots until current post-processing
+ * finishes.
+ */
+ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) {
+ free_block_bdev(zram, req->blk_idx);
+ goto out;
+ }
+
+ zram_free_page(zram, index);
+ zram_set_flag(zram, index, ZRAM_WB);
+ zram_set_handle(zram, index, req->blk_idx);
+ atomic64_inc(&zram->stats.pages_stored);
+
+out:
+ zram_slot_unlock(zram, index);
+ return 0;
+}
+
+static void zram_writeback_endio(struct bio *bio)
+{
+ struct zram_wb_req *req = container_of(bio, struct zram_wb_req, bio);
+ struct zram_wb_ctl *wb_ctl = bio->bi_private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&wb_ctl->done_lock, flags);
+ list_add(&req->entry, &wb_ctl->done_reqs);
+ spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
+
+ wake_up(&wb_ctl->done_wait);
+}
+
+static void zram_submit_wb_request(struct zram *zram,
+ struct zram_wb_ctl *wb_ctl,
+ struct zram_wb_req *req)
+{
+ /*
+ * wb_limit (if enabled) should be adjusted before submission,
+ * so that we don't over-submit.
+ */
+ zram_account_writeback_submit(zram);
+ atomic_inc(&wb_ctl->num_inflight);
+ req->bio.bi_private = wb_ctl;
+ submit_bio(&req->bio);
+}
+
+static int zram_complete_done_reqs(struct zram *zram,
+ struct zram_wb_ctl *wb_ctl)
+{
+ struct zram_wb_req *req;
+ unsigned long flags;
int ret = 0, err;
- u32 index;
- page = alloc_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
+ while (1) {
+ spin_lock_irqsave(&wb_ctl->done_lock, flags);
+ req = list_first_entry_or_null(&wb_ctl->done_reqs,
+ struct zram_wb_req, entry);
+ if (req)
+ list_del(&req->entry);
+ spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
+
+ if (!req)
+ break;
+
+ err = zram_writeback_complete(zram, req);
+ if (err)
+ ret = err;
+
+ atomic_dec(&wb_ctl->num_inflight);
+ release_pp_slot(zram, req->pps);
+ req->pps = NULL;
+
+ list_add(&req->entry, &wb_ctl->idle_reqs);
+ }
+
+ return ret;
+}
+
+static struct zram_wb_req *zram_select_idle_req(struct zram_wb_ctl *wb_ctl)
+{
+ struct zram_wb_req *req;
+
+ req = list_first_entry_or_null(&wb_ctl->idle_reqs,
+ struct zram_wb_req, entry);
+ if (req)
+ list_del(&req->entry);
+ return req;
+}
+
+static int zram_writeback_slots(struct zram *zram,
+ struct zram_pp_ctl *ctl,
+ struct zram_wb_ctl *wb_ctl)
+{
+ struct zram_wb_req *req = NULL;
+ unsigned long blk_idx = 0;
+ struct zram_pp_slot *pps;
+ int ret = 0, err = 0;
+ u32 index = 0;
while ((pps = select_pp_slot(ctl))) {
spin_lock(&zram->wb_limit_lock);
@@ -757,6 +976,27 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
}
spin_unlock(&zram->wb_limit_lock);
+ while (!req) {
+ req = zram_select_idle_req(wb_ctl);
+ if (req)
+ break;
+
+ wait_event(wb_ctl->done_wait,
+ !list_empty(&wb_ctl->done_reqs));
+
+ err = zram_complete_done_reqs(zram, wb_ctl);
+ /*
+ * BIO errors are not fatal, we continue and simply
+ * attempt to writeback the remaining objects (pages).
+ * At the same time we need to signal user-space that
+ * some writes (at least one, but also could be all of
+ * them) were not successful and we do so by returning
+ * the most recent BIO error.
+ */
+ if (err)
+ ret = err;
+ }
+
if (!blk_idx) {
blk_idx = alloc_block_bdev(zram);
if (!blk_idx) {
@@ -775,67 +1015,47 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
*/
if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
goto next;
- if (zram_read_from_zspool(zram, page, index))
+ if (zram_read_from_zspool(zram, req->page, index))
goto next;
zram_slot_unlock(zram, index);
- bio_init(&bio, zram->bdev, &bio_vec, 1,
- REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
- __bio_add_page(&bio, page, PAGE_SIZE, 0);
-
/*
- * XXX: A single page IO would be inefficient for write
- * but it would be not bad as starter.
+ * From now on pp-slot is owned by the req, remove it from
+ * its pp bucket.
*/
- err = submit_bio_wait(&bio);
- if (err) {
- release_pp_slot(zram, pps);
- /*
- * BIO errors are not fatal, we continue and simply
- * attempt to writeback the remaining objects (pages).
- * At the same time we need to signal user-space that
- * some writes (at least one, but also could be all of
- * them) were not successful and we do so by returning
- * the most recent BIO error.
- */
- ret = err;
- continue;
- }
+ list_del_init(&pps->entry);
- atomic64_inc(&zram->stats.bd_writes);
- zram_slot_lock(zram, index);
- /*
- * Same as above, we release slot lock during writeback so
- * slot can change under us: slot_free() or slot_free() and
- * reallocation (zram_write_page()). In both cases slot loses
- * ZRAM_PP_SLOT flag. No concurrent post-processing can set
- * ZRAM_PP_SLOT on such slots until current post-processing
- * finishes.
- */
- if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
- goto next;
+ req->blk_idx = blk_idx;
+ req->pps = pps;
+ bio_init(&req->bio, zram->bdev, &req->bio_vec, 1, REQ_OP_WRITE);
+ req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
+ req->bio.bi_end_io = zram_writeback_endio;
+ __bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
- zram_free_page(zram, index);
- zram_set_flag(zram, index, ZRAM_WB);
- zram_set_handle(zram, index, blk_idx);
+ zram_submit_wb_request(zram, wb_ctl, req);
blk_idx = 0;
- atomic64_inc(&zram->stats.pages_stored);
- spin_lock(&zram->wb_limit_lock);
- if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
- zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
- spin_unlock(&zram->wb_limit_lock);
+ req = NULL;
+ cond_resched();
+ continue;
+
next:
zram_slot_unlock(zram, index);
release_pp_slot(zram, pps);
-
- cond_resched();
}
- if (blk_idx)
- free_block_bdev(zram, blk_idx);
- if (page)
- __free_page(page);
+ /*
+ * Selected idle req, but never submitted it due to some error or
+ * wb limit.
+ */
+ if (req)
+ release_wb_req(req);
+
+ while (atomic_read(&wb_ctl->num_inflight) > 0) {
+ wait_event(wb_ctl->done_wait, !list_empty(&wb_ctl->done_reqs));
+ err = zram_complete_done_reqs(zram, wb_ctl);
+ if (err)
+ ret = err;
+ }
return ret;
}
@@ -948,7 +1168,8 @@ static ssize_t writeback_store(struct device *dev,
struct zram *zram = dev_to_zram(dev);
u64 nr_pages = zram->disksize >> PAGE_SHIFT;
unsigned long lo = 0, hi = nr_pages;
- struct zram_pp_ctl *ctl = NULL;
+ struct zram_pp_ctl *pp_ctl = NULL;
+ struct zram_wb_ctl *wb_ctl = NULL;
char *args, *param, *val;
ssize_t ret = len;
int err, mode = 0;
@@ -970,8 +1191,14 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- ctl = init_pp_ctl();
- if (!ctl) {
+ pp_ctl = init_pp_ctl();
+ if (!pp_ctl) {
+ ret = -ENOMEM;
+ goto release_init_lock;
+ }
+
+ wb_ctl = init_wb_ctl();
+ if (!wb_ctl) {
ret = -ENOMEM;
goto release_init_lock;
}
@@ -1000,7 +1227,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
break;
}
@@ -1011,7 +1238,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
break;
}
@@ -1022,7 +1249,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
continue;
}
@@ -1033,17 +1260,18 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
continue;
}
}
- err = zram_writeback_slots(zram, ctl);
+ err = zram_writeback_slots(zram, pp_ctl, wb_ctl);
if (err)
ret = err;
release_init_lock:
- release_pp_ctl(zram, ctl);
+ release_pp_ctl(zram, pp_ctl);
+ release_wb_ctl(wb_ctl);
atomic_set(&zram->pp_in_progress, 0);
up_read(&zram->init_lock);
@@ -1112,7 +1340,9 @@ static int read_from_bdev(struct zram *zram, struct page *page,
return -EIO;
}
-static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {};
+static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
+{
+}
#endif
#ifdef CONFIG_ZRAM_MEMORY_TRACKING
--
2.52.0.rc1.455.g30608eb744-goog
^ permalink raw reply [flat|nested] 36+ messages in thread* [RFC PATCHv5 1/6] zram: introduce writeback bio batching
2025-11-20 15:21 ` [RFC PATCHv5 1/6] " Sergey Senozhatsky
@ 2025-11-21 7:05 ` Yuwen Chen
2025-11-21 7:18 ` Sergey Senozhatsky
2025-11-21 7:40 ` Hannes Reinecke
1 sibling, 1 reply; 36+ messages in thread
From: Yuwen Chen @ 2025-11-21 7:05 UTC (permalink / raw)
To: senozhatsky
Cc: akpm, bgeffon, licayy, linux-block, linux-kernel, linux-mm,
minchan, minchan, richardycc, ywen.chen
On Fri, 21 Nov 2025 00:21:21 +0900, Sergey Senozhatsky wrote:
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> Co-developed-by: Yuwen Chen <ywen.chen@foxmail.com>
> Co-developed-by: Richard Chang <richardycc@google.com>
> Suggested-by: Minchan Kim <minchan@google.com>
I rarely participate in community discussions and I'm not familiar
with the processes here! I hope the community can be more friendly
to new members. Indeed, you wrote that patch all by yourself.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 1/6] zram: introduce writeback bio batching
2025-11-21 7:05 ` Yuwen Chen
@ 2025-11-21 7:18 ` Sergey Senozhatsky
0 siblings, 0 replies; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-21 7:18 UTC (permalink / raw)
To: Yuwen Chen
Cc: senozhatsky, akpm, bgeffon, licayy, linux-block, linux-kernel,
linux-mm, minchan, minchan, richardycc
On (25/11/21 15:05), Yuwen Chen wrote:
> On Fri, 21 Nov 2025 00:21:21 +0900, Sergey Senozhatsky wrote:
> > Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> > Co-developed-by: Yuwen Chen <ywen.chen@foxmail.com>
> > Co-developed-by: Richard Chang <richardycc@google.com>
> > Suggested-by: Minchan Kim <minchan@google.com>
>
> I rarely participate in community discussions and I'm not familiar
> with the processes here! I hope the community can be more friendly
> to new members. Indeed, you wrote that patch all by yourself.
I apologise if I was "unfriendly" to you in any way, that was not
my intention.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 1/6] zram: introduce writeback bio batching
2025-11-20 15:21 ` [RFC PATCHv5 1/6] " Sergey Senozhatsky
2025-11-21 7:05 ` Yuwen Chen
@ 2025-11-21 7:40 ` Hannes Reinecke
2025-11-21 7:47 ` Sergey Senozhatsky
1 sibling, 1 reply; 36+ messages in thread
From: Hannes Reinecke @ 2025-11-21 7:40 UTC (permalink / raw)
To: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Yuwen Chen,
Richard Chang
Cc: Brian Geffon, Fengyu Lian, linux-kernel, linux-mm, linux-block,
Minchan Kim
On 11/20/25 16:21, Sergey Senozhatsky wrote:
> Currently, zram writeback supports only a single bio writeback
> operation, waiting for bio completion before post-processing
> next pp-slot. This works, in general, but has certain throughput
> limitations. Introduce batched (multiple) bio writeback support
> to take advantage of parallel requests processing and better
> requests scheduling.
>
> For the time being the writeback batch size (maximum number of
> in-flight bio requests) is set to 32 for all devices. A follow
> up patch adds a writeback_batch_size device attribute, so the
> batch size becomes run-time configurable.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> Co-developed-by: Yuwen Chen <ywen.chen@foxmail.com>
> Co-developed-by: Richard Chang <richardycc@google.com>
> Suggested-by: Minchan Kim <minchan@google.com>
> ---
> drivers/block/zram/zram_drv.c | 366 +++++++++++++++++++++++++++-------
> 1 file changed, 298 insertions(+), 68 deletions(-)
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index a43074657531..37c1416ac902 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
[ .. ]
> +static int zram_complete_done_reqs(struct zram *zram,
> + struct zram_wb_ctl *wb_ctl)
> +{
> + struct zram_wb_req *req;
> + unsigned long flags;
> int ret = 0, err;
> - u32 index;
>
> - page = alloc_page(GFP_KERNEL);
> - if (!page)
> - return -ENOMEM;
> + while (1) {
> + spin_lock_irqsave(&wb_ctl->done_lock, flags);
> + req = list_first_entry_or_null(&wb_ctl->done_reqs,
> + struct zram_wb_req, entry);
> + if (req)
> + list_del(&req->entry);
> + spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
> +
> + if (!req)
> + break;
> +
> + err = zram_writeback_complete(zram, req);
> + if (err)
> + ret = err;
> +
> + atomic_dec(&wb_ctl->num_inflight);
> + release_pp_slot(zram, req->pps);
> + req->pps = NULL;
> +
> + list_add(&req->entry, &wb_ctl->idle_reqs);
Shouldn't this be locked?
> + }
> +
> + return ret;
> +}
> +
> +static struct zram_wb_req *zram_select_idle_req(struct zram_wb_ctl *wb_ctl)
> +{
> + struct zram_wb_req *req;
> +
> + req = list_first_entry_or_null(&wb_ctl->idle_reqs,
> + struct zram_wb_req, entry);
> + if (req)
> + list_del(&req->entry);
See above. I think you need to lock this to avoid someone stepping in
here and modifying the element under you.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [RFC PATCHv5 1/6] zram: introduce writeback bio batching
2025-11-21 7:40 ` Hannes Reinecke
@ 2025-11-21 7:47 ` Sergey Senozhatsky
0 siblings, 0 replies; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-21 7:47 UTC (permalink / raw)
To: Hannes Reinecke
Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Yuwen Chen,
Richard Chang, Brian Geffon, Fengyu Lian, linux-kernel, linux-mm,
linux-block, Minchan Kim
On (25/11/21 08:40), Hannes Reinecke wrote:
> > +static int zram_complete_done_reqs(struct zram *zram,
> > + struct zram_wb_ctl *wb_ctl)
> > +{
> > + struct zram_wb_req *req;
> > + unsigned long flags;
> > int ret = 0, err;
> > - u32 index;
> > - page = alloc_page(GFP_KERNEL);
> > - if (!page)
> > - return -ENOMEM;
> > + while (1) {
> > + spin_lock_irqsave(&wb_ctl->done_lock, flags);
> > + req = list_first_entry_or_null(&wb_ctl->done_reqs,
> > + struct zram_wb_req, entry);
> > + if (req)
> > + list_del(&req->entry);
> > + spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
> > +
> > + if (!req)
> > + break;
> > +
> > + err = zram_writeback_complete(zram, req);
> > + if (err)
> > + ret = err;
> > +
> > + atomic_dec(&wb_ctl->num_inflight);
> > + release_pp_slot(zram, req->pps);
> > + req->pps = NULL;
> > +
> > + list_add(&req->entry, &wb_ctl->idle_reqs);
>
> Shouldn't this be locked?
See below.
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static struct zram_wb_req *zram_select_idle_req(struct zram_wb_ctl *wb_ctl)
> > +{
> > + struct zram_wb_req *req;
> > +
> > + req = list_first_entry_or_null(&wb_ctl->idle_reqs,
> > + struct zram_wb_req, entry);
> > + if (req)
> > + list_del(&req->entry);
>
> See above. I think you need to lock this to avoid someone stepping in
> here an modify the element under you.
->idle_reqs list is mutated by one and one task only: the one that
does writeback. ->done_reqs list, on the other hand, is accessed
both from IRQ (bio completion) and from the writeback task.
^ permalink raw reply [flat|nested] 36+ messages in thread
* [RFC PATCHv5 2/6] zram: add writeback batch size device attr
2025-11-20 15:21 [RFC PATCHv5 0/6] zram: introduce writeback bio batching Sergey Senozhatsky
2025-11-20 15:21 ` [RFC PATCHv5 1/6] " Sergey Senozhatsky
@ 2025-11-20 15:21 ` Sergey Senozhatsky
2025-11-20 15:57 ` Brian Geffon
2025-11-21 2:48 ` Sergey Senozhatsky
2025-11-20 15:21 ` [RFC PATCHv5 3/6] zram: take write lock in wb limit store handlers Sergey Senozhatsky
` (4 subsequent siblings)
6 siblings, 2 replies; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-20 15:21 UTC (permalink / raw)
To: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang
Cc: Brian Geffon, Fengyu Lian, linux-kernel, linux-mm, linux-block,
Sergey Senozhatsky
Introduce writeback_batch_size device attribute so that
the maximum number of in-flight writeback bio requests
can be configured at run-time per-device. This essentially
enables batched bio writeback.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
drivers/block/zram/zram_drv.c | 48 ++++++++++++++++++++++++++++++-----
drivers/block/zram/zram_drv.h | 1 +
2 files changed, 43 insertions(+), 6 deletions(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 37c1416ac902..7904159e9226 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -588,6 +588,42 @@ static ssize_t writeback_limit_show(struct device *dev,
return sysfs_emit(buf, "%llu\n", val);
}
+static ssize_t writeback_batch_size_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct zram *zram = dev_to_zram(dev);
+ u32 val;
+ ssize_t ret = -EINVAL;
+
+ if (kstrtouint(buf, 10, &val))
+ return ret;
+
+ if (!val)
+ val = 1;
+
+ down_read(&zram->init_lock);
+ zram->wb_batch_size = val;
+ up_read(&zram->init_lock);
+ ret = len;
+
+ return ret;
+}
+
+static ssize_t writeback_batch_size_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ u32 val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = zram->wb_batch_size;
+ up_read(&zram->init_lock);
+
+ return sysfs_emit(buf, "%u\n", val);
+}
+
static void reset_bdev(struct zram *zram)
{
if (!zram->backing_dev)
@@ -779,10 +815,7 @@ static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
kfree(wb_ctl);
}
-/* XXX: should be a per-device sysfs attr */
-#define ZRAM_WB_REQ_CNT 32
-
-static struct zram_wb_ctl *init_wb_ctl(void)
+static struct zram_wb_ctl *init_wb_ctl(struct zram *zram)
{
struct zram_wb_ctl *wb_ctl;
int i;
@@ -797,7 +830,7 @@ static struct zram_wb_ctl *init_wb_ctl(void)
init_waitqueue_head(&wb_ctl->done_wait);
spin_lock_init(&wb_ctl->done_lock);
- for (i = 0; i < ZRAM_WB_REQ_CNT; i++) {
+ for (i = 0; i < zram->wb_batch_size; i++) {
struct zram_wb_req *req;
/*
@@ -1197,7 +1230,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- wb_ctl = init_wb_ctl();
+ wb_ctl = init_wb_ctl(zram);
if (!wb_ctl) {
ret = -ENOMEM;
goto release_init_lock;
@@ -2840,6 +2873,7 @@ static DEVICE_ATTR_RW(backing_dev);
static DEVICE_ATTR_WO(writeback);
static DEVICE_ATTR_RW(writeback_limit);
static DEVICE_ATTR_RW(writeback_limit_enable);
+static DEVICE_ATTR_RW(writeback_batch_size);
#endif
#ifdef CONFIG_ZRAM_MULTI_COMP
static DEVICE_ATTR_RW(recomp_algorithm);
@@ -2861,6 +2895,7 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_writeback.attr,
&dev_attr_writeback_limit.attr,
&dev_attr_writeback_limit_enable.attr,
+ &dev_attr_writeback_batch_size.attr,
#endif
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
@@ -2922,6 +2957,7 @@ static int zram_add(void)
init_rwsem(&zram->init_lock);
#ifdef CONFIG_ZRAM_WRITEBACK
+ zram->wb_batch_size = 32;
spin_lock_init(&zram->wb_limit_lock);
#endif
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 6cee93f9c0d0..1a647f42c1a4 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -129,6 +129,7 @@ struct zram {
struct file *backing_dev;
spinlock_t wb_limit_lock;
bool wb_limit_enable;
+ u32 wb_batch_size;
u64 bd_wb_limit;
struct block_device *bdev;
unsigned long *bitmap;
--
2.52.0.rc1.455.g30608eb744-goog
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [RFC PATCHv5 2/6] zram: add writeback batch size device attr
2025-11-20 15:21 ` [RFC PATCHv5 2/6] zram: add writeback batch size device attr Sergey Senozhatsky
@ 2025-11-20 15:57 ` Brian Geffon
2025-11-21 1:56 ` Sergey Senozhatsky
2025-11-21 2:48 ` Sergey Senozhatsky
1 sibling, 1 reply; 36+ messages in thread
From: Brian Geffon @ 2025-11-20 15:57 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang,
Fengyu Lian, linux-kernel, linux-mm, linux-block
On Thu, Nov 20, 2025 at 10:22 AM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> Introduce writeback_batch_size device attribute so that
> the maximum number of in-flight writeback bio requests
> can be configured at run-time per-device. This essentially
> enables batched bio writeback.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
> drivers/block/zram/zram_drv.c | 48 ++++++++++++++++++++++++++++++-----
> drivers/block/zram/zram_drv.h | 1 +
> 2 files changed, 43 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 37c1416ac902..7904159e9226 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -588,6 +588,42 @@ static ssize_t writeback_limit_show(struct device *dev,
> return sysfs_emit(buf, "%llu\n", val);
> }
>
> +static ssize_t writeback_batch_size_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct zram *zram = dev_to_zram(dev);
> + u32 val;
> + ssize_t ret = -EINVAL;
> +
> + if (kstrtouint(buf, 10, &val))
> + return ret;
> +
> + if (!val)
> + val = 1;
IMO a value of 0 for the batch size doesn't make sense, -EINVAL?
> +
> + down_read(&zram->init_lock);
> + zram->wb_batch_size = val;
> + up_read(&zram->init_lock);
> + ret = len;
> +
> + return ret;
> +}
> +
> +static ssize_t writeback_batch_size_show(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + u32 val;
> + struct zram *zram = dev_to_zram(dev);
> +
> + down_read(&zram->init_lock);
> + val = zram->wb_batch_size;
> + up_read(&zram->init_lock);
> +
> + return sysfs_emit(buf, "%u\n", val);
> +}
> +
> static void reset_bdev(struct zram *zram)
> {
> if (!zram->backing_dev)
> @@ -779,10 +815,7 @@ static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
> kfree(wb_ctl);
> }
>
> -/* XXX: should be a per-device sysfs attr */
> -#define ZRAM_WB_REQ_CNT 32
> -
> -static struct zram_wb_ctl *init_wb_ctl(void)
> +static struct zram_wb_ctl *init_wb_ctl(struct zram *zram)
> {
> struct zram_wb_ctl *wb_ctl;
> int i;
> @@ -797,7 +830,7 @@ static struct zram_wb_ctl *init_wb_ctl(void)
> init_waitqueue_head(&wb_ctl->done_wait);
> spin_lock_init(&wb_ctl->done_lock);
>
> - for (i = 0; i < ZRAM_WB_REQ_CNT; i++) {
> + for (i = 0; i < zram->wb_batch_size; i++) {
> struct zram_wb_req *req;
>
> /*
> @@ -1197,7 +1230,7 @@ static ssize_t writeback_store(struct device *dev,
> goto release_init_lock;
> }
>
> - wb_ctl = init_wb_ctl();
> + wb_ctl = init_wb_ctl(zram);
> if (!wb_ctl) {
> ret = -ENOMEM;
> goto release_init_lock;
> @@ -2840,6 +2873,7 @@ static DEVICE_ATTR_RW(backing_dev);
> static DEVICE_ATTR_WO(writeback);
> static DEVICE_ATTR_RW(writeback_limit);
> static DEVICE_ATTR_RW(writeback_limit_enable);
> +static DEVICE_ATTR_RW(writeback_batch_size);
> #endif
> #ifdef CONFIG_ZRAM_MULTI_COMP
> static DEVICE_ATTR_RW(recomp_algorithm);
> @@ -2861,6 +2895,7 @@ static struct attribute *zram_disk_attrs[] = {
> &dev_attr_writeback.attr,
> &dev_attr_writeback_limit.attr,
> &dev_attr_writeback_limit_enable.attr,
> + &dev_attr_writeback_batch_size.attr,
> #endif
> &dev_attr_io_stat.attr,
> &dev_attr_mm_stat.attr,
> @@ -2922,6 +2957,7 @@ static int zram_add(void)
>
> init_rwsem(&zram->init_lock);
> #ifdef CONFIG_ZRAM_WRITEBACK
> + zram->wb_batch_size = 32;
> spin_lock_init(&zram->wb_limit_lock);
> #endif
>
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index 6cee93f9c0d0..1a647f42c1a4 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -129,6 +129,7 @@ struct zram {
> struct file *backing_dev;
> spinlock_t wb_limit_lock;
> bool wb_limit_enable;
> + u32 wb_batch_size;
> u64 bd_wb_limit;
> struct block_device *bdev;
> unsigned long *bitmap;
> --
> 2.52.0.rc1.455.g30608eb744-goog
>
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [RFC PATCHv5 2/6] zram: add writeback batch size device attr
2025-11-20 15:57 ` Brian Geffon
@ 2025-11-21 1:56 ` Sergey Senozhatsky
0 siblings, 0 replies; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-21 1:56 UTC (permalink / raw)
To: Brian Geffon
Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Yuwen Chen,
Richard Chang, Fengyu Lian, linux-kernel, linux-mm, linux-block
On (25/11/20 10:57), Brian Geffon wrote:
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index 37c1416ac902..7904159e9226 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -588,6 +588,42 @@ static ssize_t writeback_limit_show(struct device *dev,
> > return sysfs_emit(buf, "%llu\n", val);
> > }
> >
> > +static ssize_t writeback_batch_size_store(struct device *dev,
> > + struct device_attribute *attr,
> > + const char *buf, size_t len)
> > +{
> > + struct zram *zram = dev_to_zram(dev);
> > + u32 val;
> > + ssize_t ret = -EINVAL;
> > +
> > + if (kstrtouint(buf, 10, &val))
> > + return ret;
> > +
> > + if (!val)
> > + val = 1;
>
> IMO a value of 0 for the batch size doesn't make sense, -EINVAL?
Ack.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 2/6] zram: add writeback batch size device attr
2025-11-20 15:21 ` [RFC PATCHv5 2/6] zram: add writeback batch size device attr Sergey Senozhatsky
2025-11-20 15:57 ` Brian Geffon
@ 2025-11-21 2:48 ` Sergey Senozhatsky
1 sibling, 0 replies; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-21 2:48 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang,
Brian Geffon, Fengyu Lian, linux-kernel, linux-mm, linux-block
On (25/11/21 00:21), Sergey Senozhatsky wrote:
> +static ssize_t writeback_batch_size_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct zram *zram = dev_to_zram(dev);
> + u32 val;
> + ssize_t ret = -EINVAL;
> +
> + if (kstrtouint(buf, 10, &val))
> + return ret;
> +
> + if (!val)
> + val = 1;
> +
> + down_read(&zram->init_lock);
> + zram->wb_batch_size = val;
> + up_read(&zram->init_lock);
> + ret = len;
> +
> + return ret;
> +}
JFI, as was suggested by Andrew in v4 thread, this should use write
lock.
^ permalink raw reply [flat|nested] 36+ messages in thread
* [RFC PATCHv5 3/6] zram: take write lock in wb limit store handlers
2025-11-20 15:21 [RFC PATCHv5 0/6] zram: introduce writeback bio batching Sergey Senozhatsky
2025-11-20 15:21 ` [RFC PATCHv5 1/6] " Sergey Senozhatsky
2025-11-20 15:21 ` [RFC PATCHv5 2/6] zram: add writeback batch size device attr Sergey Senozhatsky
@ 2025-11-20 15:21 ` Sergey Senozhatsky
2025-11-20 16:03 ` Brian Geffon
2025-11-20 15:21 ` [RFC PATCHv5 4/6] zram: drop wb_limit_lock Sergey Senozhatsky
` (3 subsequent siblings)
6 siblings, 1 reply; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-20 15:21 UTC (permalink / raw)
To: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang
Cc: Brian Geffon, Fengyu Lian, linux-kernel, linux-mm, linux-block,
Sergey Senozhatsky
Write device attrs handlers should take write zram init_lock.
While at it, fixup coding styles.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
drivers/block/zram/zram_drv.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 7904159e9226..71f37b960812 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -519,7 +519,8 @@ struct zram_wb_req {
};
static ssize_t writeback_limit_enable_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
u64 val;
@@ -528,18 +529,19 @@ static ssize_t writeback_limit_enable_store(struct device *dev,
if (kstrtoull(buf, 10, &val))
return ret;
- down_read(&zram->init_lock);
+ down_write(&zram->init_lock);
spin_lock(&zram->wb_limit_lock);
zram->wb_limit_enable = val;
spin_unlock(&zram->wb_limit_lock);
- up_read(&zram->init_lock);
+ up_write(&zram->init_lock);
ret = len;
return ret;
}
static ssize_t writeback_limit_enable_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+ struct device_attribute *attr,
+ char *buf)
{
bool val;
struct zram *zram = dev_to_zram(dev);
@@ -554,7 +556,8 @@ static ssize_t writeback_limit_enable_show(struct device *dev,
}
static ssize_t writeback_limit_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
u64 val;
@@ -563,11 +566,11 @@ static ssize_t writeback_limit_store(struct device *dev,
if (kstrtoull(buf, 10, &val))
return ret;
- down_read(&zram->init_lock);
+ down_write(&zram->init_lock);
spin_lock(&zram->wb_limit_lock);
zram->bd_wb_limit = val;
spin_unlock(&zram->wb_limit_lock);
- up_read(&zram->init_lock);
+ up_write(&zram->init_lock);
ret = len;
return ret;
--
2.52.0.rc1.455.g30608eb744-goog
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [RFC PATCHv5 3/6] zram: take write lock in wb limit store handlers
2025-11-20 15:21 ` [RFC PATCHv5 3/6] zram: take write lock in wb limit store handlers Sergey Senozhatsky
@ 2025-11-20 16:03 ` Brian Geffon
0 siblings, 0 replies; 36+ messages in thread
From: Brian Geffon @ 2025-11-20 16:03 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang,
Fengyu Lian, linux-kernel, linux-mm, linux-block
On Thu, Nov 20, 2025 at 10:22 AM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> Write device attrs handlers should take write zram init_lock.
> While at it, fixup coding styles.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Brian Geffon <bgeffon@google.com>
> ---
> drivers/block/zram/zram_drv.c | 17 ++++++++++-------
> 1 file changed, 10 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 7904159e9226..71f37b960812 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -519,7 +519,8 @@ struct zram_wb_req {
> };
>
> static ssize_t writeback_limit_enable_store(struct device *dev,
> - struct device_attribute *attr, const char *buf, size_t len)
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> {
> struct zram *zram = dev_to_zram(dev);
> u64 val;
> @@ -528,18 +529,19 @@ static ssize_t writeback_limit_enable_store(struct device *dev,
> if (kstrtoull(buf, 10, &val))
> return ret;
>
> - down_read(&zram->init_lock);
> + down_write(&zram->init_lock);
> spin_lock(&zram->wb_limit_lock);
> zram->wb_limit_enable = val;
> spin_unlock(&zram->wb_limit_lock);
> - up_read(&zram->init_lock);
> + up_write(&zram->init_lock);
> ret = len;
>
> return ret;
> }
>
> static ssize_t writeback_limit_enable_show(struct device *dev,
> - struct device_attribute *attr, char *buf)
> + struct device_attribute *attr,
> + char *buf)
> {
> bool val;
> struct zram *zram = dev_to_zram(dev);
> @@ -554,7 +556,8 @@ static ssize_t writeback_limit_enable_show(struct device *dev,
> }
>
> static ssize_t writeback_limit_store(struct device *dev,
> - struct device_attribute *attr, const char *buf, size_t len)
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> {
> struct zram *zram = dev_to_zram(dev);
> u64 val;
> @@ -563,11 +566,11 @@ static ssize_t writeback_limit_store(struct device *dev,
> if (kstrtoull(buf, 10, &val))
> return ret;
>
> - down_read(&zram->init_lock);
> + down_write(&zram->init_lock);
> spin_lock(&zram->wb_limit_lock);
> zram->bd_wb_limit = val;
> spin_unlock(&zram->wb_limit_lock);
> - up_read(&zram->init_lock);
> + up_write(&zram->init_lock);
> ret = len;
>
> return ret;
> --
> 2.52.0.rc1.455.g30608eb744-goog
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* [RFC PATCHv5 4/6] zram: drop wb_limit_lock
2025-11-20 15:21 [RFC PATCHv5 0/6] zram: introduce writeback bio batching Sergey Senozhatsky
` (2 preceding siblings ...)
2025-11-20 15:21 ` [RFC PATCHv5 3/6] zram: take write lock in wb limit store handlers Sergey Senozhatsky
@ 2025-11-20 15:21 ` Sergey Senozhatsky
2025-11-20 16:03 ` Brian Geffon
2025-11-20 15:21 ` [RFC PATCHv5 5/6] zram: rework bdev block allocation Sergey Senozhatsky
` (2 subsequent siblings)
6 siblings, 1 reply; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-20 15:21 UTC (permalink / raw)
To: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang
Cc: Brian Geffon, Fengyu Lian, linux-kernel, linux-mm, linux-block,
Sergey Senozhatsky
We don't need wb_limit_lock. Writeback limit setters take an
exclusive write zram init_lock, while wb_limit modifications
happen only from a single task and under zram read init_lock.
No concurrent wb_limit modifications are possible (we permit
only one post-processing task at a time). Add lockdep
assertions to wb_limit mutators.
While at it, fixup coding styles.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
drivers/block/zram/zram_drv.c | 22 +++++-----------------
drivers/block/zram/zram_drv.h | 1 -
2 files changed, 5 insertions(+), 18 deletions(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 71f37b960812..671ef2ec9b11 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -530,9 +530,7 @@ static ssize_t writeback_limit_enable_store(struct device *dev,
return ret;
down_write(&zram->init_lock);
- spin_lock(&zram->wb_limit_lock);
zram->wb_limit_enable = val;
- spin_unlock(&zram->wb_limit_lock);
up_write(&zram->init_lock);
ret = len;
@@ -547,9 +545,7 @@ static ssize_t writeback_limit_enable_show(struct device *dev,
struct zram *zram = dev_to_zram(dev);
down_read(&zram->init_lock);
- spin_lock(&zram->wb_limit_lock);
val = zram->wb_limit_enable;
- spin_unlock(&zram->wb_limit_lock);
up_read(&zram->init_lock);
return sysfs_emit(buf, "%d\n", val);
@@ -567,9 +563,7 @@ static ssize_t writeback_limit_store(struct device *dev,
return ret;
down_write(&zram->init_lock);
- spin_lock(&zram->wb_limit_lock);
zram->bd_wb_limit = val;
- spin_unlock(&zram->wb_limit_lock);
up_write(&zram->init_lock);
ret = len;
@@ -577,15 +571,13 @@ static ssize_t writeback_limit_store(struct device *dev,
}
static ssize_t writeback_limit_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+ struct device_attribute *attr, char *buf)
{
u64 val;
struct zram *zram = dev_to_zram(dev);
down_read(&zram->init_lock);
- spin_lock(&zram->wb_limit_lock);
val = zram->bd_wb_limit;
- spin_unlock(&zram->wb_limit_lock);
up_read(&zram->init_lock);
return sysfs_emit(buf, "%llu\n", val);
@@ -869,18 +861,18 @@ static struct zram_wb_ctl *init_wb_ctl(struct zram *zram)
static void zram_account_writeback_rollback(struct zram *zram)
{
- spin_lock(&zram->wb_limit_lock);
+ lockdep_assert_held_read(&zram->init_lock);
+
if (zram->wb_limit_enable)
zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12);
- spin_unlock(&zram->wb_limit_lock);
}
static void zram_account_writeback_submit(struct zram *zram)
{
- spin_lock(&zram->wb_limit_lock);
+ lockdep_assert_held_read(&zram->init_lock);
+
if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
- spin_unlock(&zram->wb_limit_lock);
}
static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
@@ -1004,13 +996,10 @@ static int zram_writeback_slots(struct zram *zram,
u32 index = 0;
while ((pps = select_pp_slot(ctl))) {
- spin_lock(&zram->wb_limit_lock);
if (zram->wb_limit_enable && !zram->bd_wb_limit) {
- spin_unlock(&zram->wb_limit_lock);
ret = -EIO;
break;
}
- spin_unlock(&zram->wb_limit_lock);
while (!req) {
req = zram_select_idle_req(wb_ctl);
@@ -2961,7 +2950,6 @@ static int zram_add(void)
init_rwsem(&zram->init_lock);
#ifdef CONFIG_ZRAM_WRITEBACK
zram->wb_batch_size = 32;
- spin_lock_init(&zram->wb_limit_lock);
#endif
/* gendisk structure */
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 1a647f42c1a4..c6d94501376c 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -127,7 +127,6 @@ struct zram {
bool claim; /* Protected by disk->open_mutex */
#ifdef CONFIG_ZRAM_WRITEBACK
struct file *backing_dev;
- spinlock_t wb_limit_lock;
bool wb_limit_enable;
u32 wb_batch_size;
u64 bd_wb_limit;
--
2.52.0.rc1.455.g30608eb744-goog
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [RFC PATCHv5 4/6] zram: drop wb_limit_lock
2025-11-20 15:21 ` [RFC PATCHv5 4/6] zram: drop wb_limit_lock Sergey Senozhatsky
@ 2025-11-20 16:03 ` Brian Geffon
0 siblings, 0 replies; 36+ messages in thread
From: Brian Geffon @ 2025-11-20 16:03 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang,
Fengyu Lian, linux-kernel, linux-mm, linux-block
On Thu, Nov 20, 2025 at 10:22 AM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> We don't need wb_limit_lock. Writeback limit setters take an
> exclusive write zram init_lock, while wb_limit modifications
> happen only from a single task and under zram read init_lock.
> No concurrent wb_limit modifications are possible (we permit
> only one post-processing task at a time). Add lockdep
> assertions to wb_limit mutators.
>
> While at it, fixup coding styles.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Brian Geffon <bgeffon@google.com>
> ---
> drivers/block/zram/zram_drv.c | 22 +++++-----------------
> drivers/block/zram/zram_drv.h | 1 -
> 2 files changed, 5 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 71f37b960812..671ef2ec9b11 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -530,9 +530,7 @@ static ssize_t writeback_limit_enable_store(struct device *dev,
> return ret;
>
> down_write(&zram->init_lock);
> - spin_lock(&zram->wb_limit_lock);
> zram->wb_limit_enable = val;
> - spin_unlock(&zram->wb_limit_lock);
> up_write(&zram->init_lock);
> ret = len;
>
> @@ -547,9 +545,7 @@ static ssize_t writeback_limit_enable_show(struct device *dev,
> struct zram *zram = dev_to_zram(dev);
>
> down_read(&zram->init_lock);
> - spin_lock(&zram->wb_limit_lock);
> val = zram->wb_limit_enable;
> - spin_unlock(&zram->wb_limit_lock);
> up_read(&zram->init_lock);
>
> return sysfs_emit(buf, "%d\n", val);
> @@ -567,9 +563,7 @@ static ssize_t writeback_limit_store(struct device *dev,
> return ret;
>
> down_write(&zram->init_lock);
> - spin_lock(&zram->wb_limit_lock);
> zram->bd_wb_limit = val;
> - spin_unlock(&zram->wb_limit_lock);
> up_write(&zram->init_lock);
> ret = len;
>
> @@ -577,15 +571,13 @@ static ssize_t writeback_limit_store(struct device *dev,
> }
>
> static ssize_t writeback_limit_show(struct device *dev,
> - struct device_attribute *attr, char *buf)
> + struct device_attribute *attr, char *buf)
> {
> u64 val;
> struct zram *zram = dev_to_zram(dev);
>
> down_read(&zram->init_lock);
> - spin_lock(&zram->wb_limit_lock);
> val = zram->bd_wb_limit;
> - spin_unlock(&zram->wb_limit_lock);
> up_read(&zram->init_lock);
>
> return sysfs_emit(buf, "%llu\n", val);
> @@ -869,18 +861,18 @@ static struct zram_wb_ctl *init_wb_ctl(struct zram *zram)
>
> static void zram_account_writeback_rollback(struct zram *zram)
> {
> - spin_lock(&zram->wb_limit_lock);
> + lockdep_assert_held_read(&zram->init_lock);
> +
> if (zram->wb_limit_enable)
> zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12);
> - spin_unlock(&zram->wb_limit_lock);
> }
>
> static void zram_account_writeback_submit(struct zram *zram)
> {
> - spin_lock(&zram->wb_limit_lock);
> + lockdep_assert_held_read(&zram->init_lock);
> +
> if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
> zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
> - spin_unlock(&zram->wb_limit_lock);
> }
>
> static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
> @@ -1004,13 +996,10 @@ static int zram_writeback_slots(struct zram *zram,
> u32 index = 0;
>
> while ((pps = select_pp_slot(ctl))) {
> - spin_lock(&zram->wb_limit_lock);
> if (zram->wb_limit_enable && !zram->bd_wb_limit) {
> - spin_unlock(&zram->wb_limit_lock);
> ret = -EIO;
> break;
> }
> - spin_unlock(&zram->wb_limit_lock);
>
> while (!req) {
> req = zram_select_idle_req(wb_ctl);
> @@ -2961,7 +2950,6 @@ static int zram_add(void)
> init_rwsem(&zram->init_lock);
> #ifdef CONFIG_ZRAM_WRITEBACK
> zram->wb_batch_size = 32;
> - spin_lock_init(&zram->wb_limit_lock);
> #endif
>
> /* gendisk structure */
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index 1a647f42c1a4..c6d94501376c 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -127,7 +127,6 @@ struct zram {
> bool claim; /* Protected by disk->open_mutex */
> #ifdef CONFIG_ZRAM_WRITEBACK
> struct file *backing_dev;
> - spinlock_t wb_limit_lock;
> bool wb_limit_enable;
> u32 wb_batch_size;
> u64 bd_wb_limit;
> --
> 2.52.0.rc1.455.g30608eb744-goog
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* [RFC PATCHv5 5/6] zram: rework bdev block allocation
2025-11-20 15:21 [RFC PATCHv5 0/6] zram: introduce writeback bio batching Sergey Senozhatsky
` (3 preceding siblings ...)
2025-11-20 15:21 ` [RFC PATCHv5 4/6] zram: drop wb_limit_lock Sergey Senozhatsky
@ 2025-11-20 15:21 ` Sergey Senozhatsky
2025-11-20 16:35 ` Brian Geffon
2025-11-20 15:21 ` [RFC PATCHv5 6/6] zram: read slot block idx under slot lock Sergey Senozhatsky
2025-11-21 7:14 ` [RFC PATCHv5 0/6] zram: introduce writeback bio batching Yuwen Chen
6 siblings, 1 reply; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-20 15:21 UTC (permalink / raw)
To: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang
Cc: Brian Geffon, Fengyu Lian, linux-kernel, linux-mm, linux-block,
Sergey Senozhatsky
First, writeback bdev ->bitmap bits are set only from one
context, as we can have only one single task performing
writeback, so we cannot race with anything else. Remove
retry path.
Second, we always check ZRAM_WB flag to distinguish written-back
slots, so we should not confuse 0 bdev block index and 0 handle.
We can use first bdev block (0 bit) for writeback as well.
While at it, give functions slightly more accurate names, as
we don't alloc/free anything there, we reserve a block for
async writeback or release the block.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
drivers/block/zram/zram_drv.c | 37 +++++++++++++++++------------------
1 file changed, 18 insertions(+), 19 deletions(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 671ef2ec9b11..ecbd9b25dfed 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -500,6 +500,8 @@ static ssize_t idle_store(struct device *dev,
}
#ifdef CONFIG_ZRAM_WRITEBACK
+#define INVALID_BDEV_BLOCK (~0UL)
+
struct zram_wb_ctl {
struct list_head idle_reqs;
struct list_head done_reqs;
@@ -746,23 +748,20 @@ static ssize_t backing_dev_store(struct device *dev,
return err;
}
-static unsigned long alloc_block_bdev(struct zram *zram)
+static unsigned long zram_reserve_bdev_block(struct zram *zram)
{
- unsigned long blk_idx = 1;
-retry:
- /* skip 0 bit to confuse zram.handle = 0 */
- blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
- if (blk_idx == zram->nr_pages)
- return 0;
+ unsigned long blk_idx;
- if (test_and_set_bit(blk_idx, zram->bitmap))
- goto retry;
+ blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, 0);
+ if (blk_idx == zram->nr_pages)
+ return INVALID_BDEV_BLOCK;
+ set_bit(blk_idx, zram->bitmap);
atomic64_inc(&zram->stats.bd_count);
return blk_idx;
}
-static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
+static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
{
int was_set;
@@ -887,7 +886,7 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
* (if enabled).
*/
zram_account_writeback_rollback(zram);
- free_block_bdev(zram, req->blk_idx);
+ zram_release_bdev_block(zram, req->blk_idx);
return err;
}
@@ -901,7 +900,7 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
* finishes.
*/
if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) {
- free_block_bdev(zram, req->blk_idx);
+ zram_release_bdev_block(zram, req->blk_idx);
goto out;
}
@@ -989,8 +988,8 @@ static int zram_writeback_slots(struct zram *zram,
struct zram_pp_ctl *ctl,
struct zram_wb_ctl *wb_ctl)
{
+ unsigned long blk_idx = INVALID_BDEV_BLOCK;
struct zram_wb_req *req = NULL;
- unsigned long blk_idx = 0;
struct zram_pp_slot *pps;
int ret = 0, err = 0;
u32 index = 0;
@@ -1022,9 +1021,9 @@ static int zram_writeback_slots(struct zram *zram,
ret = err;
}
- if (!blk_idx) {
- blk_idx = alloc_block_bdev(zram);
- if (!blk_idx) {
+ if (blk_idx == INVALID_BDEV_BLOCK) {
+ blk_idx = zram_reserve_bdev_block(zram);
+ if (blk_idx == INVALID_BDEV_BLOCK) {
ret = -ENOSPC;
break;
}
@@ -1058,7 +1057,7 @@ static int zram_writeback_slots(struct zram *zram,
__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
zram_submit_wb_request(zram, wb_ctl, req);
- blk_idx = 0;
+ blk_idx = INVALID_BDEV_BLOCK;
req = NULL;
cond_resched();
continue;
@@ -1365,7 +1364,7 @@ static int read_from_bdev(struct zram *zram, struct page *page,
return -EIO;
}
-static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
+static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
{
}
#endif
@@ -1889,7 +1888,7 @@ static void zram_free_page(struct zram *zram, size_t index)
if (zram_test_flag(zram, index, ZRAM_WB)) {
zram_clear_flag(zram, index, ZRAM_WB);
- free_block_bdev(zram, zram_get_handle(zram, index));
+ zram_release_bdev_block(zram, zram_get_handle(zram, index));
goto out;
}
--
2.52.0.rc1.455.g30608eb744-goog
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [RFC PATCHv5 5/6] zram: rework bdev block allocation
2025-11-20 15:21 ` [RFC PATCHv5 5/6] zram: rework bdev block allocation Sergey Senozhatsky
@ 2025-11-20 16:35 ` Brian Geffon
0 siblings, 0 replies; 36+ messages in thread
From: Brian Geffon @ 2025-11-20 16:35 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang,
Fengyu Lian, linux-kernel, linux-mm, linux-block
On Thu, Nov 20, 2025 at 10:22 AM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> First, writeback bdev ->bitmap bits are set only from one
> context, as we can have only one single task performing
> writeback, so we cannot race with anything else. Remove
> retry path.
>
> Second, we always check ZRAM_WB flag to distinguish written-back
> slots, so we should not confuse 0 bdev block index and 0 handle.
> We can use first bdev block (0 bit) for writeback as well.
>
> While at it, give functions slightly more accurate names, as
> we don't alloc/free anything there, we reserve a block for
> async writeback or release the block.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Brian Geffon <bgeffon@google.com>
> ---
> drivers/block/zram/zram_drv.c | 37 +++++++++++++++++------------------
> 1 file changed, 18 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 671ef2ec9b11..ecbd9b25dfed 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -500,6 +500,8 @@ static ssize_t idle_store(struct device *dev,
> }
>
> #ifdef CONFIG_ZRAM_WRITEBACK
> +#define INVALID_BDEV_BLOCK (~0UL)
> +
> struct zram_wb_ctl {
> struct list_head idle_reqs;
> struct list_head done_reqs;
> @@ -746,23 +748,20 @@ static ssize_t backing_dev_store(struct device *dev,
> return err;
> }
>
> -static unsigned long alloc_block_bdev(struct zram *zram)
> +static unsigned long zram_reserve_bdev_block(struct zram *zram)
> {
> - unsigned long blk_idx = 1;
> -retry:
> - /* skip 0 bit to confuse zram.handle = 0 */
> - blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
> - if (blk_idx == zram->nr_pages)
> - return 0;
> + unsigned long blk_idx;
>
> - if (test_and_set_bit(blk_idx, zram->bitmap))
> - goto retry;
> + blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, 0);
> + if (blk_idx == zram->nr_pages)
> + return INVALID_BDEV_BLOCK;
>
> + set_bit(blk_idx, zram->bitmap);
> atomic64_inc(&zram->stats.bd_count);
> return blk_idx;
> }
>
> -static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
> +static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
> {
> int was_set;
>
> @@ -887,7 +886,7 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
> * (if enabled).
> */
> zram_account_writeback_rollback(zram);
> - free_block_bdev(zram, req->blk_idx);
> + zram_release_bdev_block(zram, req->blk_idx);
> return err;
> }
>
> @@ -901,7 +900,7 @@ static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
> * finishes.
> */
> if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) {
> - free_block_bdev(zram, req->blk_idx);
> + zram_release_bdev_block(zram, req->blk_idx);
> goto out;
> }
>
> @@ -989,8 +988,8 @@ static int zram_writeback_slots(struct zram *zram,
> struct zram_pp_ctl *ctl,
> struct zram_wb_ctl *wb_ctl)
> {
> + unsigned long blk_idx = INVALID_BDEV_BLOCK;
> struct zram_wb_req *req = NULL;
> - unsigned long blk_idx = 0;
> struct zram_pp_slot *pps;
> int ret = 0, err = 0;
> u32 index = 0;
> @@ -1022,9 +1021,9 @@ static int zram_writeback_slots(struct zram *zram,
> ret = err;
> }
>
> - if (!blk_idx) {
> - blk_idx = alloc_block_bdev(zram);
> - if (!blk_idx) {
> + if (blk_idx == INVALID_BDEV_BLOCK) {
> + blk_idx = zram_reserve_bdev_block(zram);
> + if (blk_idx == INVALID_BDEV_BLOCK) {
> ret = -ENOSPC;
> break;
> }
> @@ -1058,7 +1057,7 @@ static int zram_writeback_slots(struct zram *zram,
> __bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
>
> zram_submit_wb_request(zram, wb_ctl, req);
> - blk_idx = 0;
> + blk_idx = INVALID_BDEV_BLOCK;
> req = NULL;
> cond_resched();
> continue;
> @@ -1365,7 +1364,7 @@ static int read_from_bdev(struct zram *zram, struct page *page,
> return -EIO;
> }
>
> -static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
> +static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
> {
> }
> #endif
> @@ -1889,7 +1888,7 @@ static void zram_free_page(struct zram *zram, size_t index)
>
> if (zram_test_flag(zram, index, ZRAM_WB)) {
> zram_clear_flag(zram, index, ZRAM_WB);
> - free_block_bdev(zram, zram_get_handle(zram, index));
> + zram_release_bdev_block(zram, zram_get_handle(zram, index));
> goto out;
> }
>
> --
> 2.52.0.rc1.455.g30608eb744-goog
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* [RFC PATCHv5 6/6] zram: read slot block idx under slot lock
2025-11-20 15:21 [RFC PATCHv5 0/6] zram: introduce writeback bio batching Sergey Senozhatsky
` (4 preceding siblings ...)
2025-11-20 15:21 ` [RFC PATCHv5 5/6] zram: rework bdev block allocation Sergey Senozhatsky
@ 2025-11-20 15:21 ` Sergey Senozhatsky
2025-11-20 18:13 ` Brian Geffon
2025-11-24 14:49 ` Brian Geffon
2025-11-21 7:14 ` [RFC PATCHv5 0/6] zram: introduce writeback bio batching Yuwen Chen
6 siblings, 2 replies; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-20 15:21 UTC (permalink / raw)
To: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang
Cc: Brian Geffon, Fengyu Lian, linux-kernel, linux-mm, linux-block,
Sergey Senozhatsky
Read slot's block id under slot-lock. We release the
slot-lock for bdev read so, technically, slot still can
get freed in the meantime, but at least we will read
bdev block (page) that holds previously known slot data,
not from slot->handle bdev block, which can be anything
at that point.
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
drivers/block/zram/zram_drv.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index ecbd9b25dfed..1f2867cd587e 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1994,14 +1994,14 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index,
ret = zram_read_from_zspool(zram, page, index);
zram_slot_unlock(zram, index);
} else {
+ unsigned long blk_idx = zram_get_handle(zram, index);
+
/*
* The slot should be unlocked before reading from the backing
* device.
*/
zram_slot_unlock(zram, index);
-
- ret = read_from_bdev(zram, page, zram_get_handle(zram, index),
- parent);
+ ret = read_from_bdev(zram, page, blk_idx, parent);
}
/* Should NEVER happen. Return bio error if it does. */
--
2.52.0.rc1.455.g30608eb744-goog
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [RFC PATCHv5 6/6] zram: read slot block idx under slot lock
2025-11-20 15:21 ` [RFC PATCHv5 6/6] zram: read slot block idx under slot lock Sergey Senozhatsky
@ 2025-11-20 18:13 ` Brian Geffon
2025-11-24 14:49 ` Brian Geffon
1 sibling, 0 replies; 36+ messages in thread
From: Brian Geffon @ 2025-11-20 18:13 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang,
Fengyu Lian, linux-kernel, linux-mm, linux-block
On Thu, Nov 20, 2025 at 10:22 AM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> Read slot's block id under slot-lock. We release the
> slot-lock for bdev read so, technically, slot still can
> get freed in the meantime, but at least we will read
> bdev block (page) that holds previously known slot data,
> not from slot->handle bdev block, which can be anything
> at that point.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
> drivers/block/zram/zram_drv.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index ecbd9b25dfed..1f2867cd587e 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -1994,14 +1994,14 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index,
> ret = zram_read_from_zspool(zram, page, index);
> zram_slot_unlock(zram, index);
> } else {
> + unsigned long blk_idx = zram_get_handle(zram, index);
> +
> /*
> * The slot should be unlocked before reading from the backing
> * device.
> */
> zram_slot_unlock(zram, index);
> -
> - ret = read_from_bdev(zram, page, zram_get_handle(zram, index),
> - parent);
> + ret = read_from_bdev(zram, page, blk_idx, parent);
This patch doesn't really change things in a meaningful way, wouldn't
it be more correct to take the slot lock again after the read and
confirm the handle is the same and it's still ZRAM_WB?
> }
>
> /* Should NEVER happen. Return bio error if it does. */
> --
> 2.52.0.rc1.455.g30608eb744-goog
>
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [RFC PATCHv5 6/6] zram: read slot block idx under slot lock
2025-11-20 15:21 ` [RFC PATCHv5 6/6] zram: read slot block idx under slot lock Sergey Senozhatsky
2025-11-20 18:13 ` Brian Geffon
@ 2025-11-24 14:49 ` Brian Geffon
1 sibling, 0 replies; 36+ messages in thread
From: Brian Geffon @ 2025-11-24 14:49 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Andrew Morton, Minchan Kim, Yuwen Chen, Richard Chang,
Fengyu Lian, linux-kernel, linux-mm, linux-block
On Thu, Nov 20, 2025 at 10:22 AM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> Read slot's block id under slot-lock. We release the
> slot-lock for bdev read so, technically, slot still can
> get freed in the meantime, but at least we will read
> bdev block (page) that holds previous know slot data,
> not from slot->handle bdev block, which can be anything
> at that point.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
After taking another look I think this is a worthwhile improvement.
Reviewed-by: Brian Geffon <bgeffon@google.com>
> ---
> drivers/block/zram/zram_drv.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index ecbd9b25dfed..1f2867cd587e 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -1994,14 +1994,14 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index,
> ret = zram_read_from_zspool(zram, page, index);
> zram_slot_unlock(zram, index);
> } else {
> + unsigned long blk_idx = zram_get_handle(zram, index);
> +
> /*
> * The slot should be unlocked before reading from the backing
> * device.
> */
> zram_slot_unlock(zram, index);
> -
> - ret = read_from_bdev(zram, page, zram_get_handle(zram, index),
> - parent);
> + ret = read_from_bdev(zram, page, blk_idx, parent);
> }
>
> /* Should NEVER happen. Return bio error if it does. */
> --
> 2.52.0.rc1.455.g30608eb744-goog
>
^ permalink raw reply [flat|nested] 36+ messages in thread
* [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-20 15:21 [RFC PATCHv5 0/6] zram: introduce writeback bio batching Sergey Senozhatsky
` (5 preceding siblings ...)
2025-11-20 15:21 ` [RFC PATCHv5 6/6] zram: read slot block idx under slot lock Sergey Senozhatsky
@ 2025-11-21 7:14 ` Yuwen Chen
2025-11-21 7:32 ` Sergey Senozhatsky
6 siblings, 1 reply; 36+ messages in thread
From: Yuwen Chen @ 2025-11-21 7:14 UTC (permalink / raw)
To: senozhatsky
Cc: akpm, bgeffon, licayy, linux-block, linux-kernel, linux-mm,
minchan, richardycc, ywen.chen
On Fri, 21 Nov 2025 00:21:20 +0900, Sergey Senozhatsky wrote:
> This is a different approach compared to [1]. Instead of
> using blk plug API to batch writeback bios, we just keep
> submitting them and track available of done/idle requests
> (we still use a pool of requests, to put a constraint on
> memory usage). The intuition is that blk plug API is good
> for sequential IO patterns, but zram writeback is more
> likely to use random IO patterns.
> I only did minimal testing so far (in a VM). More testing
> (on real H/W) is needed, any help is highly appreciated.
I conducted a test on an NVMe host. When all requests were random,
this fix was indeed a bit faster than the previous one.
before:
real 0m0.261s
user 0m0.000s
sys 0m0.243s
real 0m0.260s
user 0m0.000s
sys 0m0.244s
real 0m0.259s
user 0m0.000s
sys 0m0.243s
after:
real 0m0.322s
user 0m0.000s
sys 0m0.214s
real 0m0.326s
user 0m0.000s
sys 0m0.206s
real 0m0.325s
user 0m0.000s
sys 0m0.215s
This result is something to be happy about. However, I'm also quite
curious about the test results on devices like UFS, which have
relatively less internal memory.
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-21 7:14 ` [RFC PATCHv5 0/6] zram: introduce writeback bio batching Yuwen Chen
@ 2025-11-21 7:32 ` Sergey Senozhatsky
2025-11-21 7:44 ` Yuwen Chen
0 siblings, 1 reply; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-21 7:32 UTC (permalink / raw)
To: Yuwen Chen
Cc: senozhatsky, akpm, bgeffon, licayy, linux-block, linux-kernel,
linux-mm, minchan, richardycc
On (25/11/21 15:14), Yuwen Chen wrote:
> On Fri, 21 Nov 2025 00:21:20 +0900, Sergey Senozhatsky wrote:
> > This is a different approach compared to [1]. Instead of
> > using blk plug API to batch writeback bios, we just keep
> > submitting them and track availability of done/idle requests
> > (we still use a pool of requests, to put a constraint on
> > memory usage). The intuition is that blk plug API is good
> > for sequential IO patterns, but zram writeback is more
> > likely to use random IO patterns.
>
> > I only did minimal testing so far (in a VM). More testing
> > (on real H/W) is needed, any help is highly appreciated.
>
> I conducted a test on an NVMe host. When all requests were random,
> this fix was indeed a bit faster than the previous one.
Is "before" blk-plug based approach and "after" this new approach?
> before:
> real 0m0.261s
> user 0m0.000s
> sys 0m0.243s
>
> real 0m0.260s
> user 0m0.000s
> sys 0m0.244s
>
> real 0m0.259s
> user 0m0.000s
> sys 0m0.243s
>
> after:
> real 0m0.322s
> user 0m0.000s
> sys 0m0.214s
>
> real 0m0.326s
> user 0m0.000s
> sys 0m0.206s
>
> real 0m0.325s
> user 0m0.000s
> sys 0m0.215s
Hmm that's less than was anticipated.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-21 7:32 ` Sergey Senozhatsky
@ 2025-11-21 7:44 ` Yuwen Chen
2025-11-21 7:58 ` Sergey Senozhatsky
0 siblings, 1 reply; 36+ messages in thread
From: Yuwen Chen @ 2025-11-21 7:44 UTC (permalink / raw)
To: senozhatsky
Cc: akpm, bgeffon, licayy, linux-block, linux-kernel, linux-mm,
minchan, richardycc, ywen.chen
On Fri, 21 Nov 2025 16:32:27 +0900, Sergey Senozhatsky wrote:
> Is "before" blk-plug based approach and "after" this new approach?
Sorry, I got the before and after mixed up.
In addition, I also have some related questions to consult:
1. Will page fault exceptions be delayed during the writeback processing?
2. Since the loop device uses a work queue to handle requests, when
the system load is relatively high, will it have a relatively large
impact on the latency of page fault exceptions? Is there any way to solve
this problem?
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-21 7:44 ` Yuwen Chen
@ 2025-11-21 7:58 ` Sergey Senozhatsky
2025-11-21 8:23 ` Yuwen Chen
0 siblings, 1 reply; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-21 7:58 UTC (permalink / raw)
To: Yuwen Chen
Cc: senozhatsky, akpm, bgeffon, licayy, linux-block, linux-kernel,
linux-mm, minchan, richardycc
On (25/11/21 15:44), Yuwen Chen wrote:
> On Fri, 21 Nov 2025 16:32:27 +0900, Sergey Senozhatsky wrote:
> > Is "before" blk-plug based approach and "after" this new approach?
>
> Sorry, I got the before and after mixed up.
No problem. I wonder if the effect is more visible on larger data sets.
0.3 second sounds like a very short write. In my VM tests I couldn't get
more than 2 inflight requests at a time, I guess because decompression
was much slower than IO. I wonder how many inflight requests you had in
your tests.
> In addition, I also have some related questions to consult:
>
> 1. Will page fault exceptions be delayed during the writeback processing?
I don't think our reads are blocked by writes.
> 2. Since the loop device uses a work queue to handle requests, when
> the system load is relatively high, will it have a relatively large
> impact on the latency of page fault exceptions? Is there any way to solve
> this problem?
I think page-fault latency of a written-back page is expected to be
higher, that's a trade-off that we agree on. Off the top of my head,
I don't think we can do anything about it.
Is a loop device always used for writeback targets?
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-21 7:58 ` Sergey Senozhatsky
@ 2025-11-21 8:23 ` Yuwen Chen
2025-11-21 9:12 ` Sergey Senozhatsky
0 siblings, 1 reply; 36+ messages in thread
From: Yuwen Chen @ 2025-11-21 8:23 UTC (permalink / raw)
To: senozhatsky
Cc: akpm, bgeffon, licayy, linux-block, linux-kernel, linux-mm,
minchan, richardycc, ywen.chen
On Fri, 21 Nov 2025 16:58:41 +0900, Sergey Senozhatsky wrote:
> No problem. I wonder if the effect is more visible on larger data sets.
> 0.3 second sounds like a very short write. In my VM tests I couldn't get
> more than 2 inflight requests at a time, I guess because decompression
> was much slower than IO. I wonder how many inflight requests you had in
> your tests.
I used the following code for testing here, and the result was 32.
code:
@@ -983,6 +983,7 @@ static int zram_writeback_slots(struct zram *zram,
struct zram_pp_slot *pps;
int ret = 0, err = 0;
u32 index = 0;
+ int inflight = 0;
while ((pps = select_pp_slot(ctl))) {
spin_lock(&zram->wb_limit_lock);
@@ -993,6 +994,9 @@ static int zram_writeback_slots(struct zram *zram,
}
spin_unlock(&zram->wb_limit_lock);
+ if (inflight < atomic_read(&wb_ctl->num_inflight))
+ inflight = atomic_read(&wb_ctl->num_inflight);
+
while (!req) {
req = zram_select_idle_req(wb_ctl);
if (req)
@@ -1074,6 +1078,7 @@ next:
ret = err;
}
+ pr_err("%s: inflight max: %d\n", __func__, inflight);
return ret;
}
log:
[3741949.842927] zram: zram_writeback_slots: inflight max: 32
Changing ZRAM_WB_REQ_CNT to 64 didn't shorten the overall time.
> I think page-fault latency of a written-back page is expected to be
> higher, that's a trade-off that we agree on. Off the top of my head,
> I don't think we can do anything about it.
>
> Is loop device always used as for writeback targets?
On the Android platform, currently only the loop device is supported as
the backend for writeback, possibly for security reasons. I noticed that
EROFS has implemented a CONFIG_EROFS_FS_BACKED_BY_FILE to reduce this
latency. I think ZRAM might also be able to do this.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-21 8:23 ` Yuwen Chen
@ 2025-11-21 9:12 ` Sergey Senozhatsky
2025-11-21 12:21 ` Gao Xiang
0 siblings, 1 reply; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-21 9:12 UTC (permalink / raw)
To: Yuwen Chen
Cc: senozhatsky, akpm, bgeffon, licayy, linux-block, linux-kernel,
linux-mm, minchan, richardycc
On (25/11/21 16:23), Yuwen Chen wrote:
> I used the following code for testing here, and the result was 32.
>
> code:
> @@ -983,6 +983,7 @@ static int zram_writeback_slots(struct zram *zram,
> struct zram_pp_slot *pps;
> int ret = 0, err = 0;
> u32 index = 0;
> + int inflight = 0;
>
> while ((pps = select_pp_slot(ctl))) {
> spin_lock(&zram->wb_limit_lock);
> @@ -993,6 +994,9 @@ static int zram_writeback_slots(struct zram *zram,
> }
> spin_unlock(&zram->wb_limit_lock);
>
> + if (inflight < atomic_read(&wb_ctl->num_inflight))
> + inflight = atomic_read(&wb_ctl->num_inflight);
> +
> while (!req) {
> req = zram_select_idle_req(wb_ctl);
> if (req)
> @@ -1074,6 +1078,7 @@ next:
> ret = err;
> }
>
> + pr_err("%s: inflight max: %d\n", __func__, inflight);
> return ret;
> }
I think this will always give you 32 (or you current batch size limit),
just because the way it works - we first deplete all ->idle (reaching
max ->inflight) and only then complete finished requests (dropping
->inflight).
I had a version of the patch that had different main loop. It would
always first complete finished requests. I think this one will give
accurate ->inflight number.
---
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index ab0785878069..398609e9d061 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -999,13 +999,6 @@ static int zram_writeback_slots(struct zram *zram,
}
while (!req) {
- req = zram_select_idle_req(wb_ctl);
- if (req)
- break;
-
- wait_event(wb_ctl->done_wait,
- !list_empty(&wb_ctl->done_reqs));
-
err = zram_complete_done_reqs(zram, wb_ctl);
/*
* BIO errors are not fatal, we continue and simply
@@ -1017,6 +1010,13 @@ static int zram_writeback_slots(struct zram *zram,
*/
if (err)
ret = err;
+
+ req = zram_select_idle_req(wb_ctl);
+ if (req)
+ break;
+
+ wait_event(wb_ctl->done_wait,
+ !list_empty(&wb_ctl->done_reqs));
}
if (blk_idx == INVALID_BDEV_BLOCK) {
---
> > I think page-fault latency of a written-back page is expected to be
> > higher, that's a trade-off that we agree on. Off the top of my head,
> > I don't think we can do anything about it.
> >
> > Is loop device always used as for writeback targets?
>
> On the Android platform, currently only the loop device is supported as
> the backend for writeback, possibly for security reasons. I noticed that
> EROFS has implemented a CONFIG_EROFS_FS_BACKED_BY_FILE to reduce this
> latency. I think ZRAM might also be able to do this.
I see. Do you use S/W or H/W compression?
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-21 9:12 ` Sergey Senozhatsky
@ 2025-11-21 12:21 ` Gao Xiang
2025-11-21 12:43 ` Gao Xiang
2025-11-22 10:07 ` Sergey Senozhatsky
0 siblings, 2 replies; 36+ messages in thread
From: Gao Xiang @ 2025-11-21 12:21 UTC (permalink / raw)
To: Sergey Senozhatsky, Yuwen Chen
Cc: akpm, bgeffon, licayy, linux-block, linux-kernel, linux-mm,
minchan, richardycc
On 2025/11/21 17:12, Sergey Senozhatsky wrote:
> On (25/11/21 16:23), Yuwen Chen wrote:
..
>>> I think page-fault latency of a written-back page is expected to be
>>> higher, that's a trade-off that we agree on. Off the top of my head,
>>> I don't think we can do anything about it.
>>>
>>> Is loop device always used as for writeback targets?
>>
>> On the Android platform, currently only the loop device is supported as
>> the backend for writeback, possibly for security reasons. I noticed that
>> EROFS has implemented a CONFIG_EROFS_FS_BACKED_BY_FILE to reduce this
>> latency. I think ZRAM might also be able to do this.
>
> I see. Do you use S/W or H/W compression?
No, I'm pretty sure it's impossible for zram to access
file I/Os without another thread context (e.g. workqueue),
especially for write I/Os, which is unlike erofs:
EROFS can do because EROFS is a specific filesystem, you
could see it's a seperate fs, and it can only read (no
write context) backing files in erofs and/or other fses,
which is much like vfs/overlayfs read_iter() directly
going into the backing fses without nested contexts.
(Even if loop is used, it will create its own thread
contexts with workqueues, which is safe.)
On the other hand, zram/loop can act as a virtual block
device which is rather different, which means you could
format an ext4 filesystem and backing another ext4/btrfs,
like this:
zram(ext4) -> backing ext4/btrfs
It's unsafe (in addition to GFP_NOIO allocation
restriction) since zram cannot manage those ext4/btrfs
existing contexts:
- Take one detailed example, if the upper zram ext4
assigns current->journal_info = xxx, and submit_bio() to
zram, which will confuse the backing ext4 since it should
assume current->journal_info == NULL, so the virtual block
devices need another thread context to isolate those two
different uncontrolled contexts.
So I don't think it's feasible for block drivers to act
like this, especially mixing with writing to backing fses
operations.
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-21 12:21 ` Gao Xiang
@ 2025-11-21 12:43 ` Gao Xiang
2025-11-22 10:07 ` Sergey Senozhatsky
1 sibling, 0 replies; 36+ messages in thread
From: Gao Xiang @ 2025-11-21 12:43 UTC (permalink / raw)
To: Sergey Senozhatsky, Yuwen Chen
Cc: akpm, bgeffon, licayy, linux-block, linux-kernel, linux-mm,
minchan, richardycc
On 2025/11/21 20:21, Gao Xiang wrote:
>
>
> On 2025/11/21 17:12, Sergey Senozhatsky wrote:
>> On (25/11/21 16:23), Yuwen Chen wrote:
>
> ..
>
>
>>>> I think page-fault latency of a written-back page is expected to be
>>>> higher, that's a trade-off that we agree on. Off the top of my head,
>>>> I don't think we can do anything about it.
>>>>
>>>> Is loop device always used as for writeback targets?
>>>
>>> On the Android platform, currently only the loop device is supported as
>>> the backend for writeback, possibly for security reasons. I noticed that
>>> EROFS has implemented a CONFIG_EROFS_FS_BACKED_BY_FILE to reduce this
>>> latency. I think ZRAM might also be able to do this.
>>
>> I see. Do you use S/W or H/W compression?
>
> No, I'm pretty sure it's impossible for zram to access
> file I/Os without another thread context (e.g. workqueue),
> especially for write I/Os, which is unlike erofs:
>
> EROFS can do because EROFS is a specific filesystem, you
> could see it's a separate fs, and it can only read (no
> write context) backing files in erofs and/or other fses,
> which is much like vfs/overlayfs read_iter() directly
> going into the backing fses without nested contexts.
> (Even if loop is used, it will create its own thread
> contexts with workqueues, which is safe.)
>
> On the other hand, zram/loop can act as a virtual block
> device which is rather different, which means you could
> format an ext4 filesystem and backing another ext4/btrfs,
> like this:
>
> zram(ext4) -> backing ext4/btrfs
>
> It's unsafe (in addition to GFP_NOIO allocation
> restriction) since zram cannot manage those ext4/btrfs
> existing contexts:
>
> - Take one detailed example, if the upper zram ext4
> assigns current->journal_info = xxx, and submit_bio() to
> zram, which will confuse the backing ext4 since it should
> assume current->journal_info == NULL, so the virtual block
> devices need another thread context to isolate those two
> different uncontrolled contexts.
>
> So I don't think it's feasible for block drivers to act
> like this, especially mixing with writing to backing fses
> operations.
In other words, a fs claims it can do file-backed-mounts
without a new context only if:
- Its own implementation can be safely applied to any
other kernel filesystem (e.g., it shouldn't change
current->journal_info or do context save/restore before
handing over, for example); and its own implementation
can safely mount itself with file-backed mounts.
So it's filesystem-specific internals to make sure it can
work like this (for example ext4 on erofs, ext4 still uses
loop to mount). The virtual block device layer knows
nothing about what the upper filesystem did before the
execution passes through, so it's unsafe to work like this.
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-21 12:21 ` Gao Xiang
2025-11-21 12:43 ` Gao Xiang
@ 2025-11-22 10:07 ` Sergey Senozhatsky
2025-11-22 12:24 ` Gao Xiang
1 sibling, 1 reply; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-22 10:07 UTC (permalink / raw)
To: Gao Xiang
Cc: Sergey Senozhatsky, Yuwen Chen, akpm, bgeffon, licayy,
linux-block, linux-kernel, linux-mm, minchan, richardycc
On (25/11/21 20:21), Gao Xiang wrote:
> > > > I think page-fault latency of a written-back page is expected to be
> > > > higher, that's a trade-off that we agree on. Off the top of my head,
> > > > I don't think we can do anything about it.
> > > >
> > > > Is loop device always used as for writeback targets?
> > >
> > > On the Android platform, currently only the loop device is supported as
> > > the backend for writeback, possibly for security reasons. I noticed that
> > > EROFS has implemented a CONFIG_EROFS_FS_BACKED_BY_FILE to reduce this
> > > latency. I think ZRAM might also be able to do this.
> >
> > I see. Do you use S/W or H/W compression?
>
> No, I'm pretty sure it's impossible for zram to access
> file I/Os without another thread context (e.g. workqueue),
> especially for write I/Os, which is unlike erofs:
>
> EROFS can do because EROFS is a specific filesystem, you
> could see it's a seperate fs, and it can only read (no
> write context) backing files in erofs and/or other fses,
> which is much like vfs/overlayfs read_iter() directly
> going into the backing fses without nested contexts.
> (Even if loop is used, it will create its own thread
> contexts with workqueues, which is safe.)
>
> In the other hand, zram/loop can act as a virtual block
> device which is rather different, which means you could
> format an ext4 filesystem and backing another ext4/btrfs,
> like this:
>
> zram(ext4) -> backing ext4/btrfs
>
> It's unsafe (in addition to GFP_NOIO allocation
> restriction) since zram cannot manage those ext4/btrfs
> existing contexts:
>
> - Take one detailed example, if the upper zram ext4
> assigns current->journal_info = xxx, and submit_bio() to
> zram, which will confuse the backing ext4 since it should
> assume current->journal_info == NULL, so the virtual block
> devices need another thread context to isolate those two
> different uncontrolled contexts.
>
> So I don't think it's feasible for block drivers to act
> like this, especially mixing with writing to backing fses
> operations.
Sorry, I don't completely understand your point, but backing
device is never expected to have any fs on it. So from your
email:
> zram(ext4) -> backing ext4/btrfs
This is not a valid configuration, as far as I'm concerned.
Unless I'm missing your point.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-22 10:07 ` Sergey Senozhatsky
@ 2025-11-22 12:24 ` Gao Xiang
2025-11-22 13:43 ` Sergey Senozhatsky
2025-11-23 0:22 ` Sergey Senozhatsky
0 siblings, 2 replies; 36+ messages in thread
From: Gao Xiang @ 2025-11-22 12:24 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Yuwen Chen, akpm, bgeffon, licayy, linux-block, linux-kernel,
linux-mm, minchan, richardycc
On 2025/11/22 18:07, Sergey Senozhatsky wrote:
> On (25/11/21 20:21), Gao Xiang wrote:
>>>>> I think page-fault latency of a written-back page is expected to be
>>>>> higher, that's a trade-off that we agree on. Off the top of my head,
>>>>> I don't think we can do anything about it.
>>>>>
>>>>> Is loop device always used as for writeback targets?
>>>>
>>>> On the Android platform, currently only the loop device is supported as
>>>> the backend for writeback, possibly for security reasons. I noticed that
>>>> EROFS has implemented a CONFIG_EROFS_FS_BACKED_BY_FILE to reduce this
>>>> latency. I think ZRAM might also be able to do this.
>>>
>>> I see. Do you use S/W or H/W compression?
>>
>> No, I'm pretty sure it's impossible for zram to access
>> file I/Os without another thread context (e.g. workqueue),
>> especially for write I/Os, which is unlike erofs:
>>
>> EROFS can do because EROFS is a specific filesystem, you
>> could see it's a seperate fs, and it can only read (no
>> write context) backing files in erofs and/or other fses,
>> which is much like vfs/overlayfs read_iter() directly
>> going into the backing fses without nested contexts.
>> (Even if loop is used, it will create its own thread
>> contexts with workqueues, which is safe.)
>>
>> In the other hand, zram/loop can act as a virtual block
>> device which is rather different, which means you could
>> format an ext4 filesystem and backing another ext4/btrfs,
>> like this:
>>
>> zram(ext4) -> backing ext4/btrfs
>>
>> It's unsafe (in addition to GFP_NOIO allocation
>> restriction) since zram cannot manage those ext4/btrfs
>> existing contexts:
>>
>> - Take one detailed example, if the upper zram ext4
>> assigns current->journal_info = xxx, and submit_bio() to
>> zram, which will confuse the backing ext4 since it should
>> assume current->journal_info == NULL, so the virtual block
>> devices need another thread context to isolate those two
>> different uncontrolled contexts.
>>
>> So I don't think it's feasible for block drivers to act
>> like this, especially mixing with writing to backing fses
>> operations.
>
> Sorry, I don't completely understand your point, but backing
> device is never expected to have any fs on it. So from your
> email:
zram(ext4) means zram device itself is formated as ext4.
>
>> zram(ext4) -> backing ext4/btrfs
>
> This is not a valid configuration, as far as I'm concerned.
> Unless I'm missing your point.
Why it's not valid? zram can be used as a regular virtual
block device, and format with any fs, and mount the zram
then.
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-22 12:24 ` Gao Xiang
@ 2025-11-22 13:43 ` Sergey Senozhatsky
2025-11-22 14:09 ` Gao Xiang
2025-11-23 0:22 ` Sergey Senozhatsky
1 sibling, 1 reply; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-22 13:43 UTC (permalink / raw)
To: Gao Xiang
Cc: Sergey Senozhatsky, Yuwen Chen, akpm, bgeffon, licayy,
linux-block, linux-kernel, linux-mm, minchan, richardycc
On (25/11/22 20:24), Gao Xiang wrote:
> zram(ext4) means zram device itself is formated as ext4.
>
> >
> > > zram(ext4) -> backing ext4/btrfs
> >
> > This is not a valid configuration, as far as I'm concerned.
> > Unless I'm missing your point.
>
> Why it's not valid? zram can be used as a regular virtual
> block device, and format with any fs, and mount the zram
> then.
I thought you were talking about the backing device being
ext4/btrfs. Sorry, I don't have enough context/knowledge
to understand what you're getting at. zram has been doing
writeback for ages, I really don't know what you mean by
"to act like this".
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-22 13:43 ` Sergey Senozhatsky
@ 2025-11-22 14:09 ` Gao Xiang
2025-11-23 0:08 ` Sergey Senozhatsky
0 siblings, 1 reply; 36+ messages in thread
From: Gao Xiang @ 2025-11-22 14:09 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Yuwen Chen, akpm, bgeffon, licayy, linux-block, linux-kernel,
linux-mm, minchan, richardycc
On 2025/11/22 21:43, Sergey Senozhatsky wrote:
> On (25/11/22 20:24), Gao Xiang wrote:
>> zram(ext4) means zram device itself is formated as ext4.
>>
>>>
>>>> zram(ext4) -> backing ext4/btrfs
>>>
>>> This is not a valid configuration, as far as I'm concerned.
>>> Unless I'm missing your point.
>>
>> Why it's not valid? zram can be used as a regular virtual
>> block device, and format with any fs, and mount the zram
>> then.
>
> I thought you were talking about the backing device being
> ext4/btrfs. Sorry, I don't have enough context/knowledge
> to understand what you're getting at. zram has been doing
> writeback for ages, I really don't know what you mean by
> "to act like this".
I mean, if zram is formatted as ext4, and then mount it;
and then there is a backing file which is also in another
ext4, you'd need a workqueue to do writeback I/Os (or need
a loop device to transit), was that the original question
raised by Yuwen?
If it's backed by a physical device rather than a file in
a filesystem, such potential problem doesn't exist.
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-22 14:09 ` Gao Xiang
@ 2025-11-23 0:08 ` Sergey Senozhatsky
2025-11-23 1:23 ` Gao Xiang
0 siblings, 1 reply; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-23 0:08 UTC (permalink / raw)
To: Gao Xiang
Cc: Sergey Senozhatsky, Yuwen Chen, akpm, bgeffon, licayy,
linux-block, linux-kernel, linux-mm, minchan, richardycc
On (25/11/22 22:09), Gao Xiang wrote:
> > I thought you were talking about the backing device being
> > ext4/btrfs. Sorry, I don't have enough context/knowledge
> > to understand what you're getting at. zram has been doing
> > writeback for ages, I really don't know what you mean by
> > "to act like this".
>
> I mean, if zram is formatted as ext4, and then mount it;
> and then there is a backing file which is also in another
> ext4, you'd need a workqueue to do writeback I/Os (or needs
> a loop device to transit), was that the original question
> raised by Yuwen?
We take pages of data from zram0 and write them straight to
the backing device. Those writes don't go through vfs/fs so
fs on the backing device will simply be corrupted, as far as
I can tell. This is not the intended use case for zram writeback.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-23 0:08 ` Sergey Senozhatsky
@ 2025-11-23 1:23 ` Gao Xiang
2025-11-23 3:07 ` Sergey Senozhatsky
0 siblings, 1 reply; 36+ messages in thread
From: Gao Xiang @ 2025-11-23 1:23 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Yuwen Chen, akpm, bgeffon, licayy, linux-block, linux-kernel,
linux-mm, minchan, richardycc
On 2025/11/23 08:08, Sergey Senozhatsky wrote:
> On (25/11/22 22:09), Gao Xiang wrote:
>>> I thought you were talking about the backing device being
>>> ext4/btrfs. Sorry, I don't have enough context/knowledge
>>> to understand what you're getting at. zram has been doing
>>> writeback for ages, I really don't know what you mean by
>>> "to act like this".
>>
>> I mean, if zram is formatted as ext4, and then mount it;
>> and then there is a backing file which is also in another
>> ext4, you'd need a workqueue to do writeback I/Os (or needs
>> a loop device to transit), was that the original question
>> raised by Yuwen?
>
> We take pages of data from zram0 and write them straight to
> the backing device. Those writes don't go through vfs/fs so
> fs on the backing device will simply be corrupted, as far as
> I can tell. This is not the intended use case for zram writeback.
I'm pretty sure you don't understand what I meant.
I won't reply this anymore, good luck.
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-23 1:23 ` Gao Xiang
@ 2025-11-23 3:07 ` Sergey Senozhatsky
0 siblings, 0 replies; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-23 3:07 UTC (permalink / raw)
To: Gao Xiang
Cc: Yuwen Chen, akpm, Brian Geffon, Fengyu Lian, linux-block,
Linux Kernel Mailing List, Linux MM, Minchan Kim, Richard Chang
[-- Attachment #1: Type: text/plain, Size: 147 bytes --]
On Sun, 23 Nov 2025, 10:23 am Gao Xiang, <hsiangkao@linux.alibaba.com>
wrote:
>
> I won't reply this anymore, good luck.
Cheers.
>
[-- Attachment #2: Type: text/html, Size: 758 bytes --]
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-22 12:24 ` Gao Xiang
2025-11-22 13:43 ` Sergey Senozhatsky
@ 2025-11-23 0:22 ` Sergey Senozhatsky
2025-11-23 1:39 ` Gao Xiang
1 sibling, 1 reply; 36+ messages in thread
From: Sergey Senozhatsky @ 2025-11-23 0:22 UTC (permalink / raw)
To: Gao Xiang
Cc: Sergey Senozhatsky, Yuwen Chen, akpm, bgeffon, licayy,
linux-block, linux-kernel, linux-mm, minchan, richardycc
On (25/11/22 20:24), Gao Xiang wrote:
> >
> > > zram(ext4) -> backing ext4/btrfs
> >
> > This is not a valid configuration, as far as I'm concerned.
> > Unless I'm missing your point.
>
> Why it's not valid? zram can be used as a regular virtual
> block device, and format with any fs, and mount the zram
> then.
If you want to move data between two filesystems, then just
mount both devices and cp/mv data between them. zram is not
going to do that for you, zram writeback is for different
purpose.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [RFC PATCHv5 0/6] zram: introduce writeback bio batching
2025-11-23 0:22 ` Sergey Senozhatsky
@ 2025-11-23 1:39 ` Gao Xiang
0 siblings, 0 replies; 36+ messages in thread
From: Gao Xiang @ 2025-11-23 1:39 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Yuwen Chen, akpm, bgeffon, licayy, linux-block, linux-kernel,
linux-mm, minchan, richardycc
On 2025/11/23 08:22, Sergey Senozhatsky wrote:
> On (25/11/22 20:24), Gao Xiang wrote:
>>>
>>>> zram(ext4) -> backing ext4/btrfs
>>>
>>> This is not a valid configuration, as far as I'm concerned.
>>> Unless I'm missing your point.
>>
>> Why it's not valid? zram can be used as a regular virtual
>> block device, and format with any fs, and mount the zram
>> then.
>
> If you want to move data between two filesystems, then just
> mount both devices and cp/mv data between them. zram is not
> going to do that for you, zram writeback is for different
> purpose.
No, I know what zram writeback is and I was definitely not
saying using zram writeback device to mount something (if
you have interest, just check out my first reply, it's
already clear. Also you can know why loop devices need a
workqueue or a kthread since pre-v2.6 in the first place
just because of the same reason). I want to stop here
because it's none of my business.
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 36+ messages in thread