* [PATCH 1/4] mm/readahead: fix the broken readahead for POSIX_FADV_WILLNEED
2025-12-01 21:01 [PATCH RESEND 0/4] improve fadvise(POSIX_FADV_WILLNEED) with large folio Jaegeuk Kim
@ 2025-12-01 21:01 ` Jaegeuk Kim
2025-12-01 21:24 ` [f2fs-dev] " Chao Yu
2025-12-01 21:01 ` [PATCH 2/4] mm/readahead: use page_cache_sync_ra for POSIX_FADV_WILLNEED Jaegeuk Kim
` (2 subsequent siblings)
3 siblings, 1 reply; 10+ messages in thread
From: Jaegeuk Kim @ 2025-12-01 21:01 UTC
To: linux-kernel, linux-f2fs-devel, linux-mm, Matthew Wilcox; +Cc: Jaegeuk Kim
This patch fixes the broken readahead flow for POSIX_FADV_WILLNEED: in
force_page_cache_ra(), nr_to_read is cut by the code below.
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
IOW, we cannot read ahead more than max_pages, which typically falls in
the 2MB to 16MB range. Note that it does not make sense to set
ra->ra_pages to the entire file size, so fix the chunking logic instead.
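For context, a minimal userspace sketch of the call that hits this path (the
file path is hypothetical; note posix_fadvise() returns the error number
instead of setting errno):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;
	int ret, fd = open("/mnt/f2fs/bigfile", O_RDONLY); /* e.g. a 4GB file */

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	/* Hint the kernel to read the whole file ahead of time. */
	ret = posix_fadvise(fd, 0, st.st_size, POSIX_FADV_WILLNEED);
	if (ret)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(ret));
	return ret ? 1 : 0;
}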
Before: the request is capped at max_pages and issued in 2MB (512-page) chunks, so only 8MB of the 4GB file is read ahead:
f2fs_fadvise: dev = (252,16), ino = 14, i_size = 4294967296 offset:0, len:4294967296, advise:3
page_cache_ra_unbounded: dev=252:16 ino=e index=0 nr_to_read=512 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=512 nr_to_read=512 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=1024 nr_to_read=512 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=1536 nr_to_read=512 lookahead_size=0
After: the entire request is read ahead in 8MB (2048-page) chunks:
f2fs_fadvise: dev = (252,16), ino = 14, i_size = 4294967296 offset:0, len:4294967296, advise:3
page_cache_ra_unbounded: dev=252:16 ino=e index=0 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=2048 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=4096 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=6144 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=8192 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=10240 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=12288 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=14336 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=16384 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=18432 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=20480 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=22528 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=24576 nr_to_read=2048 lookahead_size=0
...
page_cache_ra_unbounded: dev=252:16 ino=e index=1042432 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=1044480 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=1046528 nr_to_read=2048 lookahead_size=0
Cc: linux-mm@kvack.org
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
mm/readahead.c | 27 ++++++++++++---------------
1 file changed, 12 insertions(+), 15 deletions(-)
diff --git a/mm/readahead.c b/mm/readahead.c
index 3a4b5d58eeb6..c0db049a5b7b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -311,7 +311,7 @@ EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*/
-static void do_page_cache_ra(struct readahead_control *ractl,
+static int do_page_cache_ra(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct inode *inode = ractl->mapping->host;
@@ -320,45 +320,42 @@ static void do_page_cache_ra(struct readahead_control *ractl,
pgoff_t end_index; /* The last page we want to read */
if (isize == 0)
- return;
+ return -EINVAL;
end_index = (isize - 1) >> PAGE_SHIFT;
if (index > end_index)
- return;
+ return -EINVAL;
/* Don't read past the page containing the last byte of the file */
if (nr_to_read > end_index - index)
nr_to_read = end_index - index + 1;
page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
+ return 0;
}
/*
- * Chunk the readahead into 2 megabyte units, so that we don't pin too much
- * memory at once.
+ * Chunk the readahead per the block device capacity, and read all nr_to_read.
*/
void force_page_cache_ra(struct readahead_control *ractl,
unsigned long nr_to_read)
{
struct address_space *mapping = ractl->mapping;
- struct file_ra_state *ra = ractl->ra;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
- unsigned long max_pages;
+ unsigned long this_chunk;
if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead))
return;
/*
- * If the request exceeds the readahead window, allow the read to
- * be up to the optimal hardware IO size
+ * Consier the optimal hardware IO size for readahead chunk.
*/
- max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
- nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
+ this_chunk = max_t(unsigned long, bdi->io_pages, ractl->ra->ra_pages);
+
while (nr_to_read) {
- unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
+ this_chunk = min_t(unsigned long, this_chunk, nr_to_read);
- if (this_chunk > nr_to_read)
- this_chunk = nr_to_read;
- do_page_cache_ra(ractl, this_chunk, 0);
+ if (do_page_cache_ra(ractl, this_chunk, 0))
+ break;
nr_to_read -= this_chunk;
}
--
2.52.0.107.ga0afd4fd5b-goog
* Re: [f2fs-dev] [PATCH 1/4] mm/readahead: fix the broken readahead for POSIX_FADV_WILLNEED
2025-12-01 21:01 ` [PATCH 1/4] mm/readahead: fix the broken readahead for POSIX_FADV_WILLNEED Jaegeuk Kim
@ 2025-12-01 21:24 ` Chao Yu
0 siblings, 0 replies; 10+ messages in thread
From: Chao Yu @ 2025-12-01 21:24 UTC
To: Jaegeuk Kim, linux-kernel, linux-f2fs-devel, linux-mm, Matthew Wilcox
Cc: chao
On 2025/12/2 05:01, Jaegeuk Kim via Linux-f2fs-devel wrote:
> This patch fixes the broken readahead flow for POSIX_FADV_WILLNEED: in
> force_page_cache_ra(), nr_to_read is cut by the code below.
>
> max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
> nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
>
> IOW, we cannot read ahead more than max_pages, which typically falls in
> the 2MB to 16MB range. Note that it does not make sense to set
> ra->ra_pages to the entire file size, so fix the chunking logic instead.
>
> Before: the request is capped at max_pages and issued in 2MB (512-page) chunks, so only 8MB of the 4GB file is read ahead:
> f2fs_fadvise: dev = (252,16), ino = 14, i_size = 4294967296 offset:0, len:4294967296, advise:3
> page_cache_ra_unbounded: dev=252:16 ino=e index=0 nr_to_read=512 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=512 nr_to_read=512 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=1024 nr_to_read=512 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=1536 nr_to_read=512 lookahead_size=0
>
> After: the entire request is read ahead in 8MB (2048-page) chunks:
> f2fs_fadvise: dev = (252,16), ino = 14, i_size = 4294967296 offset:0, len:4294967296, advise:3
> page_cache_ra_unbounded: dev=252:16 ino=e index=0 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=2048 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=4096 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=6144 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=8192 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=10240 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=12288 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=14336 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=16384 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=18432 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=20480 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=22528 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=24576 nr_to_read=2048 lookahead_size=0
> ...
> page_cache_ra_unbounded: dev=252:16 ino=e index=1042432 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=1044480 nr_to_read=2048 lookahead_size=0
> page_cache_ra_unbounded: dev=252:16 ino=e index=1046528 nr_to_read=2048 lookahead_size=0
>
> Cc: linux-mm@kvack.org
> Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
> mm/readahead.c | 27 ++++++++++++---------------
> 1 file changed, 12 insertions(+), 15 deletions(-)
>
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 3a4b5d58eeb6..c0db049a5b7b 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -311,7 +311,7 @@ EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
> * behaviour which would occur if page allocations are causing VM writeback.
> * We really don't want to intermingle reads and writes like that.
> */
> -static void do_page_cache_ra(struct readahead_control *ractl,
> +static int do_page_cache_ra(struct readahead_control *ractl,
> unsigned long nr_to_read, unsigned long lookahead_size)
> {
> struct inode *inode = ractl->mapping->host;
> @@ -320,45 +320,42 @@ static void do_page_cache_ra(struct readahead_control *ractl,
> pgoff_t end_index; /* The last page we want to read */
>
> if (isize == 0)
> - return;
> + return -EINVAL;
>
> end_index = (isize - 1) >> PAGE_SHIFT;
> if (index > end_index)
> - return;
> + return -EINVAL;
> /* Don't read past the page containing the last byte of the file */
> if (nr_to_read > end_index - index)
> nr_to_read = end_index - index + 1;
>
> page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
> + return 0;
> }
>
> /*
> - * Chunk the readahead into 2 megabyte units, so that we don't pin too much
> - * memory at once.
> + * Chunk the readahead per the block device capacity, and read all nr_to_read.
> */
> void force_page_cache_ra(struct readahead_control *ractl,
> unsigned long nr_to_read)
> {
> struct address_space *mapping = ractl->mapping;
> - struct file_ra_state *ra = ractl->ra;
> struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
> - unsigned long max_pages;
> + unsigned long this_chunk;
>
> if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead))
> return;
>
> /*
> - * If the request exceeds the readahead window, allow the read to
> - * be up to the optimal hardware IO size
> + * Consier the optimal hardware IO size for readahead chunk.
s/Consier/Consider
Thanks,
> */
> - max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
> - nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
> + this_chunk = max_t(unsigned long, bdi->io_pages, ractl->ra->ra_pages);
> +
> while (nr_to_read) {
> - unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
> + this_chunk = min_t(unsigned long, this_chunk, nr_to_read);
>
> - if (this_chunk > nr_to_read)
> - this_chunk = nr_to_read;
> - do_page_cache_ra(ractl, this_chunk, 0);
> + if (do_page_cache_ra(ractl, this_chunk, 0))
> + break;
>
> nr_to_read -= this_chunk;
> }
* [PATCH 2/4] mm/readahead: use page_cache_sync_ra for POSIX_FADV_WILLNEED
2025-12-01 21:01 [PATCH RESEND 0/4] improve fadvise(POSIX_FADV_WILLNEED) with large folio Jaegeuk Kim
2025-12-01 21:01 ` [PATCH 1/4] mm/readahead: fix the broken readahead for POSIX_FADV_WILLNEED Jaegeuk Kim
@ 2025-12-01 21:01 ` Jaegeuk Kim
2025-12-01 21:01 ` [PATCH 3/4] mm/readahead: add a_ops->ra_folio_order to get a desired folio order Jaegeuk Kim
2025-12-01 21:01 ` [PATCH 4/4] f2fs: attach a_ops->ra_folio_order to allocate large folios for readahead Jaegeuk Kim
3 siblings, 0 replies; 10+ messages in thread
From: Jaegeuk Kim @ 2025-12-01 21:01 UTC
To: linux-kernel, linux-f2fs-devel, linux-mm, Matthew Wilcox; +Cc: Jaegeuk Kim
This patch replaces page_cache_ra_unbounded() with page_cache_sync_ra() in
fadvise(POSIX_FADV_WILLNEED) to support large folios.
Before:
f2fs_fadvise: dev = (252,16), ino = 14, i_size = 4294967296 offset:0, len:4294967296, advise:3
page_cache_ra_unbounded: dev=252:16 ino=e index=0 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=2048 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=4096 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=6144 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=8192 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=10240 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=12288 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=14336 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=16384 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=18432 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=20480 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=22528 nr_to_read=2048 lookahead_size=0
page_cache_ra_unbounded: dev=252:16 ino=e index=24576 nr_to_read=2048 lookahead_size=0
...
page_cache_ra_unbounded: dev=252:16 ino=e index=1042432 nr_to_read=2048 lookahead_size=0
These are all zero-order page allocations.
After (order=0 by default):
f2fs_fadvise: dev = (252,16), ino = 14, i_size = 4294967296 offset:0, len:4294967296, advise:3
page_cache_sync_ra: dev=252:16 ino=e index=0 req_count=2048 order=0 size=0 async_size=0 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=0 order=0 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=2048 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=2048 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=4096 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=4096 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=6144 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=6144 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=8192 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=8192 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=10240 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=10240 nr_to_read=2048 lookahead_size=0
...
page_cache_ra_unbounded: dev=252:16 ino=e index=1042432 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=1044480 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=1044480 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=1046528 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=1046528 nr_to_read=2048 lookahead_size=0
Cc: linux-mm@kvack.org
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
mm/readahead.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/mm/readahead.c b/mm/readahead.c
index c0db049a5b7b..5beaf7803554 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -340,6 +340,7 @@ void force_page_cache_ra(struct readahead_control *ractl,
unsigned long nr_to_read)
{
struct address_space *mapping = ractl->mapping;
+ struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long this_chunk;
@@ -352,11 +353,19 @@ void force_page_cache_ra(struct readahead_control *ractl,
this_chunk = max_t(unsigned long, bdi->io_pages, ractl->ra->ra_pages);
while (nr_to_read) {
- this_chunk = min_t(unsigned long, this_chunk, nr_to_read);
+ unsigned long index = readahead_index(ractl);
+ pgoff_t end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- if (do_page_cache_ra(ractl, this_chunk, 0))
+ if (index > end_index)
break;
+ if (nr_to_read > end_index - index)
+ nr_to_read = end_index - index + 1;
+
+ this_chunk = min_t(unsigned long, this_chunk, nr_to_read);
+
+ page_cache_sync_ra(ractl, this_chunk);
+
nr_to_read -= this_chunk;
}
}
@@ -573,7 +582,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
/* be dumb */
if (do_forced_ra) {
- force_page_cache_ra(ractl, req_count);
+ do_page_cache_ra(ractl, req_count, 0);
return;
}
--
2.52.0.107.ga0afd4fd5b-goog
* [PATCH 3/4] mm/readahead: add a_ops->ra_folio_order to get a desired folio order
2025-12-01 21:01 [PATCH RESEND 0/4] improve fadvise(POSIX_FADV_WILLNEED) with large folio Jaegeuk Kim
2025-12-01 21:01 ` [PATCH 1/4] mm/readahead: fix the broken readahead for POSIX_FADV_WILLNEED Jaegeuk Kim
2025-12-01 21:01 ` [PATCH 2/4] mm/readahead: use page_cache_sync_ra for POSIX_FADV_WILLNEED Jaegeuk Kim
@ 2025-12-01 21:01 ` Jaegeuk Kim
2025-12-01 21:20 ` Matthew Wilcox
2025-12-01 21:01 ` [PATCH 4/4] f2fs: attach a_ops->ra_folio_order to allocate large folios for readahead Jaegeuk Kim
3 siblings, 1 reply; 10+ messages in thread
From: Jaegeuk Kim @ 2025-12-01 21:01 UTC
To: linux-kernel, linux-f2fs-devel, linux-mm, Matthew Wilcox; +Cc: Jaegeuk Kim
This patch introduces a new address space operation, a_ops->ra_folio_order(),
which lets the filesystem propose a folio order to page_cache_sync_ra().
Hence, each filesystem can set the desired minimum folio allocation order
when handling fadvise(POSIX_FADV_WILLNEED).
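As an illustration, a minimal sketch of how a filesystem could wire up the
hook (the myfs_* names are hypothetical; patch 4/4 carries the real f2fs
implementation):

/* Propose at least order-4 (64KB with 4KB pages) folios for readahead. */
static unsigned int myfs_ra_folio_order(struct address_space *mapping,
					unsigned int order)
{
	if (!mapping_large_folio_support(mapping))
		return order;
	return max(order, 4U);
}

static const struct address_space_operations myfs_aops = {
	/* other operations elided */
	.ra_folio_order	= myfs_ra_folio_order,
};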
Cc: linux-mm@kvack.org
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
include/linux/fs.h | 4 ++++
include/linux/pagemap.h | 12 ++++++++++++
mm/readahead.c | 6 ++++--
3 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..ddab68b7e03b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -472,6 +472,10 @@ struct address_space_operations {
void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb);
int (*error_remove_folio)(struct address_space *, struct folio *);
+ /* Min folio order to allocate pages. */
+ unsigned int (*ra_folio_order)(struct address_space *mapping,
+ unsigned int order);
+
/* swapfile support */
int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
sector_t *span);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09b581c1d878..e1fe07477220 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -476,6 +476,18 @@ mapping_min_folio_order(const struct address_space *mapping)
return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN;
}
+static inline unsigned int
+mapping_ra_folio_order(struct address_space *mapping, unsigned int order)
+{
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return 0;
+
+ if (!mapping->a_ops->ra_folio_order)
+ return order;
+
+ return mapping->a_ops->ra_folio_order(mapping, order);
+}
+
static inline unsigned long
mapping_min_folio_nrpages(const struct address_space *mapping)
{
diff --git a/mm/readahead.c b/mm/readahead.c
index 5beaf7803554..8c7d08af6e00 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -592,8 +592,10 @@ void page_cache_sync_ra(struct readahead_control *ractl,
* A start of file, oversized read, or sequential cache miss:
* trivial case: (index - prev_index) == 1
* unaligned reads: (index - prev_index) == 0
+ * if filesystem sets high-order allocation
*/
- if (!index || req_count > max_pages || index - prev_index <= 1UL) {
+ if (!index || req_count > max_pages || index - prev_index <= 1UL ||
+ mapping_ra_folio_order(ractl->mapping, 0)) {
ra->start = index;
ra->size = get_init_ra_size(req_count, max_pages);
ra->async_size = ra->size > req_count ? ra->size - req_count :
@@ -627,7 +629,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
ra->size = min(contig_count + req_count, max_pages);
ra->async_size = 1;
readit:
- ra->order = 0;
+ ra->order = mapping_ra_folio_order(ractl->mapping, 0);
ractl->_index = ra->start;
page_cache_ra_order(ractl, ra);
}
--
2.52.0.107.ga0afd4fd5b-goog
* Re: [PATCH 3/4] mm/readahead: add a_ops->ra_folio_order to get a desired folio order
2025-12-01 21:01 ` [PATCH 3/4] mm/readahead: add a_ops->ra_folio_order to get a desired folio order Jaegeuk Kim
@ 2025-12-01 21:20 ` Matthew Wilcox
2025-12-01 21:24 ` Jaegeuk Kim
0 siblings, 1 reply; 10+ messages in thread
From: Matthew Wilcox @ 2025-12-01 21:20 UTC
To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel, linux-mm
On Mon, Dec 01, 2025 at 09:01:26PM +0000, Jaegeuk Kim wrote:
> This patch introduces a new address space operation, a_ops->ra_folio_order(),
> which lets the filesystem propose a folio order to page_cache_sync_ra().
>
> Hence, each filesystem can set the desired minimum folio allocation order
> when handling fadvise(POSIX_FADV_WILLNEED).
Again, you've said what but not why. Does the mm code not ramp up the
folio order sufficiently quickly? What are you trying to accomplish?
* Re: [PATCH 3/4] mm/readahead: add a_ops->ra_folio_order to get a desired folio order
2025-12-01 21:20 ` Matthew Wilcox
@ 2025-12-01 21:24 ` Jaegeuk Kim
2025-12-01 21:38 ` Matthew Wilcox
0 siblings, 1 reply; 10+ messages in thread
From: Jaegeuk Kim @ 2025-12-01 21:24 UTC
To: Matthew Wilcox; +Cc: linux-kernel, linux-f2fs-devel, linux-mm
On 12/01, Matthew Wilcox wrote:
> On Mon, Dec 01, 2025 at 09:01:26PM +0000, Jaegeuk Kim wrote:
> > This patch introduces a new address space operation, a_ops->ra_folio_order(),
> > which lets the filesystem propose a folio order to page_cache_sync_ra().
> >
> > Hence, each filesystem can set the desired minimum folio allocation order
> > when handling fadvise(POSIX_FADV_WILLNEED).
>
> Again, you've said what but not why. Does the mm code not ramp up the
> folio order sufficiently quickly? What are you trying to accomplish?
That's why I posted this as a patch series, to provide more details. Could you
please check the last patch in the series, which shows that fadvise() does not
increase the folio order?
https://lore.kernel.org/linux-f2fs-devel/20251201210152.909339-5-jaegeuk@kernel.org/T/#u
* Re: [PATCH 3/4] mm/readahead: add a_ops->ra_folio_order to get a desired folio order
2025-12-01 21:24 ` Jaegeuk Kim
@ 2025-12-01 21:38 ` Matthew Wilcox
2025-12-02 1:33 ` Jaegeuk Kim
0 siblings, 1 reply; 10+ messages in thread
From: Matthew Wilcox @ 2025-12-01 21:38 UTC
To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel, linux-mm
On Mon, Dec 01, 2025 at 09:24:41PM +0000, Jaegeuk Kim wrote:
> On 12/01, Matthew Wilcox wrote:
> > On Mon, Dec 01, 2025 at 09:01:26PM +0000, Jaegeuk Kim wrote:
> > > This patch introduces a new address space operation, a_ops->ra_folio_order(),
> > > which lets the filesystem propose a folio order to page_cache_sync_ra().
> > >
> > > Hence, each filesystem can set the desired minimum folio allocation order
> > > when handling fadvise(POSIX_FADV_WILLNEED).
> >
> > Again, you've said what but not why. Does the mm code not ramp up the
> > folio order sufficiently quickly? What are you trying to accomplish?
>
> That's why I posted this as a patch series, to provide more details. Could you
> please check the last patch in the series, which shows that fadvise() does not
> increase the folio order?
>
> https://lore.kernel.org/linux-f2fs-devel/20251201210152.909339-5-jaegeuk@kernel.org/T/#u
So what you're trying to say is that readahead() currently only uses
order-0 pages and you want it to use larger order pages? I agree with
that! But I don't think this is the way to do it. We should just use
larger order allocations, always. None of this "call the filesystem,
check a sysfs parameter". Just use the largest order page that fits.
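A rough sketch of that direction (illustrative only, not code from this
thread): derive the order from the request itself, with no per-filesystem
hook, and let the existing page_cache_ra_order() path reduce it further for
index alignment and EOF:

/* Illustrative: the largest page cache order that fits nr_to_read. */
static unsigned int ra_order_that_fits(unsigned long nr_to_read)
{
	unsigned int order = nr_to_read ? ilog2(nr_to_read) : 0;

	return min_t(unsigned int, order, MAX_PAGECACHE_ORDER);
}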
* Re: [PATCH 3/4] mm/readahead: add a_ops->ra_folio_order to get a desired folio order
2025-12-01 21:38 ` Matthew Wilcox
@ 2025-12-02 1:33 ` Jaegeuk Kim
0 siblings, 0 replies; 10+ messages in thread
From: Jaegeuk Kim @ 2025-12-02 1:33 UTC
To: Matthew Wilcox; +Cc: linux-kernel, linux-f2fs-devel, linux-mm
On 12/01, Matthew Wilcox wrote:
> On Mon, Dec 01, 2025 at 09:24:41PM +0000, Jaegeuk Kim wrote:
> > On 12/01, Matthew Wilcox wrote:
> > > On Mon, Dec 01, 2025 at 09:01:26PM +0000, Jaegeuk Kim wrote:
> > > > This patch introduces a new address space operation, a_ops->ra_folio_order(),
> > > > which lets the filesystem propose a folio order to page_cache_sync_ra().
> > > >
> > > > Hence, each filesystem can set the desired minimum folio allocation order
> > > > when handling fadvise(POSIX_FADV_WILLNEED).
> > >
> > > Again, you've said what but not why. Does the mm code not ramp up the
> > > folio order sufficiently quickly? What are you trying to accomplish?
> >
> > That's why I posted this as a patch series, to provide more details. Could you
> > please check the last patch in the series, which shows that fadvise() does not
> > increase the folio order?
> >
> > https://lore.kernel.org/linux-f2fs-devel/20251201210152.909339-5-jaegeuk@kernel.org/T/#u
>
> So what you're trying to say is that readahead() currently only uses
> order-0 pages and you want it to use larger order pages? I agree with
> that! But I don't think this is the way to do it. We should just use
> larger order allocations, always. None of this "call the filesystem,
> check a sysfs parameter". Just use the largest order page that fits.
I got that, and posted v2.
Thanks,
* [PATCH 4/4] f2fs: attach a_ops->ra_folio_order to allocate large folios for readahead
2025-12-01 21:01 [PATCH RESEND 0/4] improve fadvise(POSIX_FADV_WILLNEED) with large folio Jaegeuk Kim
` (2 preceding siblings ...)
2025-12-01 21:01 ` [PATCH 3/4] mm/readahead: add a_ops->ra_folio_order to get a desired folio order Jaegeuk Kim
@ 2025-12-01 21:01 ` Jaegeuk Kim
3 siblings, 0 replies; 10+ messages in thread
From: Jaegeuk Kim @ 2025-12-01 21:01 UTC
To: linux-kernel, linux-f2fs-devel, linux-mm, Matthew Wilcox; +Cc: Jaegeuk Kim
This patch adds a sysfs entry to change the readahead folio order. Given
ra_folio_order=9, page_cache_ra_order() receives order=9 when readahead is
submitted, as the traces below show.
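First, a usage sketch of setting the knob (the device name vdb is
hypothetical; the attribute sits under the usual /sys/fs/f2fs/<dev>/
directory, and order-9 means 2MB folios with 4KB pages):

#include <stdio.h>

int main(void)
{
	/* Request at least order-9 readahead folios on this f2fs mount. */
	FILE *f = fopen("/sys/fs/f2fs/vdb/ra_folio_order", "w");

	if (!f)
		return 1;
	fprintf(f, "9\n");
	return fclose(f) ? 1 : 0;
}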
==== folio_order=0 ====
f2fs_fadvise: dev = (252,16), ino = 14, i_size = 4294967296 offset:0, len:4294967296, advise:3
page_cache_sync_ra: dev=252:16 ino=e index=0 req_count=2048 order=0 size=0 async_size=0 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=0 order=0 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=2048 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=2048 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=4096 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=4096 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=6144 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=6144 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=8192 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=8192 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=10240 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=10240 nr_to_read=2048 lookahead_size=0
...
page_cache_ra_unbounded: dev=252:16 ino=e index=1042432 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=1044480 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=1044480 nr_to_read=2048 lookahead_size=0
page_cache_sync_ra: dev=252:16 ino=e index=1046528 req_count=2048 order=0 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_unbounded: dev=252:16 ino=e index=1046528 nr_to_read=2048 lookahead_size=0
==== folio_order=9 ====
f2fs_fadvise: dev = (252,16), ino = 14, i_size = 4294967296 offset:0, len:4294967296, advise:3
page_cache_sync_ra: dev=252:16 ino=e index=0 req_count=2048 order=0 size=0 async_size=0 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=0 order=9 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=2048 req_count=2048 order=9 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=2048 order=9 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=4096 req_count=2048 order=9 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=4096 order=9 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=6144 req_count=2048 order=9 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=6144 order=9 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=8192 req_count=2048 order=9 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=8192 order=9 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=10240 req_count=2048 order=9 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=10240 order=9 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=12288 req_count=2048 order=9 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
...
page_cache_sync_ra: dev=252:16 ino=e index=1040384 req_count=2048 order=9 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=1040384 order=9 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=1042432 req_count=2048 order=9 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=1042432 order=9 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=1044480 req_count=2048 order=9 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=1044480 order=9 size=2048 async_size=1024 ra_pages=2048
page_cache_sync_ra: dev=252:16 ino=e index=1046528 req_count=2048 order=9 size=2048 async_size=1024 ra_pages=2048 mmap_miss=0 prev_pos=-1
page_cache_ra_order: dev=252:16 ino=e index=1046528 order=9 size=2048 async_size=1024 ra_pages=2048
Cc: linux-mm@kvack.org
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
fs/f2fs/data.c | 9 +++++++++
fs/f2fs/f2fs.h | 3 +++
fs/f2fs/super.c | 1 +
fs/f2fs/sysfs.c | 9 +++++++++
4 files changed, 22 insertions(+)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7a4f0f2d60cf..addef5a1fdb1 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -3995,6 +3995,14 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
return false;
}
+static unsigned int f2fs_ra_folio_order(struct address_space *mapping,
+ unsigned int order)
+{
+ if (!mapping_large_folio_support(mapping))
+ return order;
+
+ return max(order, F2FS_M_SB(mapping)->ra_folio_order);
+}
static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block)
{
@@ -4313,6 +4321,7 @@ const struct address_space_operations f2fs_dblock_aops = {
.dirty_folio = f2fs_dirty_data_folio,
.migrate_folio = filemap_migrate_folio,
.invalidate_folio = f2fs_invalidate_folio,
+ .ra_folio_order = f2fs_ra_folio_order,
.release_folio = f2fs_release_folio,
.bmap = f2fs_bmap,
.swap_activate = f2fs_swap_activate,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d7600979218e..06f90d510a01 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1932,6 +1932,9 @@ struct f2fs_sb_info {
/* carve out reserved_blocks from total blocks */
bool carve_out;
+ /* enable large folio for readahead. */
+ unsigned int ra_folio_order;
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ccb477086444..bae02ca96c1f 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -4287,6 +4287,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
NAT_ENTRY_PER_BLOCK));
sbi->allocate_section_hint = le32_to_cpu(raw_super->section_count);
sbi->allocate_section_policy = ALLOCATE_FORWARD_NOHINT;
+ sbi->ra_folio_order = 0;
F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino);
F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index c42f4f979d13..2537a25986a6 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -906,6 +906,13 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
return count;
}
+ if (!strcmp(a->attr.name, "ra_folio_order")) {
+ if (t < 0 || t > MAX_PAGECACHE_ORDER)
+ return -EINVAL;
+ sbi->ra_folio_order = t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -1180,6 +1187,7 @@ F2FS_SBI_GENERAL_RW_ATTR(migration_window_granularity);
F2FS_SBI_GENERAL_RW_ATTR(dir_level);
F2FS_SBI_GENERAL_RW_ATTR(allocate_section_hint);
F2FS_SBI_GENERAL_RW_ATTR(allocate_section_policy);
+F2FS_SBI_GENERAL_RW_ATTR(ra_folio_order);
#ifdef CONFIG_F2FS_IOSTAT
F2FS_SBI_GENERAL_RW_ATTR(iostat_enable);
F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms);
@@ -1422,6 +1430,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reserved_pin_section),
ATTR_LIST(allocate_section_hint),
ATTR_LIST(allocate_section_policy),
+ ATTR_LIST(ra_folio_order),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
--
2.52.0.107.ga0afd4fd5b-goog