From: Johannes Weiner <hannes@cmpxchg.org>
To: Nhat Pham <nphamcs@gmail.com>
Cc: akpm@linux-foundation.org, yosryahmed@google.com,
chengming.zhou@linux.dev, linux-mm@kvack.org,
kernel-team@meta.com, linux-kernel@vger.kernel.org
Subject: Re: [PATCH] zswap: do not crash the kernel on decompression failure
Date: Tue, 25 Feb 2025 19:51:49 -0500
Message-ID: <20250226005149.GA1500140@cmpxchg.org>
In-Reply-To: <20250225213200.729056-1-nphamcs@gmail.com>
On Tue, Feb 25, 2025 at 01:32:00PM -0800, Nhat Pham wrote:
> Currently, we crash the kernel when a decompression failure occurs in
> zswap (either because of memory corruption, or a bug in the compression
> algorithm). This is overkill. We should only SIGBUS the unfortunate
> process asking for the zswap entry on zswap load, and skip the corrupted
> entry in zswap writeback.
>
> See [1] for a recent upstream discussion about this.
>
> [1]: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@casper.infradead.org/
>
> Suggested-by: Matthew Wilcox <willy@infradead.org>
> Suggested-by: Yosry Ahmed <yosryahmed@google.com>
> Signed-off-by: Nhat Pham <nphamcs@gmail.com>
> ---
> mm/zswap.c | 85 +++++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 58 insertions(+), 27 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index f6316b66fb23..31d4397eed61 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail;
> static u64 zswap_reject_compress_fail;
> /* Compressed page was too big for the allocator to (optimally) store */
> static u64 zswap_reject_compress_poor;
> +/* Load and writeback failed due to decompression failure */
> +static u64 zswap_reject_decompress_fail;
"reject" refers to "rejected store", so the name doesn't quite make
sense. zswap_decompress_fail?
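E.g. something like this (just a sketch of the rename; the matching
debugfs entry would want the same name):

	/* Load or writeback failed due to decompression failure */
	static u64 zswap_decompress_fail;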
> /* Store failed because underlying allocator could not get memory */
> static u64 zswap_reject_alloc_fail;
> /* Store failed because the entry metadata could not be allocated (rare) */
> @@ -953,11 +955,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> return comp_ret == 0 && alloc_ret == 0;
> }
>
> -static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> {
> struct zpool *zpool = entry->pool->zpool;
> struct scatterlist input, output;
> struct crypto_acomp_ctx *acomp_ctx;
> + bool ret = true;
> u8 *src;
>
> acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> @@ -984,12 +987,19 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> sg_init_table(&output, 1);
> sg_set_folio(&output, folio, PAGE_SIZE, 0);
> acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
> - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
> - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
> + if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait) ||
> + acomp_ctx->req->dlen != PAGE_SIZE) {
> + ret = false;
> + zswap_reject_decompress_fail++;
> + pr_alert_ratelimited(
> + "decompression failed on zswap entry with offset %08lx\n",
> + entry->swpentry.val);
Since this is a pretty dramatic failure scenario, IMO it would be
useful to dump as much info as possible.

The exact return value of crypto_wait_req() could be useful, and
entry->length and req->dlen too. entry->pool->tfm_name, just to make
absolutely sure there is no confusion, as the compressor can be
switched for new stores. Maybe swp_type() and swp_offset() of
entry->swpentry? Those could be easy markers to see if the entry was
corrupted, for example.
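Roughly something like this (untested sketch; err/dlen would be new
locals, and this assumes the counter rename suggested above):

	err = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req),
			      &acomp_ctx->wait);
	dlen = acomp_ctx->req->dlen;
	if (err || dlen != PAGE_SIZE) {
		ret = false;
		zswap_decompress_fail++;
		pr_alert_ratelimited("zswap: decompression failed with %d (comp %s, type %u, offset %lu, len %u, dlen %u)\n",
				     err, entry->pool->tfm_name,
				     swp_type(entry->swpentry),
				     swp_offset(entry->swpentry),
				     entry->length, dlen);
	}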
> + }
> mutex_unlock(&acomp_ctx->mutex);
>
> if (src != acomp_ctx->buffer)
> zpool_unmap_handle(zpool, entry->handle);
> + return ret;
> }
>
> /*********************************
> @@ -1018,6 +1028,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> struct writeback_control wbc = {
> .sync_mode = WB_SYNC_NONE,
> };
> + int ret = 0;
>
> /* try to allocate swap cache folio */
> mpol = get_task_policy(current);
> @@ -1034,8 +1045,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> * and freed when invalidated by the concurrent shrinker anyway.
> */
> if (!folio_was_allocated) {
> - folio_put(folio);
> - return -EEXIST;
> + ret = -EEXIST;
> + goto put_folio;
> }
>
> /*
> @@ -1048,14 +1059,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> * be dereferenced.
> */
> tree = swap_zswap_tree(swpentry);
> - if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
> - delete_from_swap_cache(folio);
> - folio_unlock(folio);
> - folio_put(folio);
> - return -ENOMEM;
> + if (entry != xa_load(tree, offset)) {
> + ret = -ENOMEM;
> + goto fail;
> }
>
> - zswap_decompress(entry, folio);
> + if (!zswap_decompress(entry, folio)) {
> + ret = -EIO;
> + goto fail;
> + }
> +
> + xa_erase(tree, offset);
>
> count_vm_event(ZSWPWB);
> if (entry->objcg)
> @@ -1071,9 +1085,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>
> /* start writeback */
> __swap_writepage(folio, &wbc);
> - folio_put(folio);
> + goto put_folio;
>
> - return 0;
> +fail:
> + delete_from_swap_cache(folio);
> + folio_unlock(folio);
> +put_folio:
> + folio_put(folio);
> + return ret;
> }
Nice, yeah it's time to factor out the error unwinding. If you write
it like this, you can save a jump in the main sequence:
	__swap_writepage(folio, &wbc);
	ret = 0;
put:
	folio_put(folio);
	return ret;

delete_unlock:
	delete_from_swap_cache(folio);
	folio_unlock(folio);
	goto put;
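The failure sites would then read something like (sketch):

	if (!folio_was_allocated) {
		ret = -EEXIST;
		goto put;
	}
	...
	if (entry != xa_load(tree, offset)) {
		ret = -ENOMEM;
		goto delete_unlock;
	}

	if (!zswap_decompress(entry, folio)) {
		ret = -EIO;
		goto delete_unlock;
	}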
>
> /*********************************
> @@ -1600,6 +1619,29 @@ bool zswap_load(struct folio *folio)
> if (WARN_ON_ONCE(folio_test_large(folio)))
> return true;
>
> + /*
> + * We cannot invalidate the zswap entry before decompressing it. If
> + * decompression fails, we must keep the entry in the tree so that
> + * a future read by another process on the same swap entry will also
> + * have to go through zswap. Otherwise, we risk silently reading
> + * corrupted data for the other process.
> + */
> + entry = xa_load(tree, offset);
> + if (!entry)
> + return false;
The explanation in the comment makes sense in the context of this
change. But fresh eyes reading this function and having never seen
that this *used to* open with xa_erase() will be confused. It answers
questions the reader doesn't have at this point - it's just a function
called zswap_load() beginning with an xa_load(), so what?

At first I was going to suggest moving it down to the swapcache
branch. But honestly after reading *that* comment again, in the new
sequence, I don't think the question will arise there either. It's
pretty self-evident that the whole "we can invalidate when reading
into the swapcache" does not apply if the read failed.
> + /*
> + * If decompression fails, we return true to notify the caller that the
> + * folio's data were in zswap, but do not mark the folio as up-to-date.
> + * This will effectively SIGBUS the calling process.
> + */
It would be good to put a lampshade on the weirdness that the return
value has nothing to do with success or failure. This wasn't as
important a distinction before, but with actual decompression failures
I think it is. Something like this?
	if (!zswap_decompress(entry, folio)) {
		/*
		 * The zswap_load() return value doesn't indicate success or
		 * failure, but whether zswap owns the swapped out contents.
		 * This MUST return true here, otherwise swap_readpage() will
		 * read garbage from the backend.
		 *
		 * Success is signaled by marking the folio uptodate.
		 */
		return true;
	}

	folio_mark_uptodate(folio);
> + if (!zswap_decompress(entry, folio))
> + return true;
> +
> + count_vm_event(ZSWPIN);
> + if (entry->objcg)
> + count_objcg_events(entry->objcg, ZSWPIN, 1);
> +
> /*
> * When reading into the swapcache, invalidate our entry. The
> * swapcache can be the authoritative owner of the page and
> @@ -1612,21 +1654,8 @@ bool zswap_load(struct folio *folio)
> * files, which reads into a private page and may free it if
> * the fault fails. We remain the primary owner of the entry.)
> */
> - if (swapcache)
> - entry = xa_erase(tree, offset);
> - else
> - entry = xa_load(tree, offset);
> -
> - if (!entry)
> - return false;
> -
> - zswap_decompress(entry, folio);
> -
> - count_vm_event(ZSWPIN);
> - if (entry->objcg)
> - count_objcg_events(entry->objcg, ZSWPIN, 1);
> -
> if (swapcache) {
> + xa_erase(tree, offset);
> zswap_entry_free(entry);
> folio_mark_dirty(folio);
> }