linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Dan Williams <dan.j.williams@intel.com>
To: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Cc: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	linux-xfs <linux-xfs@vger.kernel.org>,
	 Linux NVDIMM <nvdimm@lists.linux.dev>,
	Linux MM <linux-mm@kvack.org>,
	 linux-fsdevel <linux-fsdevel@vger.kernel.org>,
	"Darrick J. Wong" <djwong@kernel.org>,
	 david <david@fromorbit.com>,
	Christoph Hellwig <hch@infradead.org>,
	Jane Chu <jane.chu@oracle.com>
Subject: Re: [PATCH v10 8/9] xfs: Implement ->notify_failure() for XFS
Date: Tue, 15 Feb 2022 17:56:51 -0800	[thread overview]
Message-ID: <CAPcyv4hBpHsPRXZKtHtN0hVQhjZspZBz9egO=wn+54KDJokStw@mail.gmail.com> (raw)
In-Reply-To: <20220127124058.1172422-9-ruansy.fnst@fujitsu.com>

On Thu, Jan 27, 2022 at 4:41 AM Shiyang Ruan <ruansy.fnst@fujitsu.com> wrote:
>
> Introduce xfs_notify_failure.c to handle failure related works, such as
> implement ->notify_failure(), register/unregister dax holder in xfs, and
> so on.
>
> If the rmap feature of XFS enabled, we can query it to find files and
> metadata which are associated with the corrupt data.  For now all we do
> is kill processes with that file mapped into their address spaces, but
> future patches could actually do something about corrupt metadata.
>
> After that, the memory failure needs to notify the processes who are
> using those files.
>
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>  fs/xfs/Makefile             |   1 +
>  fs/xfs/xfs_buf.c            |  12 ++
>  fs/xfs/xfs_fsops.c          |   3 +
>  fs/xfs/xfs_mount.h          |   1 +
>  fs/xfs/xfs_notify_failure.c | 222 ++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_notify_failure.h |  10 ++
>  6 files changed, 249 insertions(+)
>  create mode 100644 fs/xfs/xfs_notify_failure.c
>  create mode 100644 fs/xfs/xfs_notify_failure.h
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 04611a1068b4..389970b3e13b 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -84,6 +84,7 @@ xfs-y                         += xfs_aops.o \
>                                    xfs_message.o \
>                                    xfs_mount.o \
>                                    xfs_mru_cache.o \
> +                                  xfs_notify_failure.o \
>                                    xfs_pwork.o \
>                                    xfs_reflink.o \
>                                    xfs_stats.o \
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index b45e0d50a405..017010b3d601 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -19,6 +19,7 @@
>  #include "xfs_errortag.h"
>  #include "xfs_error.h"
>  #include "xfs_ag.h"
> +#include "xfs_notify_failure.h"
>
>  static struct kmem_cache *xfs_buf_cache;
>
> @@ -1892,6 +1893,8 @@ xfs_free_buftarg(
>         list_lru_destroy(&btp->bt_lru);
>
>         blkdev_issue_flush(btp->bt_bdev);
> +       if (btp->bt_daxdev)
> +               dax_unregister_holder(btp->bt_daxdev);
>         fs_put_dax(btp->bt_daxdev);
>
>         kmem_free(btp);
> @@ -1946,6 +1949,15 @@ xfs_alloc_buftarg(
>         btp->bt_dev =  bdev->bd_dev;
>         btp->bt_bdev = bdev;
>         btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
> +       if (btp->bt_daxdev) {
> +               if (dax_get_holder(btp->bt_daxdev)) {
> +                       xfs_err(mp, "DAX device already in use?!");

Per the earlier feedback, this check can be done atomically inside
dax_register_holder() with cmpxchg(), rather than with a separate
dax_get_holder() call beforehand.

> +                       goto error_free;
> +               }
> +
> +               dax_register_holder(btp->bt_daxdev, mp,
> +                               &xfs_dax_holder_operations);
> +       }
>
>         /*
>          * Buffer IO error rate limiting. Limit it to no more than 10 messages
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index 33e26690a8c4..d4d36c5bef11 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -542,6 +542,9 @@ xfs_do_force_shutdown(
>         } else if (flags & SHUTDOWN_CORRUPT_INCORE) {
>                 tag = XFS_PTAG_SHUTDOWN_CORRUPT;
>                 why = "Corruption of in-memory data";
> +       } else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
> +               tag = XFS_PTAG_SHUTDOWN_CORRUPT;
> +               why = "Corruption of on-disk metadata";
>         } else {
>                 tag = XFS_PTAG_SHUTDOWN_IOERROR;
>                 why = "Metadata I/O Error";
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 00720a02e761..47ff4ac53c4c 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -435,6 +435,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
>  #define SHUTDOWN_LOG_IO_ERROR  0x0002  /* write attempt to the log failed */
>  #define SHUTDOWN_FORCE_UMOUNT  0x0004  /* shutdown from a forced unmount */
>  #define SHUTDOWN_CORRUPT_INCORE        0x0008  /* corrupt in-memory data structures */
> +#define SHUTDOWN_CORRUPT_ONDISK        0x0010  /* corrupt metadata on device */
>
>  #define XFS_SHUTDOWN_STRINGS \
>         { SHUTDOWN_META_IO_ERROR,       "metadata_io" }, \
> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> new file mode 100644
> index 000000000000..6abaa043f4bc
> --- /dev/null
> +++ b/fs/xfs/xfs_notify_failure.c
> @@ -0,0 +1,222 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2021 Fujitsu.  All Rights Reserved.
> + */
> +
> +#include "xfs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_alloc.h"
> +#include "xfs_bit.h"
> +#include "xfs_btree.h"
> +#include "xfs_inode.h"
> +#include "xfs_icache.h"
> +#include "xfs_rmap.h"
> +#include "xfs_rmap_btree.h"
> +#include "xfs_rtalloc.h"
> +#include "xfs_trans.h"
> +
> +#include <linux/mm.h>
> +#include <linux/dax.h>
> +
> +struct failure_info {
> +       xfs_agblock_t           startblock;
> +       xfs_extlen_t            blockcount;
> +       int                     mf_flags;
> +};
> +
> +#if IS_ENABLED(CONFIG_MEMORY_FAILURE) && IS_ENABLED(CONFIG_FS_DAX)
> +static pgoff_t
> +xfs_failure_pgoff(
> +       struct xfs_mount                *mp,
> +       const struct xfs_rmap_irec      *rec,
> +       const struct failure_info       *notify)
> +{
> +       uint64_t                        pos = rec->rm_offset;
> +
> +       if (notify->startblock > rec->rm_startblock)
> +               pos += XFS_FSB_TO_B(mp,
> +                               notify->startblock - rec->rm_startblock);
> +       return pos >> PAGE_SHIFT;
> +}
> +
> +static unsigned long
> +xfs_failure_pgcnt(
> +       struct xfs_mount                *mp,
> +       const struct xfs_rmap_irec      *rec,
> +       const struct failure_info       *notify)
> +{
> +       xfs_agblock_t                   end_rec;
> +       xfs_agblock_t                   end_notify;
> +       xfs_agblock_t                   start_cross;
> +       xfs_agblock_t                   end_cross;
> +
> +       start_cross = max(rec->rm_startblock, notify->startblock);
> +
> +       end_rec = rec->rm_startblock + rec->rm_blockcount;
> +       end_notify = notify->startblock + notify->blockcount;
> +       end_cross = min(end_rec, end_notify);
> +
> +       return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
> +}
> +
> +static int
> +xfs_dax_failure_fn(
> +       struct xfs_btree_cur            *cur,
> +       const struct xfs_rmap_irec      *rec,
> +       void                            *data)
> +{
> +       struct xfs_mount                *mp = cur->bc_mp;
> +       struct xfs_inode                *ip;
> +       struct failure_info             *notify = data;
> +       int                             error = 0;
> +
> +       if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
> +           (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
> +               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> +               return -EFSCORRUPTED;
> +       }
> +
> +       /* Get files that incore, filter out others that are not in use. */
> +       error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
> +                        0, &ip);
> +       /* Continue the rmap query if the inode isn't incore */
> +       if (error == -ENODATA)
> +               return 0;
> +       if (error)
> +               return error;
> +
> +       error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
> +                                 xfs_failure_pgoff(mp, rec, notify),
> +                                 xfs_failure_pgcnt(mp, rec, notify),
> +                                 notify->mf_flags);
> +       xfs_irele(ip);
> +       return error;
> +}
> +#else
> +static int
> +xfs_dax_failure_fn(
> +       struct xfs_btree_cur            *cur,
> +       const struct xfs_rmap_irec      *rec,
> +       void                            *data)
> +{
> +       struct xfs_mount                *mp = cur->bc_mp;
> +
> +       /* No other option besides shutting down the fs. */
> +       xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> +       return -EFSCORRUPTED;
> +}
> +#endif /* CONFIG_MEMORY_FAILURE && CONFIG_FS_DAX */
> +
> +static int
> +xfs_dax_notify_ddev_failure(
> +       struct xfs_mount        *mp,
> +       xfs_daddr_t             daddr,
> +       xfs_daddr_t             bblen,
> +       int                     mf_flags)
> +{
> +       struct xfs_trans        *tp = NULL;
> +       struct xfs_btree_cur    *cur = NULL;
> +       struct xfs_buf          *agf_bp = NULL;
> +       struct failure_info     notify;
> +       int                     error = 0;
> +       xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, daddr);
> +       xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, fsbno);
> +       xfs_fsblock_t           end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
> +       xfs_agnumber_t          end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
> +
> +       /*
> +        * Once a file is found by rmap, we take the intersection of two ranges:
> +        * notification range and file extent range, to make sure we won't go
> +        * out of scope.
> +        */
> +       notify.mf_flags = mf_flags;
> +       notify.startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
> +       notify.blockcount = XFS_BB_TO_FSB(mp, bblen);
> +
> +       error = xfs_trans_alloc_empty(mp, &tp);
> +       if (error)
> +               return error;
> +
> +       for (; agno <= end_agno; agno++) {
> +               struct xfs_rmap_irec    ri_low = { };
> +               struct xfs_rmap_irec    ri_high;
> +
> +               error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> +               if (error)
> +                       break;
> +
> +               cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agf_bp->b_pag);
> +
> +               /*
> +                * Set the rmap range from ri_low to ri_high, which represents
> +                * a [start, end] where we looking for the files or metadata.
> +                * The part of range out of a AG will be ignored.  So, it's fine
> +                * to set ri_low to "startblock" in all loops.  When it reaches
> +                * the last AG, set the ri_high to "endblock" to make sure we
> +                * actually end at the end.
> +                */
> +               memset(&ri_high, 0xFF, sizeof(ri_high));
> +               ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
> +               if (agno == end_agno)
> +                       ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
> +
> +               error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
> +                               xfs_dax_failure_fn, &notify);
> +               xfs_btree_del_cursor(cur, error);
> +               xfs_trans_brelse(tp, agf_bp);
> +               if (error)
> +                       break;
> +
> +               fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
> +       }
> +
> +       xfs_trans_cancel(tp);
> +       return error;
> +}
> +
> +static int
> +xfs_dax_notify_failure(
> +       struct dax_device       *dax_dev,
> +       u64                     offset,
> +       u64                     len,
> +       int                     mf_flags)
> +{
> +       struct xfs_mount        *mp = dax_get_holder(dax_dev);
> +
> +       if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
> +               xfs_warn(mp,
> +                        "notify_failure() not supported on realtime device!");
> +               return -EOPNOTSUPP;
> +       }
> +
> +       if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
> +           mp->m_logdev_targp != mp->m_ddev_targp) {
> +               xfs_err(mp, "ondisk log corrupt, shutting down fs!");
> +               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> +               return -EFSCORRUPTED;
> +       }
> +
> +       if (!xfs_has_rmapbt(mp)) {
> +               xfs_warn(mp, "notify_failure() needs rmapbt enabled!");

Doesn't this need to be resolved at mount time?

> +               return -EOPNOTSUPP;
> +       }
> +
> +       if (offset < mp->m_ddev_targp->bt_dax_part_off ||
> +           ((offset + len) > mp->m_ddev_targp->bt_bdev->bd_nr_sectors <<
> +                               SECTOR_SHIFT)) {

With the removal of partition support, bt_dax_part_off can never be
non-zero, and the offset / len validation should be done against the
boundaries of the dax device in terms of physical page offset and
nr_pages.

> +               xfs_warn(mp, "notify_failure() goes out of the scope.");
> +               return -ENXIO;
> +       }
> +
> +       offset -= mp->m_ddev_targp->bt_dax_part_off;
> +       return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
> +                       mf_flags);

Same here: all offset adjustment code can be dropped, because failure
notification should be disabled at mount time if the mount point is
not associated with a whole disk device.

> +}
> +
> +const struct dax_holder_operations xfs_dax_holder_operations = {
> +       .notify_failure         = xfs_dax_notify_failure,
> +};
> diff --git a/fs/xfs/xfs_notify_failure.h b/fs/xfs/xfs_notify_failure.h
> new file mode 100644
> index 000000000000..f40cb315e7ce
> --- /dev/null
> +++ b/fs/xfs/xfs_notify_failure.h
> @@ -0,0 +1,10 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2021 Fujitsu.  All Rights Reserved.
> + */
> +#ifndef __XFS_NOTIFY_FAILURE_H__
> +#define __XFS_NOTIFY_FAILURE_H__
> +
> +extern const struct dax_holder_operations xfs_dax_holder_operations;
> +
> +#endif  /* __XFS_NOTIFY_FAILURE_H__ */
> --
> 2.34.1
>
>
>


  parent reply	other threads:[~2022-02-16  1:57 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-27 12:40 [PATCH v10 0/9] fsdax: introduce fs query to support reflink Shiyang Ruan
2022-01-27 12:40 ` [PATCH v10 1/9] dax: Introduce holder for dax_device Shiyang Ruan
2022-01-27 16:13   ` kernel test robot
2022-01-27 16:44   ` kernel test robot
2022-02-02 13:03   ` Christoph Hellwig
2022-02-13 12:58     ` [PATCH v10.1 " Shiyang Ruan
2022-02-15 22:06       ` Dan Williams
2022-01-27 12:40 ` [PATCH v10 2/9] mm: factor helpers for memory_failure_dev_pagemap Shiyang Ruan
2022-02-01 21:03   ` Matthew Wilcox
2022-02-15 22:11   ` Dan Williams
2022-01-27 12:40 ` [PATCH v10 3/9] pagemap,pmem: Introduce ->memory_failure() Shiyang Ruan
2022-02-15 22:38   ` Dan Williams
2022-01-27 12:40 ` [PATCH v10 4/9] fsdax: fix function description Shiyang Ruan
2022-02-02 13:04   ` Christoph Hellwig
2022-02-15 23:51     ` Dan Williams
2022-01-27 12:40 ` [PATCH v10 5/9] fsdax: Introduce dax_load_page() Shiyang Ruan
2022-02-16  1:34   ` Dan Williams
2022-02-16  3:02     ` Shiyang Ruan
2022-02-16  3:07       ` Dan Williams
2022-01-27 12:40 ` [PATCH v10 6/9] mm: move pgoff_address() to vma_pgoff_address() Shiyang Ruan
2022-02-16  1:37   ` Dan Williams
2022-01-27 12:40 ` [PATCH v10 7/9] mm: Introduce mf_dax_kill_procs() for fsdax case Shiyang Ruan
2022-02-16  1:47   ` Dan Williams
2022-02-16  1:49   ` Dan Williams
2022-01-27 12:40 ` [PATCH v10 8/9] xfs: Implement ->notify_failure() for XFS Shiyang Ruan
2022-01-27 17:56   ` kernel test robot
2022-01-27 19:39   ` kernel test robot
2022-02-01 20:41   ` Darrick J. Wong
2022-02-13 13:02     ` [PATCH v10.1 " Shiyang Ruan
2022-02-15  1:46       ` Darrick J. Wong
2022-02-15  9:42         ` Shiyang Ruan
2022-02-16  1:56   ` Dan Williams [this message]
2022-01-27 12:40 ` [PATCH v10 9/9] fsdax: set a CoW flag when associate reflink mappings Shiyang Ruan
2022-02-16  2:09   ` Dan Williams
2022-02-16  2:55     ` Shiyang Ruan
2022-02-16  3:09       ` Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAPcyv4hBpHsPRXZKtHtN0hVQhjZspZBz9egO=wn+54KDJokStw@mail.gmail.com' \
    --to=dan.j.williams@intel.com \
    --cc=david@fromorbit.com \
    --cc=djwong@kernel.org \
    --cc=hch@infradead.org \
    --cc=jane.chu@oracle.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=nvdimm@lists.linux.dev \
    --cc=ruansy.fnst@fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.