From: Dan Williams <dan.j.williams@intel.com>
To: linux-nvdimm@lists.01.org
Cc: linux-xfs@vger.kernel.org, Jan Kara <jack@suse.cz>,
"Darrick J. Wong" <darrick.wong@oracle.com>,
linux-rdma@vger.kernel.org, linux-api@vger.kernel.org,
Dave Chinner <david@fromorbit.com>,
Christoph Hellwig <hch@lst.de>,
"J. Bruce Fields" <bfields@fieldses.org>,
linux-mm@kvack.org, Jeff Moyer <jmoyer@redhat.com>,
linux-fsdevel@vger.kernel.org,
Jeff Layton <jlayton@poochiereds.net>,
Ross Zwisler <ross.zwisler@linux.intel.com>
Subject: [PATCH v7 04/12] fs: MAP_DIRECT core
Date: Fri, 06 Oct 2017 15:35:38 -0700 [thread overview]
Message-ID: <150732933864.22363.2459100387849051724.stgit@dwillia2-desk3.amr.corp.intel.com> (raw)
In-Reply-To: <150732931273.22363.8436792888326501071.stgit@dwillia2-desk3.amr.corp.intel.com>
Introduce a set of helper APIs for filesystems to establish FL_LAYOUT
leases to protect against writes and block map updates while a
MAP_DIRECT mapping is established. While the lease protects against the
syscall write path and fallocate, it does not protect against allocating
write-faults, so this relies on i_mapdcount to disable block map updates
from write faults.
Like the pNFS case, MAP_DIRECT does its own timeout of the lease since we
need to have a process context for running map_direct_invalidate().
Cc: Jan Kara <jack@suse.cz>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Jeff Layton <jlayton@poochiereds.net>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
fs/Makefile | 2
fs/mapdirect.c | 232 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/mapdirect.h | 45 +++++++++
3 files changed, 278 insertions(+), 1 deletion(-)
create mode 100644 fs/mapdirect.c
create mode 100644 include/linux/mapdirect.h
diff --git a/fs/Makefile b/fs/Makefile
index 7bbaca9c67b1..c0e791d235d8 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -29,7 +29,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
-obj-$(CONFIG_FS_DAX) += dax.o
+obj-$(CONFIG_FS_DAX) += dax.o mapdirect.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
diff --git a/fs/mapdirect.c b/fs/mapdirect.c
new file mode 100644
index 000000000000..9ac7c1d946a2
--- /dev/null
+++ b/fs/mapdirect.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/mapdirect.h>
+#include <linux/workqueue.h>
+#include <linux/signal.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#define MAPDIRECT_BREAK 0 /* mds_state bit: set by map_direct_lm_break() and put_map_direct_vma() */
+#define MAPDIRECT_VALID 1 /* mds_state bit: set at registration, cleared by map_direct_invalidate() */
+
+struct map_direct_state { /* bookkeeping for one map_direct_register() call */
+ atomic_t mds_ref; /* lifetime count; put_map_direct() frees the struct at zero */
+ atomic_t mds_vmaref; /* count managed by get_map_direct_vma()/put_map_direct_vma() */
+ unsigned long mds_state; /* MAPDIRECT_* bit flags */
+ struct inode *mds_inode; /* inode of the mapped file; held via ihold() at registration */
+ struct delayed_work mds_work; /* runs map_direct_invalidate() */
+ struct fasync_struct *mds_fa; /* allocated at registration, handed to fasync_insert_entry() in map_direct_lm_setup() */
+ struct vm_area_struct *mds_vma; /* registered vma; reset to NULL on the final put_map_direct_vma() */
+};
+
+/**
+ * is_map_direct_valid - query whether a MAP_DIRECT registration is still valid
+ * @mds: MAP_DIRECT state to inspect
+ *
+ * Return: true while the MAPDIRECT_VALID bit is set in @mds->mds_state,
+ * false once it has been cleared.
+ */
+bool is_map_direct_valid(struct map_direct_state *mds)
+{
+	const unsigned long *state = &mds->mds_state;
+
+	return test_bit(MAPDIRECT_VALID, state);
+}
+EXPORT_SYMBOL_GPL(is_map_direct_valid);
+
+/*
+ * Drop one reference on @mds. The structure is freed when the count in
+ * mds_ref reaches zero.
+ */
+static void put_map_direct(struct map_direct_state *mds)
+{
+	if (atomic_dec_and_test(&mds->mds_ref))
+		kfree(mds);
+}
+
+/**
+ * put_map_direct_vma - drop a vma reference on MAP_DIRECT state
+ * @mds: MAP_DIRECT state whose mds_vmaref is decremented
+ *
+ * When the last vma reference goes away the lease is unlocked, any
+ * pending invalidation work is flushed, the inode reference is dropped
+ * and one mds_ref reference is released.
+ *
+ * Return: 1 if this call dropped the final vma reference, 0 otherwise.
+ */
+int put_map_direct_vma(struct map_direct_state *mds)
+{
+	struct vm_area_struct *vma = mds->mds_vma;
+	struct file *filp = vma->vm_file;
+	struct inode *inode = file_inode(filp);
+	void *lease_owner = mds;
+
+	if (atomic_dec_and_test(&mds->mds_vmaref)) {
+		/*
+		 * The vma is going away: clear the back-pointer and mark
+		 * the state as breaking, then drop the lease and wait for
+		 * any in-flight or forced lm_break work that could still
+		 * be looking at this vma.
+		 */
+		mds->mds_vma = NULL;
+		set_bit(MAPDIRECT_BREAK, &mds->mds_state);
+		vfs_setlease(filp, F_UNLCK, NULL, &lease_owner);
+		flush_delayed_work(&mds->mds_work);
+		iput(inode);
+
+		put_map_direct(mds);
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(put_map_direct_vma);
+
+/**
+ * get_map_direct_vma - take an additional vma reference on MAP_DIRECT state
+ * @mds: MAP_DIRECT state whose mds_vmaref is incremented
+ *
+ * Each call is balanced by a put_map_direct_vma().
+ */
+void get_map_direct_vma(struct map_direct_state *mds)
+{
+	atomic_t *vmaref = &mds->mds_vmaref;
+
+	atomic_inc(vmaref);
+}
+EXPORT_SYMBOL_GPL(get_map_direct_vma);
+
+/*
+ * map_direct_invalidate - delayed work that completes a lease break
+ * @work: the work_struct embedded in map_direct_state.mds_work
+ *
+ * Scheduled by map_direct_lm_break(). Clears MAPDIRECT_VALID, unmaps the
+ * file range covered by the registered vma, releases the lease and drops
+ * one mds_ref reference.
+ */
+static void map_direct_invalidate(struct work_struct *work)
+{
+	struct map_direct_state *mds;
+	struct vm_area_struct *vma;
+
+	mds = container_of(work, typeof(*mds), mds_work.work);
+
+	clear_bit(MAPDIRECT_VALID, &mds->mds_state);
+
+	/* put_map_direct_vma() may reset mds_vma to NULL; sample it once */
+	vma = ACCESS_ONCE(mds->mds_vma);
+	if (vma) {
+		unsigned long len = vma->vm_end - vma->vm_start;
+		loff_t start = (loff_t) vma->vm_pgoff * PAGE_SIZE;
+		void *owner = mds;
+
+		unmap_mapping_range(mds->mds_inode->i_mapping, start, len, 1);
+		vfs_setlease(vma->vm_file, F_UNLCK, NULL, &owner);
+	}
+	/*
+	 * With no vma there is no file to unlock through, and nothing to
+	 * do: put_map_direct_vma() clears mds_vma and then issues the
+	 * F_UNLCK itself before flushing this work. Dereferencing @vma
+	 * here would be a NULL pointer dereference.
+	 */
+
+	put_map_direct(mds);
+}
+
+/*
+ * map_direct_lm_break - lm_break callback for the MAP_DIRECT lease
+ * @fl: the lease being broken; fl_owner is the map_direct_state
+ *
+ * Invalidating the mapping requires sleeping locks, so the work is
+ * deferred to mds_work using the timeout the file-locks core would have
+ * applied (lease_break_time). Only the first break request schedules the
+ * work and raises SIGIO; later ones are no-ops.
+ *
+ * This relies on i_mapdcount to fend off block-map modifying write
+ * faults, because leases cannot be used in that path due to locking
+ * constraints.
+ *
+ * Always returns false.
+ */
+static bool map_direct_lm_break(struct file_lock *fl)
+{
+	struct map_direct_state *mds = fl->fl_owner;
+	bool already_breaking;
+
+	already_breaking = test_and_set_bit(MAPDIRECT_BREAK, &mds->mds_state);
+	if (!already_breaking) {
+		schedule_delayed_work(&mds->mds_work, lease_break_time * HZ);
+		kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
+	}
+
+	/*
+	 * A zero break time tells the core lease code to wait until the
+	 * delayed work has finished the invalidation and lease unlock.
+	 */
+	fl->fl_break_time = 0;
+
+	return false;
+}
+
+/*
+ * map_direct_lm_change - lm_change callback for the MAP_DIRECT lease
+ * @fl: lease being modified; fl_owner is the map_direct_state
+ * @arg: requested lease type, expected to carry F_UNLCK
+ * @dispose: list handed through to lease_modify()
+ *
+ * Warns if @arg is not an unlock, decrements i_mapdcount on the inode and
+ * returns the result of lease_modify().
+ */
+static int map_direct_lm_change(struct file_lock *fl, int arg,
+		struct list_head *dispose)
+{
+	struct map_direct_state *mds = fl->fl_owner;
+	struct inode *inode = mds->mds_inode;
+
+	WARN_ON((arg & F_UNLCK) == 0);
+	i_mapdcount_dec(inode);
+
+	return lease_modify(fl, arg, dispose);
+}
+
+/*
+ * map_direct_lm_setup - lm_setup callback for the MAP_DIRECT lease
+ * @fl: the lease being set up
+ * @priv: in: pointer to the map_direct_state; out: NULL if the state's
+ *        fasync entry was consumed
+ *
+ * Hooks the pre-allocated fasync entry onto the lease and makes the
+ * current task the owner of the file for signal delivery.
+ */
+static void map_direct_lm_setup(struct file_lock *fl, void **priv)
+{
+	struct map_direct_state *mds = *priv;
+	struct fasync_struct *new_fa = mds->mds_fa;
+	struct file *filp = fl->fl_file;
+	struct fasync_struct *old_fa;
+
+	/*
+	 * As in lease_setup(): fasync_insert_entry() hands back the old
+	 * entry when one exists. With no old entry, @new_fa was inserted
+	 * into the fasync list, so clear *priv to signal that it must not
+	 * be freed by the caller.
+	 */
+	old_fa = fasync_insert_entry(new_fa->fa_fd, filp, &fl->fl_fasync,
+			new_fa);
+	if (!old_fa)
+		*priv = NULL;
+
+	__f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+}
+
+static const struct lock_manager_operations map_direct_lm_ops = { /* callbacks for the lease taken in map_direct_register() */
+ .lm_break = map_direct_lm_break, /* schedules map_direct_invalidate() and raises SIGIO */
+ .lm_change = map_direct_lm_change, /* decrements i_mapdcount, then lease_modify() */
+ .lm_setup = map_direct_lm_setup, /* inserts the fasync entry and sets the file owner */
+};
+
+struct map_direct_state *map_direct_register(int fd, struct vm_area_struct *vma)
+{ /*
+ * Set up MAP_DIRECT tracking for @vma: allocate the map_direct_state,
+ * pin the inode of vma->vm_file, allocate a fasync entry recording @fd,
+ * and take an FL_LAYOUT / F_RDLCK lease on the file using
+ * map_direct_lm_ops. On success i_mapdcount is incremented and the new
+ * state is returned. On failure everything is unwound and an ERR_PTR()
+ * is returned: -ENOMEM for allocation failures, the vfs_setlease() error
+ * code, or -ENXIO if vfs_setlease() reported success but left @fl set.
+ */
+ struct map_direct_state *mds = kzalloc(sizeof(*mds), GFP_KERNEL);
+ struct file *file = vma->vm_file;
+ struct inode *inode = file_inode(file);
+ struct fasync_struct *fa;
+ struct file_lock *fl;
+ void *owner = mds; /* passed as the lease "priv"; map_direct_lm_setup() may reset it to NULL */
+ int rc = -ENOMEM;
+
+ if (!mds)
+ return ERR_PTR(-ENOMEM);
+
+ mds->mds_vma = vma;
+ atomic_set(&mds->mds_ref, 1);
+ atomic_set(&mds->mds_vmaref, 1);
+ set_bit(MAPDIRECT_VALID, &mds->mds_state);
+ mds->mds_inode = inode;
+ ihold(inode); /* released by iput() in put_map_direct_vma() or in the error path below */
+ INIT_DELAYED_WORK(&mds->mds_work, map_direct_invalidate);
+
+ fa = fasync_alloc();
+ if (!fa)
+ goto err_fasync_alloc;
+ mds->mds_fa = fa;
+ fa->fa_fd = fd;
+
+ fl = locks_alloc_lock();
+ if (!fl)
+ goto err_lock_alloc;
+
+ locks_init_lock(fl);
+ fl->fl_lmops = &map_direct_lm_ops;
+ fl->fl_flags = FL_LAYOUT;
+ fl->fl_type = F_RDLCK;
+ fl->fl_end = OFFSET_MAX;
+ fl->fl_owner = mds;
+ atomic_inc(&mds->mds_ref); /* NOTE(review): presumably the reference dropped by map_direct_invalidate() - confirm */
+ fl->fl_pid = current->tgid;
+ fl->fl_file = file;
+
+ rc = vfs_setlease(file, fl->fl_type, &fl, &owner);
+ if (rc)
+ goto err_setlease;
+ if (fl) { /* success but @fl was handed back: treated as unexpected, so unlock and fail */
+ WARN_ON(1);
+ owner = mds;
+ vfs_setlease(file, F_UNLCK, NULL, &owner);
+ owner = NULL; /* skip fasync_free() in the unwind below */
+ rc = -ENXIO;
+ goto err_setlease;
+ }
+
+ i_mapdcount_inc(inode); /* balanced by i_mapdcount_dec() in map_direct_lm_change() */
+ return mds;
+
+err_setlease:
+ locks_free_lock(fl);
+err_lock_alloc:
+ /* if owner is NULL then the lease machinery is responsible for @fa */
+ if (owner)
+ fasync_free(fa);
+err_fasync_alloc:
+ iput(inode);
+ kfree(mds);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_GPL(map_direct_register);
diff --git a/include/linux/mapdirect.h b/include/linux/mapdirect.h
new file mode 100644
index 000000000000..724e27d8615e
--- /dev/null
+++ b/include/linux/mapdirect.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __MAPDIRECT_H__
+#define __MAPDIRECT_H__
+#include <linux/err.h>
+
+struct inode;
+struct work_struct;
+struct vm_area_struct;
+struct map_direct_state;
+
+#if IS_ENABLED(CONFIG_FS_DAX)
+/* Real implementations live in fs/mapdirect.c */
+struct map_direct_state *map_direct_register(int fd, struct vm_area_struct *vma);
+int put_map_direct_vma(struct map_direct_state *mds);
+void get_map_direct_vma(struct map_direct_state *mds);
+bool is_map_direct_valid(struct map_direct_state *mds);
+#else
+/*
+ * Stubs for kernels built without CONFIG_FS_DAX. Every stub must be
+ * 'static inline': a plain function definition in a header would be
+ * emitted by each file that includes it and break the link with
+ * multiple-definition errors.
+ */
+
+/* Registration is not supported without FS_DAX. */
+static inline struct map_direct_state *map_direct_register(int fd,
+		struct vm_area_struct *vma)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+/* Nothing to release; reports that no final reference was dropped. */
+static inline int put_map_direct_vma(struct map_direct_state *mds)
+{
+	return 0;
+}
+
+/* No-op: there is no state to reference. */
+static inline void get_map_direct_vma(struct map_direct_state *mds)
+{
+}
+
+/* A registration can never be valid when none can be created. */
+static inline bool is_map_direct_valid(struct map_direct_state *mds)
+{
+	return false;
+}
+#endif
+#endif /* __MAPDIRECT_H__ */
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2017-10-06 22:42 UTC|newest]
Thread overview: 49+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-10-06 22:35 [PATCH v7 00/12] MAP_DIRECT for DAX RDMA and userspace flush Dan Williams
2017-10-06 22:35 ` [PATCH v7 01/12] mm: introduce MAP_SHARED_VALIDATE, a mechanism to safely define new mmap flags Dan Williams
2017-10-06 22:35 ` [PATCH v7 02/12] fs, mm: pass fd to ->mmap_validate() Dan Williams
2017-10-06 22:35 ` [PATCH v7 03/12] fs: introduce i_mapdcount Dan Williams
2017-10-09 3:08 ` Dave Chinner
2017-10-06 22:35 ` Dan Williams [this message]
2017-10-06 22:35 ` [PATCH v7 05/12] xfs: prepare xfs_break_layouts() for reuse with MAP_DIRECT Dan Williams
2017-10-06 22:35 ` [PATCH v7 06/12] xfs: wire up MAP_DIRECT Dan Williams
2017-10-09 3:40 ` Dave Chinner
2017-10-09 17:08 ` Dan Williams
2017-10-09 22:50 ` Dave Chinner
2017-10-06 22:35 ` [PATCH v7 07/12] dma-mapping: introduce dma_has_iommu() Dan Williams
2017-10-06 22:45 ` David Woodhouse
2017-10-06 22:52 ` Dan Williams
2017-10-06 23:10 ` David Woodhouse
2017-10-06 23:15 ` Dan Williams
2017-10-07 11:08 ` David Woodhouse
2017-10-07 23:33 ` Dan Williams
2017-10-06 23:12 ` Dan Williams
2017-10-08 3:45 ` [PATCH v8] dma-mapping: introduce dma_get_iommu_domain() Dan Williams
2017-10-09 10:37 ` Robin Murphy
2017-10-09 17:32 ` Dan Williams
2017-10-10 14:40 ` Raj, Ashok
2017-10-09 18:58 ` [PATCH v7 07/12] dma-mapping: introduce dma_has_iommu() Jason Gunthorpe
2017-10-09 19:05 ` Dan Williams
2017-10-09 19:18 ` Jason Gunthorpe
2017-10-09 19:28 ` Dan Williams
2017-10-10 17:25 ` Jason Gunthorpe
2017-10-10 17:39 ` Dan Williams
2017-10-10 18:05 ` Jason Gunthorpe
2017-10-10 20:17 ` Dan Williams
2017-10-12 18:27 ` Jason Gunthorpe
2017-10-12 20:10 ` Dan Williams
2017-10-13 6:50 ` Christoph Hellwig
2017-10-13 15:03 ` Jason Gunthorpe
2017-10-15 15:14 ` Matan Barak
2017-10-15 15:21 ` Dan Williams
2017-10-13 7:09 ` Christoph Hellwig
2017-10-06 22:36 ` [PATCH v7 08/12] fs, mapdirect: introduce ->lease_direct() Dan Williams
2017-10-06 22:36 ` [PATCH v7 09/12] xfs: wire up ->lease_direct() Dan Williams
2017-10-09 3:45 ` Dave Chinner
2017-10-09 17:10 ` Dan Williams
2017-10-06 22:36 ` [PATCH v7 10/12] device-dax: " Dan Williams
2017-10-06 22:36 ` [PATCH v7 11/12] IB/core: use MAP_DIRECT to fix / enable RDMA to DAX mappings Dan Williams
2017-10-08 4:02 ` [PATCH v8 1/2] iommu: up-level sg_num_pages() from amd-iommu Dan Williams
2017-10-08 4:04 ` [PATCH v8 2/2] IB/core: use MAP_DIRECT to fix / enable RDMA to DAX mappings Dan Williams
2017-10-08 6:45 ` kbuild test robot
2017-10-08 15:49 ` Dan Williams
2017-10-06 22:36 ` [PATCH v7 12/12] tools/testing/nvdimm: enable rdma unit tests Dan Williams
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=150732933864.22363.2459100387849051724.stgit@dwillia2-desk3.amr.corp.intel.com \
--to=dan.j.williams@intel.com \
--cc=bfields@fieldses.org \
--cc=darrick.wong@oracle.com \
--cc=david@fromorbit.com \
--cc=hch@lst.de \
--cc=jack@suse.cz \
--cc=jlayton@poochiereds.net \
--cc=jmoyer@redhat.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-nvdimm@lists.01.org \
--cc=linux-rdma@vger.kernel.org \
--cc=linux-xfs@vger.kernel.org \
--cc=ross.zwisler@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox