From: Chaitanya Kulkarni <chaitanyak@nvidia.com>
To: Logan Gunthorpe <logang@deltatee.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"linux-nvme@lists.infradead.org" <linux-nvme@lists.infradead.org>,
	"linux-block@vger.kernel.org" <linux-block@vger.kernel.org>,
	"linux-pci@vger.kernel.org" <linux-pci@vger.kernel.org>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>
Cc: "Christoph Hellwig" <hch@lst.de>,
	"Greg Kroah-Hartman" <gregkh@linuxfoundation.org>,
	"Dan Williams" <dan.j.williams@intel.com>,
	"Jason Gunthorpe" <jgg@ziepe.ca>,
	"Christian König" <christian.koenig@amd.com>,
	"John Hubbard" <jhubbard@nvidia.com>,
	"Don Dutile" <ddutile@redhat.com>,
	"Matthew Wilcox" <willy@infradead.org>,
	"Daniel Vetter" <daniel.vetter@ffwll.ch>,
	"Minturn Dave B" <dave.b.minturn@intel.com>,
	"Jason Ekstrand" <jason@jlekstrand.net>,
	"Dave Hansen" <dave.hansen@linux.intel.com>,
	"Xiong Jianxin" <jianxin.xiong@intel.com>,
	"Bjorn Helgaas" <helgaas@kernel.org>,
	"Ira Weiny" <ira.weiny@intel.com>,
	"Robin Murphy" <robin.murphy@arm.com>,
	"Martin Oliveira" <martin.oliveira@eideticom.com>,
	"Chaitanya Kulkarni" <ckulkarnilinux@gmail.com>,
	"Ralph Campbell" <rcampbell@nvidia.com>,
	"Stephen Bates" <sbates@raithlin.com>,
	"Bjorn Helgaas" <bhelgaas@google.com>
Subject: Re: [PATCH v11 8/9] PCI/P2PDMA: Allow userspace VMA allocations through sysfs
Date: Tue, 25 Oct 2022 01:34:15 +0000
Message-ID: <b94f0533-7009-5415-4ed4-f7443fbf7a2f@nvidia.com>
In-Reply-To: <20221021174116.7200-9-logang@deltatee.com>

On 10/21/22 10:41, Logan Gunthorpe wrote:
> Create a sysfs bin attribute called "allocate" under the existing
> "p2pmem" group. The only allowable operation on this file is the mmap()
> call.
> 
> When mmap() is called on this attribute, the kernel allocates a chunk of
> memory from the genalloc and inserts the pages into the VMA. The
> dev_pagemap .page_free callback will indicate when these pages are no
> longer used and they will be put back into the genalloc.
> 
> On device unbind, remove the sysfs file before the memremap_pages are
> cleaned up. This ensures unmap_mapping_range() is called on the file's
> inode and no new mappings can be created.
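
For anyone wanting to exercise this from userspace, the whole interface
boils down to mmap()ing the new "allocate" attribute. Below is a minimal
sketch; the BDF in the sysfs path and the allocation size are only
illustrative, not taken from this series:

/*
 * Illustrative userspace example: map P2P memory through the "allocate"
 * attribute added by this patch.  Replace the device address with a real
 * p2pmem provider on your system.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/bus/pci/devices/0000:03:00.0/p2pmem/allocate";
	size_t len = 2 * 4096;		/* multiple of PAGE_SIZE */
	void *p;
	int fd;

	fd = open(path, O_RDWR);
	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	/*
	 * Only shared mappings at offset 0 pass the checks at the top of
	 * p2pmem_alloc_mmap() in the hunk below.
	 */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return EXIT_FAILURE;
	}

	/*
	 * The pages can now back O_DIRECT I/O to another device; they are
	 * returned to the genalloc pool via .page_free once unmapped.
	 */
	munmap(p, len);
	close(fd);
	return EXIT_SUCCESS;
}

The MAP_SHARED and zero-offset requirements mirror the VM_MAYSHARE and
vm_pgoff checks in the mmap handler quoted below.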
> 
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> Acked-by: Bjorn Helgaas <bhelgaas@google.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> ---
>   drivers/pci/p2pdma.c | 124 +++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 124 insertions(+)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 88dc66ee1c46..27539770a613 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -89,6 +89,90 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr,
>   }
>   static DEVICE_ATTR_RO(published);
>   
> +static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
> +		struct bin_attribute *attr, struct vm_area_struct *vma)
> +{
> +	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
> +	size_t len = vma->vm_end - vma->vm_start;
> +	struct pci_p2pdma *p2pdma;
> +	struct percpu_ref *ref;
> +	unsigned long vaddr;
> +	void *kaddr;
> +	int ret;
> +
> +	/* prevent private mappings from being established */
> +	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
> +		pci_info_ratelimited(pdev,
> +				     "%s: fail, attempted private mapping\n",
> +				     current->comm);
> +		return -EINVAL;
> +	}
> +
> +	if (vma->vm_pgoff) {
> +		pci_info_ratelimited(pdev,
> +				     "%s: fail, attempted mapping with non-zero offset\n",
> +				     current->comm);
> +		return -EINVAL;
> +	}
> +
> +	rcu_read_lock();
> +	p2pdma = rcu_dereference(pdev->p2pdma);
> +	if (!p2pdma) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref);
> +	if (!kaddr) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	/*
> +	 * vm_insert_page() can sleep, so take a reference on the pgmap's
> +	 * percpu_ref so that rcu_read_unlock() can be done before
> +	 * inserting the pages.
> +	 */
> +	if (unlikely(!percpu_ref_tryget_live_rcu(ref))) {
> +		ret = -ENODEV;
> +		goto out_free_mem;
> +	}
> +	rcu_read_unlock();
> +
> +	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
> +		ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
> +		if (ret) {
> +			gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
> +			return ret;
> +		}
> +		percpu_ref_get(ref);
> +		put_page(virt_to_page(kaddr));
> +		kaddr += PAGE_SIZE;
> +		len -= PAGE_SIZE;
> +	}
> +
> +	percpu_ref_put(ref);
> +
> +	return 0;
> +out_free_mem:
> +	gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
> +out:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
> +static struct bin_attribute p2pmem_alloc_attr = {
> +	.attr = { .name = "allocate", .mode = 0660 },
> +	.mmap = p2pmem_alloc_mmap,
> +	/*
> +	 * Some places where we want to call mmap (ie. python) will check
> +	 * that the file size is greater than the mmap size before allowing
> +	 * the mmap to continue. To work around this, just set the size
> +	 * to be very large.
> +	 */
> +	.size = SZ_1T,
> +};
> +
>   static struct attribute *p2pmem_attrs[] = {
>   	&dev_attr_size.attr,
>   	&dev_attr_available.attr,
> @@ -96,11 +180,32 @@ static struct attribute *p2pmem_attrs[] = {
>   	NULL,
>   };
>   
> +static struct bin_attribute *p2pmem_bin_attrs[] = {
> +	&p2pmem_alloc_attr,
> +	NULL,
> +};
> +
>   static const struct attribute_group p2pmem_group = {
>   	.attrs = p2pmem_attrs,
> +	.bin_attrs = p2pmem_bin_attrs,
>   	.name = "p2pmem",
>   };
>   
> +static void p2pdma_page_free(struct page *page)
> +{
> +	struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
> +	struct percpu_ref *ref;
> +
> +	gen_pool_free_owner(pgmap->provider->p2pdma->pool,
> +			    (uintptr_t)page_to_virt(page), PAGE_SIZE,
> +			    (void **)&ref);
> +	percpu_ref_put(ref);
> +}
> +
> +static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
> +	.page_free = p2pdma_page_free,
> +};
> +
>   static void pci_p2pdma_release(void *data)
>   {
>   	struct pci_dev *pdev = data;
> @@ -152,6 +257,19 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
>   	return error;
>   }
>   
> +static void pci_p2pdma_unmap_mappings(void *data)
> +{
> +	struct pci_dev *pdev = data;
> +
> +	/*
> +	 * Removing the alloc attribute from sysfs will call
> +	 * unmap_mapping_range() on the inode, teardown any existing userspace
> +	 * mappings and prevent new ones from being created.
> +	 */
> +	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
> +				     p2pmem_group.name);
> +}
> +
>   /**
>    * pci_p2pdma_add_resource - add memory for use as p2p memory
>    * @pdev: the device to add the memory to
> @@ -198,6 +316,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
>   	pgmap->range.end = pgmap->range.start + size - 1;
>   	pgmap->nr_range = 1;
>   	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
> +	pgmap->ops = &p2pdma_pgmap_ops;
>   
>   	p2p_pgmap->provider = pdev;
>   	p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) -
> @@ -209,6 +328,11 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
>   		goto pgmap_free;
>   	}
>   
> +	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
> +					 pdev);
> +	if (error)
> +		goto pages_free;
> +
>   	p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
>   	error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
>   			pci_bus_address(pdev, bar) + offset,

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck


