* [PATCH v1 1/3] mm/gup: Introduce pin_user_pages_fd() for pinning shmem/hugetlbfs file pages
2023-10-03 7:44 [PATCH v1 0/3] mm/gup: Introduce pin_user_pages_fd() for pinning shmem/hugetlbfs file pages Vivek Kasireddy
@ 2023-10-03 7:44 ` Vivek Kasireddy
2023-10-06 8:03 ` David Hildenbrand
2023-10-10 13:51 ` Jason Gunthorpe
2023-10-03 7:44 ` [PATCH v1 2/3] udmabuf: Pin the pages using pin_user_pages_fd() API Vivek Kasireddy
2023-10-03 7:44 ` [PATCH v1 3/3] selftests/dma-buf/udmabuf: Add tests to verify data after page migration Vivek Kasireddy
2 siblings, 2 replies; 8+ messages in thread
From: Vivek Kasireddy @ 2023-10-03 7:44 UTC (permalink / raw)
To: dri-devel, linux-mm
Cc: Vivek Kasireddy, David Hildenbrand, Daniel Vetter, Mike Kravetz,
Hugh Dickins, Peter Xu, Gerd Hoffmann, Dongwon Kim,
Junxiao Chang, Jason Gunthorpe
For drivers that would like to longterm-pin the pages associated
with a file, the pin_user_pages_fd() API provides an option to
not only FOLL_PIN the pages but also to check and migrate them
if they reside in movable zone or CMA block. For now, this API
can only work with files belonging to shmem or hugetlbfs given
that the udmabuf driver is the only user.
It must be noted that the pages associated with hugetlbfs files
are expected to be found in the page cache. An error is returned
if they are not found. However, shmem pages can be swapped in or
allocated if they are not present in the page cache.
Cc: David Hildenbrand <david@redhat.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Cc: Junxiao Chang <junxiao.chang@intel.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
---
include/linux/mm.h | 2 ++
mm/gup.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 89 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..af2121fb8101 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2457,6 +2457,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
+long pin_user_pages_fd(int fd, pgoff_t start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
diff --git a/mm/gup.c b/mm/gup.c
index 2f8a2d89fde1..e34b77a15fa8 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -3400,3 +3400,90 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
&locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
+
+/**
+ * pin_user_pages_fd() - pin user pages associated with a file
+ * @fd: the fd whose pages are to be pinned
+ * @start: starting file offset
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin (and migrate) pages associated with a file belonging to
+ * either shmem or hugetlbfs. An error is returned if pages associated with
+ * hugetlbfs files are not present in the page cache. However, shmem pages
+ * are swapped in or allocated if they are not present in the page cache.
+ *
+ * Returns number of pages pinned. This would be equal to the number of
+ * pages requested.
+ * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
+ * -errno.
+ */
+long pin_user_pages_fd(int fd, pgoff_t start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ struct page *page;
+ struct file *filep;
+ unsigned int flags, i;
+ long ret;
+
+ if (nr_pages <= 0)
+ return 0;
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
+ return 0;
+
+ if (start < 0)
+ return -EINVAL;
+
+ filep = fget(fd);
+ if (!filep)
+ return -EINVAL;
+
+ if (!shmem_file(filep) && !is_file_hugepages(filep))
+ return -EINVAL;
+
+ flags = memalloc_pin_save();
+ do {
+ for (i = 0; i < nr_pages; i++) {
+ if (shmem_mapping(filep->f_mapping)) {
+ page = shmem_read_mapping_page(filep->f_mapping,
+ start + i);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ goto err;
+ }
+ } else {
+ page = find_get_page_flags(filep->f_mapping,
+ start + i,
+ FGP_ACCESSED);
+ if (!page) {
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+ ret = try_grab_page(page, FOLL_PIN);
+ if (unlikely(ret))
+ goto err;
+
+ pages[i] = page;
+ put_page(pages[i]);
+ }
+
+ ret = check_and_migrate_movable_pages(nr_pages, pages);
+ } while (ret == -EAGAIN);
+
+err:
+ memalloc_pin_restore(flags);
+ fput(filep);
+ if (!ret)
+ return nr_pages;
+
+ while (i > 0 && pages[--i]) {
+ unpin_user_page(pages[i]);
+ pages[i] = NULL;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fd);
+
--
2.39.2
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v1 1/3] mm/gup: Introduce pin_user_pages_fd() for pinning shmem/hugetlbfs file pages
2023-10-03 7:44 ` [PATCH v1 1/3] " Vivek Kasireddy
@ 2023-10-06 8:03 ` David Hildenbrand
2023-10-06 18:43 ` Jason Gunthorpe
2023-10-17 7:39 ` Kasireddy, Vivek
2023-10-10 13:51 ` Jason Gunthorpe
1 sibling, 2 replies; 8+ messages in thread
From: David Hildenbrand @ 2023-10-06 8:03 UTC (permalink / raw)
To: Vivek Kasireddy, dri-devel, linux-mm, Mike Kravetz
Cc: Daniel Vetter, Hugh Dickins, Peter Xu, Gerd Hoffmann,
Dongwon Kim, Junxiao Chang, Jason Gunthorpe
On 03.10.23 09:44, Vivek Kasireddy wrote:
> For drivers that would like to longterm-pin the pages associated
> with a file, the pin_user_pages_fd() API provides an option to
> not only FOLL_PIN the pages but also to check and migrate them
> if they reside in movable zone or CMA block. For now, this API
> can only work with files belonging to shmem or hugetlbfs given
> that the udmabuf driver is the only user.
Maybe add "Other files are rejected.". Wasn't clear to me before I
looked into the code.
>
> It must be noted that the pages associated with hugetlbfs files
> are expected to be found in the page cache. An error is returned
> if they are not found. However, shmem pages can be swapped in or
> allocated if they are not present in the page cache.
>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Gerd Hoffmann <kraxel@redhat.com>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Junxiao Chang <junxiao.chang@intel.com>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> ---
> include/linux/mm.h | 2 ++
> mm/gup.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 89 insertions(+)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index bf5d0b1b16f4..af2121fb8101 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2457,6 +2457,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> struct page **pages, unsigned int gup_flags);
> long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> struct page **pages, unsigned int gup_flags);
> +long pin_user_pages_fd(int fd, pgoff_t start, unsigned long nr_pages,
> + unsigned int gup_flags, struct page **pages);
>
> int get_user_pages_fast(unsigned long start, int nr_pages,
> unsigned int gup_flags, struct page **pages);
> diff --git a/mm/gup.c b/mm/gup.c
> index 2f8a2d89fde1..e34b77a15fa8 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -3400,3 +3400,90 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> &locked, gup_flags);
> }
> EXPORT_SYMBOL(pin_user_pages_unlocked);
> +
This does look quite neat, nice! Let's take a closer look ...
> +/**
> + * pin_user_pages_fd() - pin user pages associated with a file
> + * @fd: the fd whose pages are to be pinned
> + * @start: starting file offset
> + * @nr_pages: number of pages from start to pin
> + * @gup_flags: flags modifying pin behaviour
^ I assume we should drop that. At least for now the flags are
completely unused. And most likely we would want a different set of
flags later (GUPFD_ ...).
> + * @pages: array that receives pointers to the pages pinned.
> + * Should be at least nr_pages long.
> + *
> + * Attempt to pin (and migrate) pages associated with a file belonging to
I'd drop the "and migrate" part, it's more of an implementation detail.
> + * either shmem or hugetlbfs. An error is returned if pages associated with
> + * hugetlbfs files are not present in the page cache. However, shmem pages
> + * are swapped in or allocated if they are not present in the page cache.
Why don't we do the same for hugetlbfs? Would make the interface more
streamlined.
Certainly add that pinned pages have to be released using
unpin_user_pages().
> + *
> + * Returns number of pages pinned. This would be equal to the number of
> + * pages requested.
> + * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
> + * -errno.
> + */
> +long pin_user_pages_fd(int fd, pgoff_t start, unsigned long nr_pages,
> + unsigned int gup_flags, struct page **pages)
> +{
> + struct page *page;
> + struct file *filep;
> + unsigned int flags, i;
> + long ret;
> +
> + if (nr_pages <= 0)
> + return 0;
I think we should just forbid that and use a WARN_ON_ONCE() here /
return -EINVAL. So we'll never end up returning 0.
> + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
> + return 0;
> +
> + if (start < 0)
> + return -EINVAL;
> +
> + filep = fget(fd);
> + if (!filep)
> + return -EINVAL;
> +
> + if (!shmem_file(filep) && !is_file_hugepages(filep))
> + return -EINVAL;
> +
> + flags = memalloc_pin_save();
> + do {
> + for (i = 0; i < nr_pages; i++) {
> + if (shmem_mapping(filep->f_mapping)) {
> + page = shmem_read_mapping_page(filep->f_mapping,
> + start + i);
> + if (IS_ERR(page)) {
> + ret = PTR_ERR(page);
> + goto err;
> + }
> + } else {
> + page = find_get_page_flags(filep->f_mapping,
> + start + i,
> + FGP_ACCESSED);
> + if (!page) {
> + ret = -EINVAL;
> + goto err;
> + }
> + }
> + ret = try_grab_page(page, FOLL_PIN);
> + if (unlikely(ret))
> + goto err;
> +
> + pages[i] = page;
> + put_page(pages[i]);
> + }
> +
> + ret = check_and_migrate_movable_pages(nr_pages, pages);
> + } while (ret == -EAGAIN);
> +
> +err:
> + memalloc_pin_restore(flags);
> + fput(filep);
> + if (!ret)
> + return nr_pages;
> +
> + while (i > 0 && pages[--i]) {
> + unpin_user_page(pages[i]);
> + pages[i] = NULL;
If migrate_longterm_unpinnable_pages() failed, say with -ENOMEM, the
pages were already unpinned, but pages[i] has not been cleared, no?
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(pin_user_pages_fd);
> +
--
Cheers,
David / dhildenb
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v1 1/3] mm/gup: Introduce pin_user_pages_fd() for pinning shmem/hugetlbfs file pages
2023-10-06 8:03 ` David Hildenbrand
@ 2023-10-06 18:43 ` Jason Gunthorpe
2023-10-17 7:39 ` Kasireddy, Vivek
1 sibling, 0 replies; 8+ messages in thread
From: Jason Gunthorpe @ 2023-10-06 18:43 UTC (permalink / raw)
To: David Hildenbrand
Cc: Vivek Kasireddy, dri-devel, linux-mm, Mike Kravetz,
Daniel Vetter, Hugh Dickins, Peter Xu, Gerd Hoffmann,
Dongwon Kim, Junxiao Chang
On Fri, Oct 06, 2023 at 10:03:33AM +0200, David Hildenbrand wrote:
> > + *
> > + * Returns number of pages pinned. This would be equal to the number of
> > + * pages requested.
> > + * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
> > + * -errno.
> > + */
> > +long pin_user_pages_fd(int fd, pgoff_t start, unsigned long nr_pages,
> > + unsigned int gup_flags, struct page **pages)
> > +{
> > + struct page *page;
> > + struct file *filep;
> > + unsigned int flags, i;
> > + long ret;
> > +
> > + if (nr_pages <= 0)
> > + return 0;
>
> I think we should just forbid that and use a WARN_ON_ONCE() here / return
> -EINVAL. So we'll never end up returning 0.
Why is the check even needed? It looked like it just runs through
normally and returns 0, that is fine..
Jason
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [PATCH v1 1/3] mm/gup: Introduce pin_user_pages_fd() for pinning shmem/hugetlbfs file pages
2023-10-06 8:03 ` David Hildenbrand
2023-10-06 18:43 ` Jason Gunthorpe
@ 2023-10-17 7:39 ` Kasireddy, Vivek
1 sibling, 0 replies; 8+ messages in thread
From: Kasireddy, Vivek @ 2023-10-17 7:39 UTC (permalink / raw)
To: David Hildenbrand, dri-devel, linux-mm, Mike Kravetz
Cc: Daniel Vetter, Hugh Dickins, Peter Xu, Gerd Hoffmann, Kim,
Dongwon, Chang, Junxiao, Jason Gunthorpe
Hi David,
> > For drivers that would like to longterm-pin the pages associated
> > with a file, the pin_user_pages_fd() API provides an option to
> > not only FOLL_PIN the pages but also to check and migrate them
> > if they reside in movable zone or CMA block. For now, this API
> > can only work with files belonging to shmem or hugetlbfs given
> > that the udmabuf driver is the only user.
>
> Maybe add "Other files are rejected.". Wasn't clear to me before I
> looked into the code.
Ok, will add it in v2.
>
> >
> > It must be noted that the pages associated with hugetlbfs files
> > are expected to be found in the page cache. An error is returned
> > if they are not found. However, shmem pages can be swapped in or
> > allocated if they are not present in the page cache.
> >
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> > Cc: Mike Kravetz <mike.kravetz@oracle.com>
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Peter Xu <peterx@redhat.com>
> > Cc: Gerd Hoffmann <kraxel@redhat.com>
> > Cc: Dongwon Kim <dongwon.kim@intel.com>
> > Cc: Junxiao Chang <junxiao.chang@intel.com>
> > Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> > ---
> > include/linux/mm.h | 2 ++
> > mm/gup.c | 87
> ++++++++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 89 insertions(+)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index bf5d0b1b16f4..af2121fb8101 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -2457,6 +2457,8 @@ long get_user_pages_unlocked(unsigned long
> start, unsigned long nr_pages,
> > struct page **pages, unsigned int gup_flags);
> > long pin_user_pages_unlocked(unsigned long start, unsigned long
> nr_pages,
> > struct page **pages, unsigned int gup_flags);
> > +long pin_user_pages_fd(int fd, pgoff_t start, unsigned long nr_pages,
> > + unsigned int gup_flags, struct page **pages);
> >
> > int get_user_pages_fast(unsigned long start, int nr_pages,
> > unsigned int gup_flags, struct page **pages);
> > diff --git a/mm/gup.c b/mm/gup.c
> > index 2f8a2d89fde1..e34b77a15fa8 100644
> > --- a/mm/gup.c
> > +++ b/mm/gup.c
> > @@ -3400,3 +3400,90 @@ long pin_user_pages_unlocked(unsigned long
> start, unsigned long nr_pages,
> > &locked, gup_flags);
> > }
> > EXPORT_SYMBOL(pin_user_pages_unlocked);
> > +
>
> This does look quite neat, nice! Let's take a closer look ...
>
> > +/**
> > + * pin_user_pages_fd() - pin user pages associated with a file
> > + * @fd: the fd whose pages are to be pinned
> > + * @start: starting file offset
> > + * @nr_pages: number of pages from start to pin
> > + * @gup_flags: flags modifying pin behaviour
>
> ^ I assume we should drop that. At least for now the flags are
> completely unused. And most likely we would want a different set of
> flags later (GUPFD_ ...).
Right now, FOLL_LONGTERM is the only accepted value for gup_flags but
yes, as you suggest, this can be made implicit by dropping gup_flags.
>
> > + * @pages: array that receives pointers to the pages pinned.
> > + * Should be at least nr_pages long.
> > + *
> > + * Attempt to pin (and migrate) pages associated with a file belonging to
>
> I'd drop the "and migrate" part, it's more of an implementation detail.
>
> > + * either shmem or hugetlbfs. An error is returned if pages associated with
> > + * hugetlbfs files are not present in the page cache. However, shmem
> pages
> > + * are swapped in or allocated if they are not present in the page cache.
>
> Why don't we do the same for hugetlbfs? Would make the interface more
> streamlined.
I am going off of what Mike has stated previously:
"It may not matter to your users, but the semantics for hugetlb and shmem
pages is different. hugetlb requires the pages exist in the page cache
while shmem will create/add pages to the cache if necessary."
However, if we were to allocate a hugepage (assuming one is not present in the
page cache at a given index), what needs to be done in addition to calling these APIs?
folio = alloc_hugetlb_folio_nodemask(h, NUMA_NO_NODE, NULL, GFP_USER)
hugetlb_add_to_page_cache(folio, mapping, idx)
>
> Certainly add that pinned pages have to be released using
> unpin_user_pages().
Sure, will include that in v2.
>
> > + *
> > + * Returns number of pages pinned. This would be equal to the number of
> > + * pages requested.
> > + * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
> > + * -errno.
> > + */
> > +long pin_user_pages_fd(int fd, pgoff_t start, unsigned long nr_pages,
> > + unsigned int gup_flags, struct page **pages)
> > +{
> > + struct page *page;
> > + struct file *filep;
> > + unsigned int flags, i;
> > + long ret;
> > +
> > + if (nr_pages <= 0)
> > + return 0;
>
> I think we should just forbid that and use a WARN_ON_ONCE() here /
> return -EINVAL. So we'll never end up returning 0.
I think I'll drop this check in v2 as Jason suggested.
>
> > + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
> > + return 0;
> > +
> > + if (start < 0)
> > + return -EINVAL;
> > +
> > + filep = fget(fd);
> > + if (!filep)
> > + return -EINVAL;
> > +
> > + if (!shmem_file(filep) && !is_file_hugepages(filep))
> > + return -EINVAL;
> > +
> > + flags = memalloc_pin_save();
> > + do {
> > + for (i = 0; i < nr_pages; i++) {
> > + if (shmem_mapping(filep->f_mapping)) {
> > + page = shmem_read_mapping_page(filep-
> >f_mapping,
> > + start + i);
> > + if (IS_ERR(page)) {
> > + ret = PTR_ERR(page);
> > + goto err;
> > + }
> > + } else {
> > + page = find_get_page_flags(filep->f_mapping,
> > + start + i,
> > + FGP_ACCESSED);
> > + if (!page) {
> > + ret = -EINVAL;
> > + goto err;
> > + }
> > + }
> > + ret = try_grab_page(page, FOLL_PIN);
> > + if (unlikely(ret))
> > + goto err;
> > +
> > + pages[i] = page;
> > + put_page(pages[i]);
> > + }
> > +
> > + ret = check_and_migrate_movable_pages(nr_pages, pages);
> > + } while (ret == -EAGAIN);
> > +
> > +err:
> > + memalloc_pin_restore(flags);
> > + fput(filep);
> > + if (!ret)
> > + return nr_pages;
> > +
> > + while (i > 0 && pages[--i]) {
> > + unpin_user_page(pages[i]);
> > + pages[i] = NULL;
>
> If migrate_longterm_unpinnable_pages() failed, say with -ENOMEM, the
> pages were already unpinned, but pages[i] has not been cleared, no?
You are right; the above while should not be executed in that case. I added
this chunk to cleanup after any errors thrown in the for loop above. I guess
I need to add a new error label to cleanup after errors thrown by
check_and_migrate_movable_pages().
Thanks,
Vivek
>
> > + }
> > + return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(pin_user_pages_fd);
> > +
>
> --
> Cheers,
>
> David / dhildenb
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v1 1/3] mm/gup: Introduce pin_user_pages_fd() for pinning shmem/hugetlbfs file pages
2023-10-03 7:44 ` [PATCH v1 1/3] " Vivek Kasireddy
2023-10-06 8:03 ` David Hildenbrand
@ 2023-10-10 13:51 ` Jason Gunthorpe
1 sibling, 0 replies; 8+ messages in thread
From: Jason Gunthorpe @ 2023-10-10 13:51 UTC (permalink / raw)
To: Vivek Kasireddy
Cc: dri-devel, linux-mm, David Hildenbrand, Daniel Vetter,
Mike Kravetz, Hugh Dickins, Peter Xu, Gerd Hoffmann, Dongwon Kim,
Junxiao Chang
On Tue, Oct 03, 2023 at 12:44:45AM -0700, Vivek Kasireddy wrote:
> +/**
> + * pin_user_pages_fd() - pin user pages associated with a file
> + * @fd: the fd whose pages are to be pinned
> + * @start: starting file offset
> + * @nr_pages: number of pages from start to pin
> + * @gup_flags: flags modifying pin behaviour
> + * @pages: array that receives pointers to the pages pinned.
> + * Should be at least nr_pages long.
> + *
> + * Attempt to pin (and migrate) pages associated with a file belonging to
> + * either shmem or hugetlbfs. An error is returned if pages associated with
> + * hugetlbfs files are not present in the page cache. However, shmem pages
> + * are swapped in or allocated if they are not present in the page cache.
> + *
> + * Returns number of pages pinned. This would be equal to the number of
> + * pages requested.
> + * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
> + * -errno.
> + */
> +long pin_user_pages_fd(int fd, pgoff_t start, unsigned long nr_pages,
> + unsigned int gup_flags, struct page **pages)
> +{
> + struct page *page;
> + struct file *filep;
> + unsigned int flags, i;
> + long ret;
> +
> + if (nr_pages <= 0)
> + return 0;
> + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
> + return 0;
> +
> + if (start < 0)
> + return -EINVAL;
> +
> + filep = fget(fd);
> + if (!filep)
> + return -EINVAL;
I think the caller should pass in the file *
In some cases we will need to hold a reference on it for a long time.
> + if (!shmem_file(filep) && !is_file_hugepages(filep))
> + return -EINVAL;
> +
> + flags = memalloc_pin_save();
> + do {
> + for (i = 0; i < nr_pages; i++) {
> + if (shmem_mapping(filep->f_mapping)) {
> + page = shmem_read_mapping_page(filep->f_mapping,
> + start + i);
> + if (IS_ERR(page)) {
> + ret = PTR_ERR(page);
> + goto err;
> + }
> + } else {
> + page = find_get_page_flags(filep->f_mapping,
> + start + i,
> + FGP_ACCESSED);
> + if (!page) {
> + ret = -EINVAL;
> + goto err;
> + }
I don't know these APIs at all, but I admit to being surprised we need
the special case for shmem ?
> + ret = try_grab_page(page, FOLL_PIN);
> + if (unlikely(ret))
> + goto err;
> +
> + pages[i] = page;
> + put_page(pages[i]);
> + }
> +
> + ret = check_and_migrate_movable_pages(nr_pages, pages);
> + } while (ret == -EAGAIN);
It seems OK, but I do wish it was faster :) Maybe for another day.
Jason
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH v1 2/3] udmabuf: Pin the pages using pin_user_pages_fd() API
2023-10-03 7:44 [PATCH v1 0/3] mm/gup: Introduce pin_user_pages_fd() for pinning shmem/hugetlbfs file pages Vivek Kasireddy
2023-10-03 7:44 ` [PATCH v1 1/3] " Vivek Kasireddy
@ 2023-10-03 7:44 ` Vivek Kasireddy
2023-10-03 7:44 ` [PATCH v1 3/3] selftests/dma-buf/udmabuf: Add tests to verify data after page migration Vivek Kasireddy
2 siblings, 0 replies; 8+ messages in thread
From: Vivek Kasireddy @ 2023-10-03 7:44 UTC (permalink / raw)
To: dri-devel, linux-mm
Cc: Vivek Kasireddy, David Hildenbrand, Daniel Vetter, Mike Kravetz,
Hugh Dickins, Peter Xu, Jason Gunthorpe, Gerd Hoffmann,
Dongwon Kim, Junxiao Chang
Using pin_user_pages_fd() will ensure that the pages are pinned
correctly using FOLL_PIN. And, this also ensures that we don't
accidentally break features such as memory hotunplug as it would
not allow pinning pages in the movable zone.
This patch also adds back support for mapping hugetlbfs pages
by noting the subpage offsets within the huge pages and uses
this information while populating the scatterlist.
Cc: David Hildenbrand <david@redhat.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Cc: Junxiao Chang <junxiao.chang@intel.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
---
drivers/dma-buf/udmabuf.c | 82 +++++++++++++++++++++++++++++----------
1 file changed, 61 insertions(+), 21 deletions(-)
diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
index 820c993c8659..9ef1eaf4df4b 100644
--- a/drivers/dma-buf/udmabuf.c
+++ b/drivers/dma-buf/udmabuf.c
@@ -10,6 +10,7 @@
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/shmem_fs.h>
+#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/udmabuf.h>
#include <linux/vmalloc.h>
@@ -28,6 +29,7 @@ struct udmabuf {
struct page **pages;
struct sg_table *sg;
struct miscdevice *device;
+ pgoff_t *subpgoff;
};
static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
@@ -90,23 +92,31 @@ static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf,
{
struct udmabuf *ubuf = buf->priv;
struct sg_table *sg;
+ struct scatterlist *sgl;
+ pgoff_t offset;
+ unsigned long i = 0;
int ret;
sg = kzalloc(sizeof(*sg), GFP_KERNEL);
if (!sg)
return ERR_PTR(-ENOMEM);
- ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount,
- 0, ubuf->pagecount << PAGE_SHIFT,
- GFP_KERNEL);
+
+ ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
if (ret < 0)
- goto err;
+ goto err_alloc;
+
+ for_each_sg(sg->sgl, sgl, ubuf->pagecount, i) {
+ offset = ubuf->subpgoff ? ubuf->subpgoff[i] : 0;
+ sg_set_page(sgl, ubuf->pages[i], PAGE_SIZE, offset);
+ }
ret = dma_map_sgtable(dev, sg, direction, 0);
if (ret < 0)
- goto err;
+ goto err_map;
return sg;
-err:
+err_map:
sg_free_table(sg);
+err_alloc:
kfree(sg);
return ERR_PTR(ret);
}
@@ -142,7 +152,9 @@ static void release_udmabuf(struct dma_buf *buf)
put_sg_table(dev, ubuf->sg, DMA_BIDIRECTIONAL);
for (pg = 0; pg < ubuf->pagecount; pg++)
- put_page(ubuf->pages[pg]);
+ unpin_user_page(ubuf->pages[pg]);
+
+ kfree(ubuf->subpgoff);
kfree(ubuf->pages);
kfree(ubuf);
}
@@ -202,12 +214,13 @@ static long udmabuf_create(struct miscdevice *device,
{
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
struct file *memfd = NULL;
- struct address_space *mapping = NULL;
struct udmabuf *ubuf;
struct dma_buf *buf;
- pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
- struct page *page;
- int seals, ret = -EINVAL;
+ pgoff_t pgoff, pgcnt, pgbuf = 0, pglimit, nr_pages;
+ pgoff_t subpgoff, maxsubpgs;
+ struct hstate *hpstate;
+ long ret = -EINVAL;
+ int seals;
u32 i, flags;
ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL);
@@ -241,8 +254,7 @@ static long udmabuf_create(struct miscdevice *device,
memfd = fget(list[i].memfd);
if (!memfd)
goto err;
- mapping = memfd->f_mapping;
- if (!shmem_mapping(mapping))
+ if (!shmem_file(memfd) && !is_file_hugepages(memfd))
goto err;
seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
if (seals == -EINVAL)
@@ -253,14 +265,41 @@ static long udmabuf_create(struct miscdevice *device,
goto err;
pgoff = list[i].offset >> PAGE_SHIFT;
pgcnt = list[i].size >> PAGE_SHIFT;
- for (pgidx = 0; pgidx < pgcnt; pgidx++) {
- page = shmem_read_mapping_page(mapping, pgoff + pgidx);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
+ if (is_file_hugepages(memfd)) {
+ if (!ubuf->subpgoff) {
+ ubuf->subpgoff = kmalloc_array(ubuf->pagecount,
+ sizeof(*ubuf->subpgoff),
+ GFP_KERNEL);
+ if (!ubuf->subpgoff) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+ hpstate = hstate_file(memfd);
+ pgoff = list[i].offset >> huge_page_shift(hpstate);
+ subpgoff = (list[i].offset &
+ ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
+ maxsubpgs = huge_page_size(hpstate) >> PAGE_SHIFT;
+ }
+
+ do {
+ nr_pages = shmem_file(memfd) ? pgcnt : 1;
+ ret = pin_user_pages_fd(list[i].memfd, pgoff,
+ nr_pages, FOLL_LONGTERM,
+ ubuf->pages + pgbuf);
+ if (ret < 0)
goto err;
+
+ if (is_file_hugepages(memfd)) {
+ ubuf->subpgoff[pgbuf] = subpgoff << PAGE_SHIFT;
+ if (++subpgoff == maxsubpgs) {
+ subpgoff = 0;
+ pgoff++;
+ }
}
- ubuf->pages[pgbuf++] = page;
- }
+ pgbuf += nr_pages;
+ pgcnt -= nr_pages;
+ } while (pgcnt > 0);
fput(memfd);
memfd = NULL;
}
@@ -283,10 +322,11 @@ static long udmabuf_create(struct miscdevice *device,
return dma_buf_fd(buf, flags);
err:
- while (pgbuf > 0)
- put_page(ubuf->pages[--pgbuf]);
+ while (pgbuf > 0 && ubuf->pages[--pgbuf])
+ unpin_user_page(ubuf->pages[pgbuf]);
if (memfd)
fput(memfd);
+ kfree(ubuf->subpgoff);
kfree(ubuf->pages);
kfree(ubuf);
return ret;
--
2.39.2
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH v1 3/3] selftests/dma-buf/udmabuf: Add tests to verify data after page migration
2023-10-03 7:44 [PATCH v1 0/3] mm/gup: Introduce pin_user_pages_fd() for pinning shmem/hugetlbfs file pages Vivek Kasireddy
2023-10-03 7:44 ` [PATCH v1 1/3] " Vivek Kasireddy
2023-10-03 7:44 ` [PATCH v1 2/3] udmabuf: Pin the pages using pin_user_pages_fd() API Vivek Kasireddy
@ 2023-10-03 7:44 ` Vivek Kasireddy
2 siblings, 0 replies; 8+ messages in thread
From: Vivek Kasireddy @ 2023-10-03 7:44 UTC (permalink / raw)
To: dri-devel, linux-mm
Cc: Vivek Kasireddy, Shuah Khan, David Hildenbrand, Daniel Vetter,
Mike Kravetz, Hugh Dickins, Peter Xu, Jason Gunthorpe,
Gerd Hoffmann, Dongwon Kim, Junxiao Chang
Since the memfd pages associated with a udmabuf may be migrated
as part of udmabuf create, we need to verify the data coherency
after successful migration. The new tests added in this patch try
to do just that using 4k sized pages and also 2 MB sized huge
pages for the memfd.
Successful completion of the tests would mean that there is no
disconnect between the memfd pages and the ones associated with
a udmabuf. And, these tests can also be augmented in the future
to test newer udmabuf features (such as handling memfd hole punch).
Cc: Shuah Khan <shuah@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Cc: Junxiao Chang <junxiao.chang@intel.com>
Based-on-patch-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
---
.../selftests/drivers/dma-buf/udmabuf.c | 151 +++++++++++++++++-
1 file changed, 147 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/drivers/dma-buf/udmabuf.c b/tools/testing/selftests/drivers/dma-buf/udmabuf.c
index c812080e304e..d76c813fe652 100644
--- a/tools/testing/selftests/drivers/dma-buf/udmabuf.c
+++ b/tools/testing/selftests/drivers/dma-buf/udmabuf.c
@@ -9,26 +9,132 @@
#include <errno.h>
#include <fcntl.h>
#include <malloc.h>
+#include <stdbool.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
+#include <sys/mman.h>
#include <linux/memfd.h>
#include <linux/udmabuf.h>
#define TEST_PREFIX "drivers/dma-buf/udmabuf"
#define NUM_PAGES 4
+#define NUM_ENTRIES 4
+#define MEMFD_SIZE 1024 /* in pages */
-static int memfd_create(const char *name, unsigned int flags)
+static unsigned int page_size;
+
+static int create_memfd_with_seals(off64_t size, bool hpage)
+{
+ int memfd, ret;
+ unsigned int flags = MFD_ALLOW_SEALING;
+
+ if (hpage)
+ flags |= MFD_HUGETLB;
+
+ memfd = memfd_create("udmabuf-test", flags);
+ if (memfd < 0) {
+ printf("%s: [skip,no-memfd]\n", TEST_PREFIX);
+ exit(77);
+ }
+
+ ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);
+ if (ret < 0) {
+ printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
+ exit(77);
+ }
+
+ ret = ftruncate(memfd, size);
+ if (ret == -1) {
+ printf("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
+ exit(1);
+ }
+
+ return memfd;
+}
+
+static int create_udmabuf_list(int devfd, int memfd, off64_t memfd_size)
+{
+ struct udmabuf_create_list *list;
+ int ubuf_fd, i;
+
+ list = malloc(sizeof(struct udmabuf_create_list) +
+ sizeof(struct udmabuf_create_item) * NUM_ENTRIES);
+ if (!list) {
+ printf("%s: [FAIL, udmabuf-malloc]\n", TEST_PREFIX);
+ exit(1);
+ }
+
+ for (i = 0; i < NUM_ENTRIES; i++) {
+ list->list[i].memfd = memfd;
+ list->list[i].offset = i * (memfd_size / NUM_ENTRIES);
+ list->list[i].size = getpagesize() * NUM_PAGES;
+ }
+
+ list->count = NUM_ENTRIES;
+ list->flags = UDMABUF_FLAGS_CLOEXEC;
+ ubuf_fd = ioctl(devfd, UDMABUF_CREATE_LIST, list);
+ free(list);
+ if (ubuf_fd < 0) {
+ printf("%s: [FAIL, udmabuf-create]\n", TEST_PREFIX);
+ exit(1);
+ }
+
+ return ubuf_fd;
+}
+
+static void write_to_memfd(void *addr, off64_t size, char chr)
+{
+ int i;
+
+ for (i = 0; i < size / page_size; i++) {
+ *((char *)addr + (i * page_size)) = chr;
+ }
+}
+
+static void *mmap_fd(int fd, off64_t size)
{
- return syscall(__NR_memfd_create, name, flags);
+ void *addr;
+
+ addr = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ if (addr == MAP_FAILED) {
+ printf("%s: ubuf_fd mmap fail\n", TEST_PREFIX);
+ exit(1);
+ }
+
+ return addr;
+}
+
+static int compare_chunks(void *addr1, void *addr2, off64_t memfd_size)
+{
+ off64_t off;
+ int i = 0, j, k = 0, ret = 0;
+ char char1, char2;
+
+ while (i < NUM_ENTRIES) {
+ off = i * (memfd_size / NUM_ENTRIES);
+ for (j = 0; j < NUM_PAGES; j++, k++) {
+ char1 = *((char *)addr1 + off + (j * getpagesize()));
+ char2 = *((char *)addr2 + (k * getpagesize()));
+ if (char1 != char2) {
+ ret = -1;
+ goto err;
+ }
+ }
+ i++;
+ }
+err:
+ munmap(addr1, memfd_size);
+ munmap(addr2, NUM_ENTRIES * NUM_PAGES * getpagesize());
+ return ret;
}
int main(int argc, char *argv[])
{
struct udmabuf_create create;
int devfd, memfd, buf, ret;
- off_t size;
- void *mem;
+ off64_t size;
+ void *addr1, *addr2;
devfd = open("/dev/udmabuf", O_RDWR);
if (devfd < 0) {
@@ -90,6 +196,9 @@ int main(int argc, char *argv[])
}
/* should work */
+ page_size = getpagesize();
+ addr1 = mmap_fd(memfd, size);
+ write_to_memfd(addr1, size, 'a');
create.memfd = memfd;
create.offset = 0;
create.size = size;
@@ -98,6 +207,40 @@ int main(int argc, char *argv[])
printf("%s: [FAIL,test-4]\n", TEST_PREFIX);
exit(1);
}
+ munmap(addr1, size);
+ close(buf);
+ close(memfd);
+
+ /* should work (migration of 4k size pages)*/
+ size = MEMFD_SIZE * page_size;
+ memfd = create_memfd_with_seals(size, false);
+ addr1 = mmap_fd(memfd, size);
+ write_to_memfd(addr1, size, 'a');
+ buf = create_udmabuf_list(devfd, memfd, size);
+ addr2 = mmap_fd(buf, NUM_PAGES * NUM_ENTRIES * getpagesize());
+ write_to_memfd(addr1, size, 'b');
+ ret = compare_chunks(addr1, addr2, size);
+ if (ret < 0) {
+ printf("%s: [FAIL,test-5]\n", TEST_PREFIX);
+ exit(1);
+ }
+ close(buf);
+ close(memfd);
+
+ /* should work (migration of 2MB size huge pages)*/
+ page_size = getpagesize() * 512; /* 2 MB */
+ size = MEMFD_SIZE * page_size;
+ memfd = create_memfd_with_seals(size, true);
+ addr1 = mmap_fd(memfd, size);
+ write_to_memfd(addr1, size, 'a');
+ buf = create_udmabuf_list(devfd, memfd, size);
+ addr2 = mmap_fd(buf, NUM_PAGES * NUM_ENTRIES * getpagesize());
+ write_to_memfd(addr1, size, 'b');
+ ret = compare_chunks(addr1, addr2, size);
+ if (ret < 0) {
+ printf("%s: [FAIL,test-6]\n", TEST_PREFIX);
+ exit(1);
+ }
fprintf(stderr, "%s: ok\n", TEST_PREFIX);
close(buf);
--
2.39.2
^ permalink raw reply [flat|nested] 8+ messages in thread