Re: [PATCH] mm/khugepaged: skip shmem with armed userfaultfd

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Yang Shi <shy828301@gmail.com>
To: David Stevens <stevensd@chromium.org>,
	Peter Xu <peterx@redhat.com>,
	 David Hildenbrand <david@redhat.com>,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: linux-mm@kvack.org, Andrew Morton <akpm@linux-foundation.org>,
	 linux-kernel@vger.kernel.org
Subject: Re: [PATCH] mm/khugepaged: skip shmem with armed userfaultfd
Date: Wed, 1 Feb 2023 09:36:37 -0800	[thread overview]
Message-ID: <CAHbLzkpbV2LOoTpWwSOS+UUsYJiZX4vO78jZSr6xmpAGNGoH5w@mail.gmail.com> (raw)
In-Reply-To: <20230201034137.2463113-1-stevensd@google.com>

On Tue, Jan 31, 2023 at 7:42 PM David Stevens <stevensd@chromium.org> wrote:
>
> From: David Stevens <stevensd@chromium.org>
>
> Collapsing memory in a vma that has an armed userfaultfd results in
> zero-filling any missing pages, which breaks user-space paging for those
> filled pages. Avoid khugepage bypassing userfaultfd by not collapsing
> pages in shmem reached via scanning a vma with an armed userfaultfd if
> doing so would zero-fill any pages.
>
> Signed-off-by: David Stevens <stevensd@chromium.org>
> ---
>  mm/khugepaged.c | 35 ++++++++++++++++++++++++-----------
>  1 file changed, 24 insertions(+), 11 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 79be13133322..48e944fb8972 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1736,8 +1736,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
>   *    + restore gaps in the page cache;
>   *    + unlock and free huge page;
>   */
> -static int collapse_file(struct mm_struct *mm, unsigned long addr,
> -                        struct file *file, pgoff_t start,
> +static int collapse_file(struct mm_struct *mm, struct vm_area_struct *vma,
> +                        unsigned long addr, struct file *file, pgoff_t start,
>                          struct collapse_control *cc)
>  {
>         struct address_space *mapping = file->f_mapping;
> @@ -1784,6 +1784,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>          * be able to map it or use it in another way until we unlock it.
>          */
>
> +       if (is_shmem)
> +               mmap_read_lock(mm);

If you release mmap_lock before then reacquire it here, the vma is not
trusted anymore. It is not safe to use the vma anymore.

Since you already read uffd_was_armed before releasing mmap_lock, so
you could pass it directly to collapse_file w/o dereferencing vma
again. The problem may be false positive (not userfaultfd armed
anymore), but it should be fine. Khugepaged could collapse this area
in the next round.

Also +userfaultfd folks.

> +
>         xas_set(&xas, start);
>         for (index = start; index < end; index++) {
>                 struct page *page = xas_next(&xas);
> @@ -1792,6 +1795,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>                 VM_BUG_ON(index != xas.xa_index);
>                 if (is_shmem) {
>                         if (!page) {
> +                               if (userfaultfd_armed(vma)) {
> +                                       result = SCAN_EXCEED_NONE_PTE;
> +                                       goto xa_locked;
> +                               }
>                                 /*
>                                  * Stop if extent has been truncated or
>                                  * hole-punched, and is now completely
> @@ -2095,6 +2102,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>                 hpage->mapping = NULL;
>         }
>
> +       if (is_shmem)
> +               mmap_read_unlock(mm);
>         if (hpage)
>                 unlock_page(hpage);
>  out:
> @@ -2108,8 +2117,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>         return result;
>  }
>
> -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> -                                   struct file *file, pgoff_t start,
> +static int hpage_collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma,
> +                                   unsigned long addr, struct file *file, pgoff_t start,
>                                     struct collapse_control *cc)
>  {
>         struct page *page = NULL;
> @@ -2118,6 +2127,9 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
>         int present, swap;
>         int node = NUMA_NO_NODE;
>         int result = SCAN_SUCCEED;
> +       bool uffd_was_armed = userfaultfd_armed(vma);
> +
> +       mmap_read_unlock(mm);
>
>         present = 0;
>         swap = 0;
> @@ -2193,13 +2205,16 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
>         }
>         rcu_read_unlock();
>
> +       if (uffd_was_armed && present < HPAGE_PMD_NR)
> +               result = SCAN_EXCEED_SWAP_PTE;
> +
>         if (result == SCAN_SUCCEED) {
>                 if (cc->is_khugepaged &&
>                     present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
>                         result = SCAN_EXCEED_NONE_PTE;
>                         count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
>                 } else {
> -                       result = collapse_file(mm, addr, file, start, cc);
> +                       result = collapse_file(mm, vma, addr, file, start, cc);
>                 }
>         }
>
> @@ -2207,8 +2222,8 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
>         return result;
>  }
>  #else
> -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> -                                   struct file *file, pgoff_t start,
> +static int hpage_collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma,
> +                                   unsigned long addr, struct file *file, pgoff_t start,
>                                     struct collapse_control *cc)
>  {
>         BUILD_BUG();
> @@ -2304,8 +2319,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
>                                 pgoff_t pgoff = linear_page_index(vma,
>                                                 khugepaged_scan.address);
>
> -                               mmap_read_unlock(mm);
> -                               *result = hpage_collapse_scan_file(mm,
> +                               *result = hpage_collapse_scan_file(mm, vma,
>                                                                    khugepaged_scan.address,
>                                                                    file, pgoff, cc);
>                                 mmap_locked = false;
> @@ -2656,9 +2670,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
>                         struct file *file = get_file(vma->vm_file);
>                         pgoff_t pgoff = linear_page_index(vma, addr);
>
> -                       mmap_read_unlock(mm);
>                         mmap_locked = false;
> -                       result = hpage_collapse_scan_file(mm, addr, file, pgoff,
> +                       result = hpage_collapse_scan_file(mm, vma, addr, file, pgoff,
>                                                           cc);
>                         fput(file);
>                 } else {
> --
> 2.39.1.456.gfc5497dd1b-goog
>
>

next prev parent reply	other threads:[~2023-02-01 17:36 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-02-01  3:41 David Stevens
2023-02-01 17:36 ` Yang Shi [this message]
2023-02-01 20:52   ` Peter Xu
2023-02-01 23:57     ` Yang Shi
2023-02-02 20:04       ` Peter Xu
2023-02-02 21:11         ` Yang Shi
2023-02-02  9:56     ` David Stevens
2023-02-02 17:40       ` Yang Shi
2023-02-02 20:22         ` Peter Xu
2023-02-03  6:09           ` David Stevens
2023-02-03 14:56             ` Peter Xu
2023-02-01 23:09 ` Kirill A. Shutemov
2023-02-02  9:30   ` David Stevens

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAHbLzkpbV2LOoTpWwSOS+UUsYJiZX4vO78jZSr6xmpAGNGoH5w@mail.gmail.com \
    --to=shy828301@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=david@redhat.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=peterx@redhat.com \
    --cc=stevensd@chromium.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox