linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Mina Almasry <almasrymina@google.com>
To: Mike Kravetz <mike.kravetz@oracle.com>,
	Andrew Morton <akpm@linux-foundation.org>
Cc: Wei Xu <weixugc@google.com>,
	James Houghton <jthoughton@google.com>,
	linux-mm@kvack.org,  linux-kernel@vger.kernel.org
Subject: Re: [PATCH v1] hugetlb, userfaultfd: Fix reservation restore on userfaultfd error
Date: Tue, 16 Nov 2021 16:00:35 -0800	[thread overview]
Message-ID: <CAHS8izM0EjNmyc25fOqsU03G84+nGeyWoXo+oJWC5TQx52OH9A@mail.gmail.com> (raw)
In-Reply-To: <20211116235733.3774702-1-almasrymina@google.com>

[-- Attachment #1: Type: text/plain, Size: 2218 bytes --]

Hi Mike,

On Tue, Nov 16, 2021 at 3:57 PM Mina Almasry <almasrymina@google.com> wrote:
>
> Currently in the is_continue case in hugetlb_mcopy_atomic_pte(), if we
> bail out using "goto out_release_unlock;" in the cases where idx >=
> size, or !huge_pte_none(), the code will detect that new_pagecache_page
> == false, and so call restore_reserve_on_error().
> In this case I see restore_reserve_on_error() delete the reservation,
> and the following call to remove_inode_hugepages() will increment
> h->resv_hugepages causing a 100% reproducible leak.
>

Attached is the .c file with the 100% repro.

> We should treat the is_continue case similar to adding a page into the
> pagecache and set new_pagecache_page to true, to indicate that there is
> no reservation to restore on the error path, and we need not call
> restore_reserve_on_error().
>
> Cc: Wei Xu <weixugc@google.com>
>
> Fixes: c7b1850dfb41 ("hugetlb: don't pass page cache pages to restore_reserve_on_error")
> Signed-off-by: Mina Almasry <almasrymina@google.com>
> Reported-by: James Houghton <jthoughton@google.com>

Not sure if this is a Cc: stable issue. If it is, I can add in v2.

> ---
>  mm/hugetlb.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e09159c957e3..25a7a3d84607 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5741,6 +5741,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>                 page = find_lock_page(mapping, idx);
>                 if (!page)
>                         goto out;
> +               /*
> +                * Set new_pagecache_page to true, as we've added a page to the
> +                * pagecache, but userfaultfd hasn't set up a mapping for this
> +                * page yet. If we bail out before setting up the mapping, we
> +                * want to indicate to restore_reserve_on_error() that we've
> +                * added the page to the page cache.
> +                */
> +               new_pagecache_page = true;
>         } else if (!*pagep) {
>                 /* If a page already exists, then it's UFFDIO_COPY for
>                  * a non-missing case. Return -EEXIST.
> --
> 2.34.0.rc1.387.gb447b232ab-goog

[-- Attachment #2: hugetlb-leak-test.c --]
[-- Type: text/x-csrc, Size: 2964 bytes --]

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

#ifndef UFFD_FEATURE_MINOR_HUGETLBFS
#define UFFD_FEATURE_MINOR_HUGETLBFS (1 << 9)
#endif

#ifndef UFFDIO_REGISTER_MODE_MINOR
#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1 << 2)
#endif

#ifndef UFFDIO_CONTINUE_MODE_DONTWAKE
#define UFFDIO_CONTINUE_MODE_DONTWAKE ((uint64_t)1 << 0)
#endif

struct uffdio_continue {
	struct uffdio_range range;
	uint64_t mode;
	int64_t bytes_mapped;
};


#ifndef UFFDIO_CONTINUE
#define _UFFDIO_CONTINUE (0x07)
#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, struct uffdio_continue)
#endif

#ifndef UFFD_PAGEFAULT_FLAG_MINOR
#define UFFD_PAGEFAULT_FLAG_MINOR (1 << 2)
#endif

int userfaultfd(int mode) {
	return syscall(__NR_userfaultfd, mode);
}

int main() {

	int fd = open("/mnt/huge/test", O_RDWR | O_CREAT | S_IRUSR | S_IWUSR);
	if (fd == -1) {
		perror("opening tmpfile in hugetlbfs-mount failed");
		return -1;
	}

	const size_t len_hugepage = 512 * 4096; // 2MB
	const size_t len = 2 * len_hugepage;
	if (ftruncate(fd, len) < 0) {
		perror("ftruncate failed");
		return -1;
	}
	int uffd = userfaultfd(O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		perror("uffd not created");
		return -1;
	}

	char *hva = (char*)(mmap(NULL, len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
	if (hva == MAP_FAILED) {
		perror("mmap hva failed");
		return -1;
	}

	char *primary = (char*)(mmap(hva, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
	if (primary == MAP_FAILED) {
		perror("mmap primary failed");
		return -1;
	}

	char *secondary = (char*)(mmap(hva, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
	if (primary == MAP_FAILED) {
		perror("mmap secondary failed");
		return -1;
	}

	struct uffdio_api api;
	api.api = UFFD_API;
	api.features = UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MISSING_HUGETLBFS;
	if (ioctl(uffd, UFFDIO_API, &api) == -1) {
		perror("UFFDIO_API failed");
		return -1;
	}

	memset(primary + len_hugepage, 0, len_hugepage);

	struct uffdio_register reg;
	reg.range.start = (unsigned long)primary;
	reg.range.len = len;
	reg.mode = UFFDIO_REGISTER_MODE_MINOR | UFFDIO_REGISTER_MODE_MISSING;
	reg.ioctls = 0;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1) {
		perror("register failed");
		return -1;
	}

	memset(secondary, 0, len);

	struct uffdio_continue cont = {
		.range = (struct uffdio_range) {
			.start = (uint64_t)primary,
			.len = len,
		},
		.mode = 0,
		.bytes_mapped = 0,
	};
	if (ioctl(uffd, UFFDIO_CONTINUE, &cont) < 0) {
		perror("UFFDIO_CONTINUE failed");
		printf("Mapped %d bytes\n", cont.bytes_mapped);
	}

	close(uffd);
	munmap(secondary, len);
	munmap(primary, len);
	munmap(hva, len);

	close(fd);

	// After this unlink, a hugepage will be leaked.
	if (unlink("/mnt/huge/test") < 0) {
		perror("unlink failed");
	}

	return 0;
}

  reply	other threads:[~2021-11-17  0:01 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-16 23:57 Mina Almasry
2021-11-17  0:00 ` Mina Almasry [this message]
2021-11-17  0:32 ` Andrew Morton
2021-11-17  0:58 ` Mike Kravetz

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAHS8izM0EjNmyc25fOqsU03G84+nGeyWoXo+oJWC5TQx52OH9A@mail.gmail.com \
    --to=almasrymina@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=jthoughton@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mike.kravetz@oracle.com \
    --cc=weixugc@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox