linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment
@ 2025-01-31 14:37 Ricardo Cañuelo Navarro
  2025-01-31 14:58 ` Ricardo Cañuelo Navarro
  2025-01-31 22:23 ` Andrew Morton
  0 siblings, 2 replies; 4+ messages in thread
From: Ricardo Cañuelo Navarro @ 2025-01-31 14:37 UTC (permalink / raw)
  To: akpm, riel; +Cc: linux-mm, stable, kernel-dev, revest

Add a sanity check to madvise_dontneed_free() to address a corner case
in madvise where a race condition causes the current vma being processed
to be backed by a different page size.

During a madvise(MADV_DONTNEED) call on a memory region registered with
a userfaultfd, there's a period of time where the process mm lock is
temporarily released in order to send a UFFD_EVENT_REMOVE and let
userspace handle the event. During this time, the vma covering the
current address range may change due to an explicit mmap done
concurrently by another thread.

If, after that change, the memory region, which was originally backed by
4KB pages, is now backed by hugepages, the end address is rounded down
to a hugepage boundary to avoid data loss (see "Fixes" below). This
rounding may cause the end address to be truncated to the same address
as the start.

Make this corner case follow the same semantics as in other similar
cases where the requested region has zero length (i.e. return 0).

This will make madvise_walk_vmas() continue to the next vma in the
range (this time holding the process mm lock) which, due to the prev
pointer becoming stale because of the vma change, will be the same
hugepage-backed vma that was just checked before. The next time
madvise_dontneed_free() runs for this vma, if the start address isn't
aligned to a hugepage boundary, it'll return -EINVAL, which is also in
line with the madvise api.

From userspace perspective, madvise() will return EINVAL because the
start address isn't aligned according to the new vma alignment
requirements (hugepage), even though it was correctly page-aligned when
the call was issued.

Fixes: 8ebe0a5eaaeb ("mm,madvise,hugetlb: fix unexpected data loss with MADV_DONTNEED on hugetlbfs")
Cc: stable@vger.kernel.org
Signed-off-by: Ricardo Cañuelo Navarro <rcn@igalia.com>
---
 mm/madvise.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 49f3a75046f6..c8e28d51978a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -933,7 +933,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 			 */
 			end = vma->vm_end;
 		}
-		VM_WARN_ON(start >= end);
+		if (start == end)
+			return 0;
+		VM_WARN_ON(start > end);
 	}
 
 	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
-- 
2.48.1



^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment
  2025-01-31 14:37 [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment Ricardo Cañuelo Navarro
@ 2025-01-31 14:58 ` Ricardo Cañuelo Navarro
  2025-01-31 22:23 ` Andrew Morton
  1 sibling, 0 replies; 4+ messages in thread
From: Ricardo Cañuelo Navarro @ 2025-01-31 14:58 UTC (permalink / raw)
  To: Ricardo Cañuelo Navarro
  Cc: akpm, riel, linux-mm, stable, kernel-dev, revest

Hi all,

Some more context about the patch. The issue (WARNING in
madvise_vma_behavior) was found by a private syzbot instance, so I can't
share the link, but it can be triggered by an unprivileged user with
this reproducer:

--8<------------------------------------------------------------------

#define _GNU_SOURCE

#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

#include <linux/futex.h>

#ifndef __NR_userfaultfd
#define __NR_userfaultfd 323
#endif

/* Block the calling thread for roughly `ms` milliseconds. */
static void sleep_ms(uint64_t ms)
{
  const uint64_t usec_per_msec = 1000;
  usleep(usec_per_msec * ms);
}

/*
 * Return a monotonic timestamp in milliseconds. Exits the process if
 * clock_gettime() fails (it should not with CLOCK_MONOTONIC).
 */
static uint64_t current_time_ms(void)
{
  struct timespec now;
  if (clock_gettime(CLOCK_MONOTONIC, &now) != 0)
    exit(1);
  return (uint64_t)now.tv_sec * 1000 + (uint64_t)now.tv_nsec / 1000000;
}

/*
 * Spawn a worker thread running fn(arg) on a 128 KiB stack, retrying up
 * to 100 times on transient EAGAIN. The pthread_t handle is discarded;
 * threads are never joined. Exits the process on persistent failure.
 */
static void thread_start(void* (*fn)(void*), void* arg)
{
  pthread_t th;
  pthread_attr_t attr;
  pthread_attr_init(&attr);
  pthread_attr_setstacksize(&attr, 128 << 10);
  for (int i = 0; i < 100; i++) {
    /*
     * POSIX pthread_create() reports failure through its return value
     * and does not set errno; the original code inspected errno here,
     * which may hold a stale value from an unrelated earlier call.
     */
    int err = pthread_create(&th, &attr, fn, arg);
    if (err == 0) {
      pthread_attr_destroy(&attr);
      return;
    }
    if (err == EAGAIN) {
      /* Transient resource shortage: back off briefly and retry. */
      usleep(50);
      continue;
    }
    break;
  }
  pthread_attr_destroy(&attr);
  exit(1);
}

/* Minimal futex-backed binary event used for thread handshaking. */
typedef struct {
  int state; /* 0 = clear, 1 = signaled */
} event_t;

/* Put the event into the clear state. Not thread-safe against waiters. */
static void event_init(event_t* e)
{
  e->state = 0;
}

/* Clear a previously signaled event so it can be reused. */
static void event_reset(event_t* e)
{
  e->state = 0;
}

/* Signal the event and wake any futex waiters. Double-set is a fatal
 * caller bug. */
static void event_set(event_t* e)
{
  if (e->state)
    exit(1);
  __atomic_store_n(&e->state, 1, __ATOMIC_RELEASE);
  syscall(SYS_futex, &e->state, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1000000);
}

/* Block (via FUTEX_WAIT) until the event is signaled; tolerates spurious
 * wakeups by re-checking the state. */
static void event_wait(event_t* e)
{
  while (!__atomic_load_n(&e->state, __ATOMIC_ACQUIRE))
    syscall(SYS_futex, &e->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, 0);
}

/* Non-blocking poll: nonzero iff the event is currently signaled. */
static int event_isset(event_t* e)
{
  int signaled = __atomic_load_n(&e->state, __ATOMIC_ACQUIRE);
  return signaled;
}

/*
 * Wait up to `timeout` milliseconds for the event to be signaled.
 * Returns 1 if it was set, 0 on timeout. Re-arms the FUTEX_WAIT with
 * the remaining time after each (possibly spurious) wakeup.
 */
static int event_timedwait(event_t* ev, uint64_t timeout)
{
  const uint64_t begin = current_time_ms();
  uint64_t now = begin;
  for (;;) {
    const uint64_t remain = timeout - (now - begin);
    struct timespec ts = {
        .tv_sec = remain / 1000,
        .tv_nsec = (remain % 1000) * 1000 * 1000,
    };
    syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, &ts);
    if (__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
      return 1;
    now = current_time_ms();
    if (now - begin > timeout)
      return 0;
  }
}

/* Per-worker bookkeeping: dispatch handshake events plus the call id. */
struct thread_t {
  int created, call;
  event_t ready, done;
};

static struct thread_t threads[16];
static void execute_call(int call);
static int running; /* count of calls currently in flight */

/* Worker loop: wait for a dispatched call, run it, mark completion. */
static void* thr(void* arg)
{
  struct thread_t* self = (struct thread_t*)arg;
  for (;;) {
    event_wait(&self->ready);
    event_reset(&self->ready);
    execute_call(self->call);
    __atomic_fetch_sub(&running, 1, __ATOMIC_RELAXED);
    event_set(&self->done);
  }
  return 0;
}

/*
 * Dispatch the 8 reproducer calls across a pool of up to 16 worker
 * threads: for each call, pick the first idle worker (spawning one if
 * needed), hand it the call, and give it 50 ms to finish before moving
 * on. Finally wait up to ~100 ms for stragglers to drain.
 */
static void loop(void)
{
  /* The if() only silences -Wunused-result on write(). */
  if (write(1, "executing program\n", sizeof("executing program\n") - 1)) {
  }
  for (int call = 0; call < 8; call++) {
    for (int thread = 0;
         thread < (int)(sizeof(threads) / sizeof(threads[0])); thread++) {
      struct thread_t* th = &threads[thread];
      if (!th->created) {
        th->created = 1;
        event_init(&th->ready);
        event_init(&th->done);
        event_set(&th->done); /* a fresh worker starts out idle */
        thread_start(thr, th);
      }
      if (!event_isset(&th->done))
        continue; /* still busy with a previous call */
      event_reset(&th->done);
      th->call = call;
      __atomic_fetch_add(&running, 1, __ATOMIC_RELAXED);
      event_set(&th->ready);
      event_timedwait(&th->done, 50);
      break;
    }
  }
  for (int i = 0; i < 100 && __atomic_load_n(&running, __ATOMIC_RELAXED); i++)
    sleep_ms(1);
}

/*
 * Shared result slot: r[0] holds the userfaultfd file descriptor created
 * by case 2 (all-ones until then); consumed by the ioctls in cases 3-4.
 */
uint64_t r[1] = {0xffffffffffffffff};

/*
 * Run one step of the syzbot-derived reproducer. Each case is a single
 * syscall; syscall arguments that need memory are staged through raw
 * pointer writes into the fixed arena at 0x20000000 mapped by main().
 * The interesting race is madvise(MADV_DONTNEED) in case 6 against the
 * MAP_HUGETLB remappings of the same region in cases 5 and 7, on memory
 * registered with a userfaultfd (cases 2-4).
 */
void execute_call(int call)
{
  intptr_t res = 0;
  switch (call) {
  case 0:
    /* mq_notify(-1, notif) with a sigevent built in the arena; the fd is
     * invalid, so this mostly adds scheduling noise. */
    *(uint64_t*)0x20000040 = 0x20000004;
    *(uint32_t*)0x20000048 = 4;
    *(uint32_t*)0x2000004c = 2;
    *(uint32_t*)0x20000050 = 0;
    syscall(__NR_mq_notify, /*mqd=*/-1, /*notif=*/0x20000040ul);
    break;
  case 1:
    /* Shuffle a small piece of the arena to perturb the vma layout. */
    syscall(__NR_mremap, /*addr=*/0x20002000ul, /*len=*/0x3000ul,
            /*newlen=*/0x4000ul, /*flags=MREMAP_FIXED|MREMAP_MAYMOVE*/ 3ul,
            /*newaddr=*/0x20422000ul);
    break;
  case 2:
    /* Create the userfaultfd; stash it in r[0] for cases 3 and 4. */
    res = syscall(__NR_userfaultfd,
                  /*flags=UFFD_USER_MODE_ONLY|O_CLOEXEC*/ 0x80001ul);
    if (res != -1)
      r[0] = res;
    break;
  case 3:
    /* cmd 0xc018aa3f looks like UFFDIO_API (api=0xaa, features=0xc) —
     * confirm against <linux/userfaultfd.h>. */
    *(uint64_t*)0x200000c0 = 0xaa;
    *(uint64_t*)0x200000c8 = 0xc;
    *(uint64_t*)0x200000d0 = 0;
    syscall(__NR_ioctl, /*fd=*/r[0], /*cmd=*/0xc018aa3f, /*arg=*/0x200000c0ul);
    break;
  case 4:
    /* cmd 0xc020aa00 looks like UFFDIO_REGISTER over [0x200e2000,
     * +0xc00000) with mode=1 (MISSING) — confirm against uffd headers.
     * This registration is what makes madvise drop the mmap lock to
     * deliver UFFD_EVENT_REMOVE, opening the race window. */
    *(uint64_t*)0x20000140 = 0x200e2000;
    *(uint64_t*)0x20000148 = 0xc00000;
    *(uint64_t*)0x20000150 = 1;
    *(uint64_t*)0x20000158 = 0;
    syscall(__NR_ioctl, /*fd=*/r[0], /*cmd=*/0xc020aa00, /*arg=*/0x20000140ul);
    break;
  case 5:
    /* Replace the start of the arena with a hugetlb mapping (MAP_FIXED |
     * MAP_HUGETLB), changing the backing page size under the racing
     * madvise. */
    syscall(
        __NR_mmap, /*addr=*/0x20000000ul, /*len=*/0x400000ul,
        /*prot=PROT_GROWSUP|PROT_SEM|PROT_WRITE|PROT_READ|PROT_EXEC*/
        0x200000ful,
        /*flags=MAP_SYNC|MAP_NONBLOCK|MAP_HUGETLB|MAP_FIXED|MAP_ANONYMOUS|0x2*/
        0xd0032ul, /*fd=*/-1, /*offset=*/0ul);
    break;
  case 6:
    /* The racing MADV_DONTNEED; note len is deliberately unaligned. */
    syscall(__NR_madvise, /*addr=*/0x20000000ul, /*len=*/0x600003ul,
            /*advice=MADV_DONTNEED*/ 4ul);
    break;
  case 7:
    /* A second hugetlb MAP_FIXED remap of the arena, same purpose as
     * case 5 but with different length/flags. */
    syscall(
        __NR_mmap, /*addr=*/0x20000000ul, /*len=*/0xff5000ul, /*prot=*/0ul,
        /*flags=MAP_POPULATE|MAP_NORESERVE|MAP_NONBLOCK|MAP_HUGETLB|MAP_FIXED|0x2000000000821*/
        0x200000005c831ul, /*fd=*/-1, /*offset=*/0ul);
    break;
  }
}
int main(void)
{
  /* PROT_NONE page just below the arena — presumably a guard mapping so
   * stray accesses fault deterministically; confirm against syzkaller's
   * generated-program layout. */
  syscall(__NR_mmap, /*addr=*/0x1ffff000ul, /*len=*/0x1000ul, /*prot=*/0ul,
          /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
          /*offset=*/0ul);
  /* The 16 MiB RWX arena at 0x20000000 that execute_call() stages
   * syscall arguments in and that cases 5-7 remap/madvise. */
  syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0x1000000ul,
          /*prot=PROT_WRITE|PROT_READ|PROT_EXEC*/ 7ul,
          /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
          /*offset=*/0ul);
  /* PROT_NONE page just above the arena (upper guard). */
  syscall(__NR_mmap, /*addr=*/0x21000000ul, /*len=*/0x1000ul, /*prot=*/0ul,
          /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
          /*offset=*/0ul);
  /* Dispatch the 8 reproducer calls across worker threads. */
  loop();
  return 0;
}

--8<------------------------------------------------------------------

Cheers,
Ricardo


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment
  2025-01-31 14:37 [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment Ricardo Cañuelo Navarro
  2025-01-31 14:58 ` Ricardo Cañuelo Navarro
@ 2025-01-31 22:23 ` Andrew Morton
  2025-02-03  7:14   ` Ricardo Cañuelo Navarro
  1 sibling, 1 reply; 4+ messages in thread
From: Andrew Morton @ 2025-01-31 22:23 UTC (permalink / raw)
  To: Ricardo Cañuelo Navarro; +Cc: riel, linux-mm, stable, kernel-dev, revest

On Fri, 31 Jan 2025 15:37:49 +0100 Ricardo Cañuelo Navarro <rcn@igalia.com> wrote:

> Add a sanity check to madvise_dontneed_free() to address a corner case
> in madvise where a race condition causes the current vma being processed
> to be backed by a different page size.
> 
> During a madvise(MADV_DONTNEED) call on a memory region registered with
> a userfaultfd, there's a period of time where the process mm lock is
> temporarily released in order to send a UFFD_EVENT_REMOVE and let
> userspace handle the event. During this time, the vma covering the
> current address range may change due to an explicit mmap done
> concurrently by another thread.
> 
> If, after that change, the memory region, which was originally backed by
> 4KB pages, is now backed by hugepages, the end address is rounded down
> to a hugepage boundary to avoid data loss (see "Fixes" below). This
> rounding may cause the end address to be truncated to the same address
> as the start.
> 
> Make this corner case follow the same semantics as in other similar
> cases where the requested region has zero length (ie. return 0).
> 
> This will make madvise_walk_vmas() continue to the next vma in the
> range (this time holding the process mm lock) which, due to the prev
> pointer becoming stale because of the vma change, will be the same
> hugepage-backed vma that was just checked before. The next time
> madvise_dontneed_free() runs for this vma, if the start address isn't
> aligned to a hugepage boundary, it'll return -EINVAL, which is also in
> line with the madvise api.
> 
> From userspace perspective, madvise() will return EINVAL because the
> start address isn't aligned according to the new vma alignment
> requirements (hugepage), even though it was correctly page-aligned when
> the call was issued.
> 
> ...
>
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -933,7 +933,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
>  			 */
>  			end = vma->vm_end;
>  		}
> -		VM_WARN_ON(start >= end);
> +		if (start == end)
> +			return 0;
> +		VM_WARN_ON(start > end);
>  	}

Perhaps add a comment telling the user how this situation can come about?


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment
  2025-01-31 22:23 ` Andrew Morton
@ 2025-02-03  7:14   ` Ricardo Cañuelo Navarro
  0 siblings, 0 replies; 4+ messages in thread
From: Ricardo Cañuelo Navarro @ 2025-02-03  7:14 UTC (permalink / raw)
  To: Andrew Morton; +Cc: riel, linux-mm, stable, kernel-dev, revest

Hi Andrew,

On Fri, Jan 31 2025 at 14:23:21, Andrew Morton <akpm@linux-foundation.org> wrote:

> Perhaps add a comment telling the user how this situation can come about?

Sure, I'll add the description to v2. Thanks for reviewing it!

Ricardo


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-02-03  7:15 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-01-31 14:37 [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment Ricardo Cañuelo Navarro
2025-01-31 14:58 ` Ricardo Cañuelo Navarro
2025-01-31 22:23 ` Andrew Morton
2025-02-03  7:14   ` Ricardo Cañuelo Navarro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox