* Re: [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment
2025-01-31 14:37 [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment Ricardo Cañuelo Navarro
@ 2025-01-31 14:58 ` Ricardo Cañuelo Navarro
2025-01-31 22:23 ` Andrew Morton
1 sibling, 0 replies; 4+ messages in thread
From: Ricardo Cañuelo Navarro @ 2025-01-31 14:58 UTC (permalink / raw)
To: Ricardo Cañuelo Navarro
Cc: akpm, riel, linux-mm, stable, kernel-dev, revest
Hi all,
Some more context about the patch. The issue (WARNING in
madvise_vma_behavior) was found by a private syzbot instance, so I can't
share the link, but it can be triggered by an unprivileged user with
this reproducer:
--8<------------------------------------------------------------------
#define _GNU_SOURCE
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <linux/futex.h>
#ifndef __NR_userfaultfd
#define __NR_userfaultfd 323
#endif
static void sleep_ms(uint64_t ms)
{
usleep(ms * 1000);
}
static uint64_t current_time_ms(void)
{
struct timespec ts;
if (clock_gettime(CLOCK_MONOTONIC, &ts))
exit(1);
return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}
static void thread_start(void* (*fn)(void*), void* arg)
{
pthread_t th;
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_setstacksize(&attr, 128 << 10);
int i = 0;
for (; i < 100; i++) {
if (pthread_create(&th, &attr, fn, arg) == 0) {
pthread_attr_destroy(&attr);
return;
}
if (errno == EAGAIN) {
usleep(50);
continue;
}
break;
}
exit(1);
}
typedef struct {
int state;
} event_t;
static void event_init(event_t* ev)
{
ev->state = 0;
}
static void event_reset(event_t* ev)
{
ev->state = 0;
}
static void event_set(event_t* ev)
{
if (ev->state)
exit(1);
__atomic_store_n(&ev->state, 1, __ATOMIC_RELEASE);
syscall(SYS_futex, &ev->state, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1000000);
}
static void event_wait(event_t* ev)
{
while (!__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, 0);
}
static int event_isset(event_t* ev)
{
return __atomic_load_n(&ev->state, __ATOMIC_ACQUIRE);
}
static int event_timedwait(event_t* ev, uint64_t timeout)
{
uint64_t start = current_time_ms();
uint64_t now = start;
for (;;) {
uint64_t remain = timeout - (now - start);
struct timespec ts;
ts.tv_sec = remain / 1000;
ts.tv_nsec = (remain % 1000) * 1000 * 1000;
syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, &ts);
if (__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
return 1;
now = current_time_ms();
if (now - start > timeout)
return 0;
}
}
struct thread_t {
int created, call;
event_t ready, done;
};
static struct thread_t threads[16];
static void execute_call(int call);
static int running;
static void* thr(void* arg)
{
struct thread_t* th = (struct thread_t*)arg;
for (;;) {
event_wait(&th->ready);
event_reset(&th->ready);
execute_call(th->call);
__atomic_fetch_sub(&running, 1, __ATOMIC_RELAXED);
event_set(&th->done);
}
return 0;
}
static void loop(void)
{
if (write(1, "executing program\n", sizeof("executing program\n") - 1)) {
}
int i, call, thread;
for (call = 0; call < 8; call++) {
for (thread = 0; thread < (int)(sizeof(threads) / sizeof(threads[0]));
thread++) {
struct thread_t* th = &threads[thread];
if (!th->created) {
th->created = 1;
event_init(&th->ready);
event_init(&th->done);
event_set(&th->done);
thread_start(thr, th);
}
if (!event_isset(&th->done))
continue;
event_reset(&th->done);
th->call = call;
__atomic_fetch_add(&running, 1, __ATOMIC_RELAXED);
event_set(&th->ready);
event_timedwait(&th->done, 50);
break;
}
}
for (i = 0; i < 100 && __atomic_load_n(&running, __ATOMIC_RELAXED); i++)
sleep_ms(1);
}
uint64_t r[1] = {0xffffffffffffffff};
void execute_call(int call)
{
intptr_t res = 0;
switch (call) {
case 0:
*(uint64_t*)0x20000040 = 0x20000004;
*(uint32_t*)0x20000048 = 4;
*(uint32_t*)0x2000004c = 2;
*(uint32_t*)0x20000050 = 0;
syscall(__NR_mq_notify, /*mqd=*/-1, /*notif=*/0x20000040ul);
break;
case 1:
syscall(__NR_mremap, /*addr=*/0x20002000ul, /*len=*/0x3000ul,
/*newlen=*/0x4000ul, /*flags=MREMAP_FIXED|MREMAP_MAYMOVE*/ 3ul,
/*newaddr=*/0x20422000ul);
break;
case 2:
res = syscall(__NR_userfaultfd,
/*flags=UFFD_USER_MODE_ONLY|O_CLOEXEC*/ 0x80001ul);
if (res != -1)
r[0] = res;
break;
case 3:
*(uint64_t*)0x200000c0 = 0xaa;
*(uint64_t*)0x200000c8 = 0xc;
*(uint64_t*)0x200000d0 = 0;
syscall(__NR_ioctl, /*fd=*/r[0], /*cmd=*/0xc018aa3f, /*arg=*/0x200000c0ul);
break;
case 4:
*(uint64_t*)0x20000140 = 0x200e2000;
*(uint64_t*)0x20000148 = 0xc00000;
*(uint64_t*)0x20000150 = 1;
*(uint64_t*)0x20000158 = 0;
syscall(__NR_ioctl, /*fd=*/r[0], /*cmd=*/0xc020aa00, /*arg=*/0x20000140ul);
break;
case 5:
syscall(
__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0x400000ul,
/*prot=PROT_GROWSUP|PROT_SEM|PROT_WRITE|PROT_READ|PROT_EXEC*/
0x200000ful,
/*flags=MAP_SYNC|MAP_NONBLOCK|MAP_HUGETLB|MAP_FIXED|MAP_ANONYMOUS|0x2*/
0xd0032ul, /*fd=*/-1, /*offset=*/0ul);
break;
case 6:
syscall(__NR_madvise, /*addr=*/0x20000000ul, /*len=*/0x600003ul,
/*advice=MADV_DONTNEED*/ 4ul);
break;
case 7:
syscall(
__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0xff5000ul, /*prot=*/0ul,
/*flags=MAP_POPULATE|MAP_NORESERVE|MAP_NONBLOCK|MAP_HUGETLB|MAP_FIXED|0x2000000000821*/
0x200000005c831ul, /*fd=*/-1, /*offset=*/0ul);
break;
}
}
int main(void)
{
syscall(__NR_mmap, /*addr=*/0x1ffff000ul, /*len=*/0x1000ul, /*prot=*/0ul,
/*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
/*offset=*/0ul);
syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0x1000000ul,
/*prot=PROT_WRITE|PROT_READ|PROT_EXEC*/ 7ul,
/*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
/*offset=*/0ul);
syscall(__NR_mmap, /*addr=*/0x21000000ul, /*len=*/0x1000ul, /*prot=*/0ul,
/*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
/*offset=*/0ul);
loop();
return 0;
}
--8<------------------------------------------------------------------
Cheers,
Ricardo
^ permalink raw reply [flat|nested] 4+ messages in thread* Re: [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment
2025-01-31 14:37 [PATCH] mm,madvise,hugetlb: check for 0-length range after end address adjustment Ricardo Cañuelo Navarro
2025-01-31 14:58 ` Ricardo Cañuelo Navarro
@ 2025-01-31 22:23 ` Andrew Morton
2025-02-03 7:14 ` Ricardo Cañuelo Navarro
1 sibling, 1 reply; 4+ messages in thread
From: Andrew Morton @ 2025-01-31 22:23 UTC (permalink / raw)
To: Ricardo Cañuelo Navarro; +Cc: riel, linux-mm, stable, kernel-dev, revest
On Fri, 31 Jan 2025 15:37:49 +0100 Ricardo Cañuelo Navarro <rcn@igalia.com> wrote:
> Add a sanity check to madvise_dontneed_free() to address a corner case
> in madvise where a race condition causes the current vma being processed
> to be backed by a different page size.
>
> During a madvise(MADV_DONTNEED) call on a memory region registered with
> a userfaultfd, there's a period of time where the process mm lock is
> temporarily released in order to send a UFFD_EVENT_REMOVE and let
> userspace handle the event. During this time, the vma covering the
> current address range may change due to an explicit mmap done
> concurrently by another thread.
>
> If, after that change, the memory region, which was originally backed by
> 4KB pages, is now backed by hugepages, the end address is rounded down
> to a hugepage boundary to avoid data loss (see "Fixes" below). This
> rounding may cause the end address to be truncated to the same address
> as the start.
>
> Make this corner case follow the same semantics as in other similar
> cases where the requested region has zero length (ie. return 0).
>
> This will make madvise_walk_vmas() continue to the next vma in the
> range (this time holding the process mm lock) which, due to the prev
> pointer becoming stale because of the vma change, will be the same
> hugepage-backed vma that was just checked before. The next time
> madvise_dontneed_free() runs for this vma, if the start address isn't
> aligned to a hugepage boundary, it'll return -EINVAL, which is also in
> line with the madvise api.
>
> >From userspace perspective, madvise() will return EINVAL because the
> start address isn't aligned according to the new vma alignment
> requirements (hugepage), even though it was correctly page-aligned when
> the call was issued.
>
> ...
>
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -933,7 +933,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
> */
> end = vma->vm_end;
> }
> - VM_WARN_ON(start >= end);
> + if (start == end)
> + return 0;
> + VM_WARN_ON(start > end);
> }
Perhaps add a comment telling the user how this situation can come about?
^ permalink raw reply [flat|nested] 4+ messages in thread