* [PATCH V1 5.15.y 1/2] mm/mprotect: use long for page accountings and retval
2025-11-25 4:46 [PATCH V1 5.15.y 0/2] Fix bad pmd due to race between change_prot_numa() and THP migration Harry Yoo
@ 2025-11-25 4:46 ` Harry Yoo
2025-11-26 14:15 ` Sasha Levin
2025-11-25 4:46 ` [PATCH V1 5.15.y 2/2] mm/mprotect: delete pmd_none_or_clear_bad_unless_trans_huge() Harry Yoo
1 sibling, 1 reply; 4+ messages in thread
From: Harry Yoo @ 2025-11-25 4:46 UTC (permalink / raw)
To: stable
Cc: Liam.Howlett, akpm, baohua, baolin.wang, david, dev.jain, hughd,
jane.chu, jannh, kas, lance.yang, linux-mm, lorenzo.stoakes,
npache, pfalcato, ryan.roberts, vbabka, ziy, Peter Xu,
Mike Kravetz, James Houghton, Andrea Arcangeli, Axel Rasmussen,
David Hildenbrand, Muchun Song, Nadav Amit, Harry Yoo
From: Peter Xu <peterx@redhat.com>
commit a79390f5d6a78647fd70856bd42b22d994de0ba2 upstream.
Switch to the "long" type for page accounting and the return value across the
whole change_protection() procedure.
This change halves the maximum representable page count compared to before
(ULONG_MAX / 2 instead of ULONG_MAX), but it shouldn't overflow on any system
either, because the maximum number of pages change_protection() can touch is
ULONG_MAX / PAGE_SIZE.
Two reasons to switch from "unsigned long" to "long":
1. It suits count_vm_numa_events() better, whose 2nd parameter takes a long
   type.
2. It paves the way for returning negative (error) values in the future.
Currently the only caller that consumes this return value is
change_prot_numa(), where the unsigned long was converted to an int. While at
it, touch up the NUMA code to also use a long, so any possible overflow during
the int-size conversion is avoided as well.
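
For illustration only, here is a minimal user-space sketch (hypothetical, not
kernel code) of the truncation hazard that the wider signed type avoids:

        #include <limits.h>
        #include <stdio.h>

        int main(void)
        {
                /*
                 * A page count this large is possible in theory on 64-bit:
                 * ULONG_MAX / PAGE_SIZE is far bigger than INT_MAX.
                 */
                unsigned long pages = (unsigned long)INT_MAX + 1;

                /*
                 * Out of range for int: implementation-defined, typically
                 * wraps to a negative value.
                 */
                int as_int = pages;

                /* Intact, and can also carry a negative error code later. */
                long as_long = pages;

                printf("unsigned long: %lu\n", pages);
                printf("as int:        %d\n", as_int);
                printf("as long:       %ld\n", as_long);
                return 0;
        }

On 64-bit the theoretical maximum page count (ULONG_MAX / PAGE_SIZE) still
exceeds INT_MAX, which is why the int conversion in change_prot_numa() is
widened too.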
Link: https://lkml.kernel.org/r/20230104225207.1066932-3-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: James Houghton <jthoughton@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Adjust context ]
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
---
include/linux/hugetlb.h | 4 ++--
include/linux/mm.h | 2 +-
mm/hugetlb.c | 4 ++--
mm/mempolicy.c | 2 +-
mm/mprotect.c | 26 +++++++++++++-------------
5 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 60572d423586e..ca26849a8e359 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -208,7 +208,7 @@ struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot);
bool is_hugetlb_entry_migration(pte_t pte);
@@ -379,7 +379,7 @@ static inline void move_hugetlb_state(struct page *oldpage,
{
}
-static inline unsigned long hugetlb_change_protection(
+static inline long hugetlb_change_protection(
struct vm_area_struct *vma, unsigned long address,
unsigned long end, pgprot_t newprot)
{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3598925561b13..8df1027982532 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1910,7 +1910,7 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
MM_CP_UFFD_WP_RESOLVE)
-extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+extern long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
extern int mprotect_fixup(struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 70ceac102a8db..d583f9394be5f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5644,7 +5644,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
return i ? i : err;
}
-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
@@ -5652,7 +5652,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
- unsigned long pages = 0;
+ long pages = 0;
bool shared_pmd = false;
struct mmu_notifier_range range;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f089de8564cad..3d984d070e3fe 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -634,7 +634,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
- int nr_updated;
+ long nr_updated;
nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
if (nr_updated)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ed18dc49533f6..58822900c6d65 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,13 +35,13 @@
#include "internal.h"
-static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+static long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
{
pte_t *pte, oldpte;
spinlock_t *ptl;
- unsigned long pages = 0;
+ long pages = 0;
int target_node = NUMA_NO_NODE;
bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
@@ -219,13 +219,13 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
return 0;
}
-static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
+static inline long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
pmd_t *pmd;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0;
unsigned long nr_huge_updates = 0;
struct mmu_notifier_range range;
@@ -233,7 +233,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pmd = pmd_offset(pud, addr);
do {
- unsigned long this_pages;
+ long this_pages;
next = pmd_addr_end(addr, end);
@@ -291,13 +291,13 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
return pages;
}
-static inline unsigned long change_pud_range(struct vm_area_struct *vma,
+static inline long change_pud_range(struct vm_area_struct *vma,
p4d_t *p4d, unsigned long addr, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
pud_t *pud;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0;
pud = pud_offset(p4d, addr);
do {
@@ -311,13 +311,13 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
return pages;
}
-static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
+static inline long change_p4d_range(struct vm_area_struct *vma,
pgd_t *pgd, unsigned long addr, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
p4d_t *p4d;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0;
p4d = p4d_offset(pgd, addr);
do {
@@ -331,7 +331,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
return pages;
}
-static unsigned long change_protection_range(struct vm_area_struct *vma,
+static long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
{
@@ -339,7 +339,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
pgd_t *pgd;
unsigned long next;
unsigned long start = addr;
- unsigned long pages = 0;
+ long pages = 0;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
@@ -361,11 +361,11 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
return pages;
}
-unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
{
- unsigned long pages;
+ long pages;
BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
--
2.43.0

* [PATCH V1 5.15.y 2/2] mm/mprotect: delete pmd_none_or_clear_bad_unless_trans_huge()
2025-11-25 4:46 [PATCH V1 5.15.y 0/2] Fix bad pmd due to race between change_prot_numa() and THP migration Harry Yoo
2025-11-25 4:46 ` [PATCH V1 5.15.y 1/2] mm/mprotect: use long for page accountings and retval Harry Yoo
@ 2025-11-25 4:46 ` Harry Yoo
1 sibling, 0 replies; 4+ messages in thread
From: Harry Yoo @ 2025-11-25 4:46 UTC (permalink / raw)
To: stable
Cc: Liam.Howlett, akpm, baohua, baolin.wang, david, dev.jain, hughd,
jane.chu, jannh, kas, lance.yang, linux-mm, lorenzo.stoakes,
npache, pfalcato, ryan.roberts, vbabka, ziy, Alistair Popple,
Anshuman Khandual, Axel Rasmussen, Christophe Leroy,
Christoph Hellwig, David Hildenbrand, Huang, Ying, Ira Weiny,
Jason Gunthorpe, Kirill A . Shutemov, Lorenzo Stoakes,
Matthew Wilcox, Mel Gorman, Miaohe Lin, Mike Kravetz,
Mike Rapoport, Minchan Kim, Naoya Horiguchi, Pavel Tatashin,
Peter Xu, Peter Zijlstra, Qi Zheng, Ralph Campbell,
SeongJae Park, Song Liu, Steven Price, Suren Baghdasaryan,
Thomas Hellström, Will Deacon, Yang Shi, Yu Zhao,
Zack Rusin
From: Hugh Dickins <hughd@google.com>
commit 670ddd8cdcbd1d07a4571266ae3517f821728c3a upstream.
change_pmd_range() had special pmd_none_or_clear_bad_unless_trans_huge(),
required to avoid "bad" choices when setting automatic NUMA hinting under
mmap_read_lock(); but most of that is already covered in pte_offset_map()
now. change_pmd_range() just wants a pmd_none() check before wasting time
on MMU notifiers, then checks on the read-once _pmd value to work out
what's needed for huge cases. If change_pte_range() returns -EAGAIN (to retry
when pte_offset_map_lock() fails), nothing more special is needed.
Link: https://lkml.kernel.org/r/725a42a9-91e9-c868-925-e3a5fd40bb4f@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zack Rusin <zackr@vmware.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Background:
It was reported that a bad pmd is seen when automatic NUMA balancing
is marking page table entries as prot_numa:
[2437548.196018] mm/pgtable-generic.c:50: bad pmd 00000000af22fc02(dffffffe71fbfe02)
[2437548.235022] Call Trace:
[2437548.238234] <TASK>
[2437548.241060] dump_stack_lvl+0x46/0x61
[2437548.245689] panic+0x106/0x2e5
[2437548.249497] pmd_clear_bad+0x3c/0x3c
[2437548.253967] change_pmd_range.isra.0+0x34d/0x3a7
[2437548.259537] change_p4d_range+0x156/0x20e
[2437548.264392] change_protection_range+0x116/0x1a9
[2437548.269976] change_prot_numa+0x15/0x37
[2437548.274774] task_numa_work+0x1b8/0x302
[2437548.279512] task_work_run+0x62/0x95
[2437548.283882] exit_to_user_mode_loop+0x1a4/0x1a9
[2437548.289277] exit_to_user_mode_prepare+0xf4/0xfc
[2437548.294751] ? sysvec_apic_timer_interrupt+0x34/0x81
[2437548.300677] irqentry_exit_to_user_mode+0x5/0x25
[2437548.306153] asm_sysvec_apic_timer_interrupt+0x16/0x1b
This is due to a race condition between change_prot_numa() and
THP migration because the kernel doesn't check is_swap_pmd() and
pmd_trans_huge() atomically:
  change_prot_numa()                            THP migration
  ======================================================================
  - change_pmd_range()
    -> is_swap_pmd() returns false,
       meaning it's not a PMD migration
       entry.
                                                - do_huge_pmd_numa_page()
                                                  -> migrate_misplaced_page() sets
                                                     migration entries for the THP.
  - change_pmd_range()
    -> pmd_none_or_clear_bad_unless_trans_huge()
       -> pmd_none() and pmd_trans_huge() returns false
  - pmd_none_or_clear_bad_unless_trans_huge()
    -> pmd_bad() returns true for the migration entry!
The upstream commit 670ddd8cdcbd ("mm/mprotect: delete
pmd_none_or_clear_bad_unless_trans_huge()") closes this race condition
by checking is_swap_pmd() and pmd_trans_huge() atomically.
Backporting note:
Unlike mainline, pte_offset_map_lock() in 5.15 does not check whether the pmd
entry is a migration entry or a hugepage; it acquires the PTL unconditionally
instead of returning failure. Therefore, it is necessary to keep the
!is_swap_pmd() && !pmd_trans_huge() && !pmd_devmap() check before
acquiring the PTL.
After acquiring it, open-code the mainline semantics of
pte_offset_map_lock() so that change_pte_range() fails if the pmd value
has changed (under the PTL). This requires adding one more parameter to
change_pte_range(), for passing the pmd value that was read before
calling the function. ]
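
For reviewers, the control flow being open-coded here is the usual "read
locklessly, re-validate under the lock, retry on change" pattern. A minimal
user-space analogy (hypothetical names, a pthread mutex standing in for the
PTL, not kernel code) looks like this:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static volatile long shared_val = 42;  /* stands in for the pmd entry */

        /* Returns 0 on success, -1 (think -EAGAIN) if the value changed. */
        static int update_if_unchanged(long seen)
        {
                int ret = 0;

                pthread_mutex_lock(&lock);      /* stands in for the PTL */
                if (shared_val != seen)         /* the pmd_same() re-check */
                        ret = -1;
                else
                        shared_val = seen + 1;  /* the "protection change" */
                pthread_mutex_unlock(&lock);
                return ret;
        }

        int main(void)
        {
                long seen;

                do {
                        seen = shared_val;      /* lockless read, like pmd_read_atomic() */
                } while (update_if_unchanged(seen) < 0);

                printf("updated to %ld\n", shared_val);
                return 0;
        }

In the backport, pmd_read_atomic() is the lockless read, pmd_same() is the
re-check under the PTL, and the negative return from change_pte_range()
drives the "goto again" retry in change_pmd_range().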
---
mm/mprotect.c | 97 ++++++++++++++++++++++-----------------------------
1 file changed, 41 insertions(+), 56 deletions(-)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 58822900c6d65..15a966de86676 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -36,10 +36,11 @@
#include "internal.h"
static long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- unsigned long cp_flags)
+ pmd_t pmd_old, unsigned long addr, unsigned long end,
+ pgprot_t newprot, unsigned long cp_flags)
{
pte_t *pte, oldpte;
+ pmd_t pmd_val;
spinlock_t *ptl;
long pages = 0;
int target_node = NUMA_NO_NODE;
@@ -48,21 +49,16 @@ static long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
- /*
- * Can be called with only the mmap_lock for reading by
- * prot_numa so we must check the pmd isn't constantly
- * changing from under us from pmd_none to pmd_trans_huge
- * and/or the other way around.
- */
- if (pmd_trans_unstable(pmd))
- return 0;
- /*
- * The pmd points to a regular pte so the pmd can't change
- * from under us even if the mmap_lock is only hold for
- * reading.
- */
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ /* Make sure pmd didn't change after acquiring ptl */
+ pmd_val = pmd_read_atomic(pmd);
+ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
+ barrier();
+ if (!pmd_same(pmd_old, pmd_val)) {
+ pte_unmap_unlock(pte, ptl);
+ return -EAGAIN;
+ }
/* Get target node for single threaded private VMAs */
if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
@@ -194,31 +190,6 @@ static long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
return pages;
}
-/*
- * Used when setting automatic NUMA hinting protection where it is
- * critical that a numa hinting PMD is not confused with a bad PMD.
- */
-static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
-{
- pmd_t pmdval = pmd_read_atomic(pmd);
-
- /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- barrier();
-#endif
-
- if (pmd_none(pmdval))
- return 1;
- if (pmd_trans_huge(pmdval))
- return 0;
- if (unlikely(pmd_bad(pmdval))) {
- pmd_clear_bad(pmd);
- return 1;
- }
-
- return 0;
-}
-
static inline long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
@@ -233,21 +204,33 @@ static inline long change_pmd_range(struct vm_area_struct *vma,
pmd = pmd_offset(pud, addr);
do {
- long this_pages;
-
+ long ret;
+ pmd_t _pmd;
+again:
next = pmd_addr_end(addr, end);
+ _pmd = pmd_read_atomic(pmd);
+ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ barrier();
+#endif
/*
* Automatic NUMA balancing walks the tables with mmap_lock
* held for read. It's possible a parallel update to occur
- * between pmd_trans_huge() and a pmd_none_or_clear_bad()
- * check leading to a false positive and clearing.
- * Hence, it's necessary to atomically read the PMD value
- * for all the checks.
+ * between pmd_trans_huge(), is_swap_pmd(), and
+ * a pmd_none_or_clear_bad() check leading to a false positive
+ * and clearing. Hence, it's necessary to atomically read
+ * the PMD value for all the checks.
*/
- if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
- pmd_none_or_clear_bad_unless_trans_huge(pmd))
- goto next;
+ if (!is_swap_pmd(_pmd) && !pmd_devmap(_pmd) && !pmd_trans_huge(_pmd)) {
+ if (pmd_none(_pmd))
+ goto next;
+
+ if (pmd_bad(_pmd)) {
+ pmd_clear_bad(pmd);
+ goto next;
+ }
+ }
/* invoke the mmu notifier if the pmd is populated */
if (!range.start) {
@@ -257,15 +240,15 @@ static inline long change_pmd_range(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
}
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
__split_huge_pmd(vma, pmd, addr, false, NULL);
} else {
- int nr_ptes = change_huge_pmd(vma, pmd, addr,
+ ret = change_huge_pmd(vma, pmd, addr,
newprot, cp_flags);
- if (nr_ptes) {
- if (nr_ptes == HPAGE_PMD_NR) {
+ if (ret) {
+ if (ret == HPAGE_PMD_NR) {
pages += HPAGE_PMD_NR;
nr_huge_updates++;
}
@@ -276,9 +259,11 @@ static inline long change_pmd_range(struct vm_area_struct *vma,
}
/* fall through, the trans huge pmd just split */
}
- this_pages = change_pte_range(vma, pmd, addr, next, newprot,
- cp_flags);
- pages += this_pages;
+ ret = change_pte_range(vma, pmd, _pmd, addr, next,
+ newprot, cp_flags);
+ if (ret < 0)
+ goto again;
+ pages += ret;
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
--
2.43.0