* [PATCH] mm: hugetlb: independent PMD page table shared count
@ 2024-12-14 10:44 Liu Shixin
2024-12-15 5:11 ` kernel test robot
` (4 more replies)
0 siblings, 5 replies; 9+ messages in thread
From: Liu Shixin @ 2024-12-14 10:44 UTC (permalink / raw)
To: Andrew Morton, Muchun Song, Kenneth W Chen, Kefeng Wang, Nanyong Sun
Cc: linux-mm, linux-kernel, Liu Shixin
The folio refcount may be increased unexpectedly through try_get_folio() by
callers such as split_huge_pages. In huge_pmd_unshare(), we use the refcount
to check whether a PMD page table is shared. The check is incorrect if the
refcount is increased by such a caller, and this can cause the page table
to be leaked:
BUG: Bad page state in process sh pfn:109324
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
page_type: f2(table)
raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
page dumped because: nonzero mapcount
...
CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7
Tainted: [B]=BAD_PAGE
Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
Call trace:
show_stack+0x20/0x38 (C)
dump_stack_lvl+0x80/0xf8
dump_stack+0x18/0x28
bad_page+0x8c/0x130
free_page_is_bad_report+0xa4/0xb0
free_unref_page+0x3cc/0x620
__folio_put+0xf4/0x158
split_huge_pages_all+0x1e0/0x3e8
split_huge_pages_write+0x25c/0x2d8
full_proxy_write+0x64/0xd8
vfs_write+0xcc/0x280
ksys_write+0x70/0x110
__arm64_sys_write+0x24/0x38
invoke_syscall+0x50/0x120
el0_svc_common.constprop.0+0xc8/0xf0
do_el0_svc+0x24/0x38
el0_svc+0x34/0x128
el0t_64_sync_handler+0xc8/0xd0
el0t_64_sync+0x190/0x198
The issue may be triggered by damon, offline_page, page_idle, etc., which
will increase the refcount of the page table.
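To illustrate, the pre-patch logic in huge_pmd_unshare() is roughly the
following (a simplified sketch, not the exact source):

        if (page_count(virt_to_page(ptep)) == 1)
                return 0;       /* "not shared": caller unmaps the entries */
        /*
         * A transient reference taken by a concurrent pfn walker (such as
         * try_get_folio() from split_huge_pages) can raise the count from
         * 1 to 2 before the check above runs, so an unshared PMD table is
         * mistaken for a shared one: the pud is cleared, the caller skips
         * the unmap, and the table is later freed while it still maps the
         * hugetlb page.
         */
        pud_clear(pud);
        put_page(virt_to_page(ptep));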
Fix it by introducing an independent PMD page table shared count.
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
include/linux/mm.h | 1 +
include/linux/mm_types.h | 28 ++++++++++++++++++++++++++++
mm/hugetlb.c | 16 +++++++---------
3 files changed, 36 insertions(+), 9 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c39c4945946c..50fbf2a1b0ad 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3115,6 +3115,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
if (!pmd_ptlock_init(ptdesc))
return false;
__folio_set_pgtable(folio);
+ ptdesc_pmd_pts_init(ptdesc);
lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7361a8f3ab68..740b765ac91e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a);
* @pt_index: Used for s390 gmap.
* @pt_mm: Used for x86 pgds.
* @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
+ * @pt_share_count: Used for HugeTLB PMD page table share count.
* @_pt_pad_2: Padding to ensure proper alignment.
* @ptl: Lock for the page table.
* @__page_type: Same as page->page_type. Unused for page tables.
@@ -471,6 +472,9 @@ struct ptdesc {
pgoff_t pt_index;
struct mm_struct *pt_mm;
atomic_t pt_frag_refcount;
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+ atomic_t pt_share_count;
+#endif
};
union {
@@ -516,6 +520,30 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
const struct page *: (const struct ptdesc *)(p), \
struct page *: (struct ptdesc *)(p)))
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
+{
+ atomic_set(&ptdesc->pt_share_count, 0);
+}
+
+static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc)
+{
+ atomic_inc(&ptdesc->pt_share_count);
+}
+
+static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
+{
+ atomic_dec(&ptdesc->pt_share_count);
+}
+
+static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
+{
+ return atomic_read(&ptdesc->pt_share_count);
+}
+#else
+#define ptdesc_pmd_pts_init NULL
+#endif
+
/*
* Used for sizing the vmemmap region on some architectures
*/
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ea2ed8e301ef..60846b060b87 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7212,7 +7212,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
spte = hugetlb_walk(svma, saddr,
vma_mmu_pagesize(svma));
if (spte) {
- get_page(virt_to_page(spte));
+ ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
break;
}
}
@@ -7227,7 +7227,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
mm_inc_nr_pmds(mm);
} else {
- put_page(virt_to_page(spte));
+ ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
}
spin_unlock(&mm->page_table_lock);
out:
@@ -7239,10 +7239,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* unmap huge page backed by shared pte.
*
- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
* Called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
@@ -7251,18 +7247,20 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
+ unsigned long sz = huge_page_size(hstate_vma(vma));
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
- BUG_ON(page_count(virt_to_page(ptep)) == 0);
- if (page_count(virt_to_page(ptep)) == 1)
+ if (sz != PMD_SIZE)
+ return 0;
+ if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
return 0;
pud_clear(pud);
- put_page(virt_to_page(ptep));
+ ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;
}
--
2.34.1
* Re: [PATCH] mm: hugetlb: independent PMD page table shared count
2024-12-14 10:44 [PATCH] mm: hugetlb: independent PMD page table shared count Liu Shixin
@ 2024-12-15 5:11 ` kernel test robot
2024-12-15 5:11 ` kernel test robot
` (3 subsequent siblings)
4 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2024-12-15 5:11 UTC (permalink / raw)
To: Liu Shixin, Andrew Morton, Muchun Song, Kenneth W Chen,
Kefeng Wang, Nanyong Sun
Cc: llvm, oe-kbuild-all, Linux Memory Management List, linux-kernel,
Liu Shixin
Hi Liu,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Liu-Shixin/mm-hugetlb-independent-PMD-page-table-shared-count/20241214-184912
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20241214104401.1052550-1-liushixin2%40huawei.com
patch subject: [PATCH] mm: hugetlb: independent PMD page table shared count
config: i386-buildonly-randconfig-006-20241215 (https://download.01.org/0day-ci/archive/20241215/202412151253.G1eTYt78-lkp@intel.com/config)
compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241215/202412151253.G1eTYt78-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202412151253.G1eTYt78-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from arch/x86/kernel/asm-offsets.c:14:
In file included from include/linux/suspend.h:5:
In file included from include/linux/swap.h:9:
In file included from include/linux/memcontrol.h:21:
>> include/linux/mm.h:3192:21: error: called object type 'void *' is not a function or function pointer
3192 | ptdesc_pmd_pts_init(ptdesc);
| ~~~~~~~~~~~~~~~~~~~^
1 error generated.
make[3]: *** [scripts/Makefile.build:102: arch/x86/kernel/asm-offsets.s] Error 1 shuffle=1966107866
make[3]: Target 'prepare' not remade because of errors.
make[2]: *** [Makefile:1263: prepare0] Error 2 shuffle=1966107866
make[2]: Target 'prepare' not remade because of errors.
make[1]: *** [Makefile:251: __sub-make] Error 2 shuffle=1966107866
make[1]: Target 'prepare' not remade because of errors.
make: *** [Makefile:251: __sub-make] Error 2 shuffle=1966107866
make: Target 'prepare' not remade because of errors.
vim +3192 include/linux/mm.h
3184
3185 static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
3186 {
3187 struct folio *folio = ptdesc_folio(ptdesc);
3188
3189 if (!pmd_ptlock_init(ptdesc))
3190 return false;
3191 __folio_set_pgtable(folio);
> 3192 ptdesc_pmd_pts_init(ptdesc);
3193 lruvec_stat_add_folio(folio, NR_PAGETABLE);
3194 return true;
3195 }
3196
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH] mm: hugetlb: independent PMD page table shared count
2024-12-14 10:44 [PATCH] mm: hugetlb: independent PMD page table shared count Liu Shixin
2024-12-15 5:11 ` kernel test robot
@ 2024-12-15 5:11 ` kernel test robot
2024-12-15 7:45 ` kernel test robot
` (2 subsequent siblings)
4 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2024-12-15 5:11 UTC (permalink / raw)
To: Liu Shixin, Andrew Morton, Muchun Song, Kenneth W Chen,
Kefeng Wang, Nanyong Sun
Cc: oe-kbuild-all, Linux Memory Management List, linux-kernel, Liu Shixin
Hi Liu,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Liu-Shixin/mm-hugetlb-independent-PMD-page-table-shared-count/20241214-184912
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20241214104401.1052550-1-liushixin2%40huawei.com
patch subject: [PATCH] mm: hugetlb: independent PMD page table shared count
config: i386-buildonly-randconfig-001-20241215 (https://download.01.org/0day-ci/archive/20241215/202412151252.7ESa61O9-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241215/202412151252.7ESa61O9-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202412151252.7ESa61O9-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from include/uapi/linux/posix_types.h:5,
from include/uapi/linux/types.h:14,
from include/linux/types.h:6,
from include/linux/kasan-checks.h:5,
from include/asm-generic/rwonce.h:26,
from ./arch/x86/include/generated/asm/rwonce.h:1,
from include/linux/compiler.h:337,
from include/linux/build_bug.h:5,
from include/linux/container_of.h:5,
from include/linux/list.h:5,
from include/linux/swait.h:5,
from include/linux/completion.h:12,
from include/linux/crypto.h:15,
from arch/x86/kernel/asm-offsets.c:9:
include/linux/mm.h: In function 'pagetable_pmd_ctor':
>> include/linux/stddef.h:8:14: error: called object is not a function or function pointer
8 | #define NULL ((void *)0)
| ^
include/linux/mm_types.h:553:29: note: in expansion of macro 'NULL'
553 | #define ptdesc_pmd_pts_init NULL
| ^~~~
include/linux/mm.h:3192:9: note: in expansion of macro 'ptdesc_pmd_pts_init'
3192 | ptdesc_pmd_pts_init(ptdesc);
| ^~~~~~~~~~~~~~~~~~~
make[3]: *** [scripts/Makefile.build:102: arch/x86/kernel/asm-offsets.s] Error 1 shuffle=1659946440
make[3]: Target 'prepare' not remade because of errors.
make[2]: *** [Makefile:1263: prepare0] Error 2 shuffle=1659946440
make[2]: Target 'prepare' not remade because of errors.
make[1]: *** [Makefile:251: __sub-make] Error 2 shuffle=1659946440
make[1]: Target 'prepare' not remade because of errors.
make: *** [Makefile:251: __sub-make] Error 2 shuffle=1659946440
make: Target 'prepare' not remade because of errors.
vim +8 include/linux/stddef.h
^1da177e4c3f415 Linus Torvalds 2005-04-16 6
^1da177e4c3f415 Linus Torvalds 2005-04-16 7 #undef NULL
^1da177e4c3f415 Linus Torvalds 2005-04-16 @8 #define NULL ((void *)0)
6e2182874324727 Richard Knutsson 2006-09-30 9
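For reference, the failure mode: with CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
disabled, the patch defines ptdesc_pmd_pts_init as the macro NULL, which
pagetable_pmd_ctor() then calls like a function. One plausible build fix (a
sketch; a respin may well do this differently) is an empty static inline
stub in place of the #define:

        #else
        static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
        {
        }
        #endif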
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH] mm: hugetlb: independent PMD page table shared count
2024-12-14 10:44 [PATCH] mm: hugetlb: independent PMD page table shared count Liu Shixin
2024-12-15 5:11 ` kernel test robot
2024-12-15 5:11 ` kernel test robot
@ 2024-12-15 7:45 ` kernel test robot
2024-12-16 15:34 ` David Hildenbrand
2024-12-16 18:33 ` jane.chu
4 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2024-12-15 7:45 UTC (permalink / raw)
To: Liu Shixin, Andrew Morton, Muchun Song, Kenneth W Chen,
Kefeng Wang, Nanyong Sun
Cc: llvm, oe-kbuild-all, Linux Memory Management List, linux-kernel,
Liu Shixin
Hi Liu,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Liu-Shixin/mm-hugetlb-independent-PMD-page-table-shared-count/20241214-184912
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20241214104401.1052550-1-liushixin2%40huawei.com
patch subject: [PATCH] mm: hugetlb: independent PMD page table shared count
config: hexagon-randconfig-001-20241215 (https://download.01.org/0day-ci/archive/20241215/202412151541.SQEZStgy-lkp@intel.com/config)
compiler: clang version 20.0.0git (https://github.com/llvm/llvm-project 2dc22615fd46ab2566d0f26d5ba234ab12dc4bf8)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241215/202412151541.SQEZStgy-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202412151541.SQEZStgy-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from kernel/dma/direct.c:7:
In file included from include/linux/memblock.h:12:
include/linux/mm.h:3192:21: error: called object type 'void *' is not a function or function pointer
3192 | ptdesc_pmd_pts_init(ptdesc);
| ~~~~~~~~~~~~~~~~~~~^
>> kernel/dma/direct.c:147:20: warning: shift count >= width of type [-Wshift-count-overflow]
146 | if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
147 | phys_limit < DMA_BIT_MASK(64) &&
| ~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~
148 | !(gfp & (GFP_DMA32 | GFP_DMA))) {
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/dma-mapping.h:73:54: note: expanded from macro 'DMA_BIT_MASK'
73 | #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
| ^
include/linux/compiler.h:55:47: note: expanded from macro 'if'
55 | #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
| ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
include/linux/compiler.h:57:52: note: expanded from macro '__trace_if_var'
57 | #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
| ^~~~
>> kernel/dma/direct.c:147:20: warning: shift count >= width of type [-Wshift-count-overflow]
146 | if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
147 | phys_limit < DMA_BIT_MASK(64) &&
| ~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~
148 | !(gfp & (GFP_DMA32 | GFP_DMA))) {
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/dma-mapping.h:73:54: note: expanded from macro 'DMA_BIT_MASK'
73 | #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
| ^
include/linux/compiler.h:55:47: note: expanded from macro 'if'
55 | #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
| ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
include/linux/compiler.h:57:61: note: expanded from macro '__trace_if_var'
57 | #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
| ^~~~
>> kernel/dma/direct.c:147:20: warning: shift count >= width of type [-Wshift-count-overflow]
146 | if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
147 | phys_limit < DMA_BIT_MASK(64) &&
| ~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~
148 | !(gfp & (GFP_DMA32 | GFP_DMA))) {
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/dma-mapping.h:73:54: note: expanded from macro 'DMA_BIT_MASK'
73 | #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
| ^
include/linux/compiler.h:55:47: note: expanded from macro 'if'
55 | #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
| ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~
include/linux/compiler.h:57:86: note: expanded from macro '__trace_if_var'
57 | #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
| ~~~~~~~~~~~~~~~~~^~~~~
include/linux/compiler.h:68:3: note: expanded from macro '__trace_if_value'
68 | (cond) ? \
| ^~~~
3 warnings and 1 error generated.
vim +147 kernel/dma/direct.c
aea7e2a86a94b25 kernel/dma/direct.c Christoph Hellwig 2021-10-21 117
26749b3201ab05e kernel/dma/direct.c Christoph Hellwig 2020-06-15 118 static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
92826e967535db2 kernel/dma/direct.c Christoph Hellwig 2022-04-23 119 gfp_t gfp, bool allow_highmem)
a8463d4b0e47d1f lib/dma-noop.c Christian Borntraeger 2016-02-02 120 {
90ae409f9eb3bca kernel/dma/direct.c Christoph Hellwig 2019-08-20 121 int node = dev_to_node(dev);
080321d3b3139b3 lib/dma-direct.c Christoph Hellwig 2017-12-22 122 struct page *page = NULL;
a7ba70f1787f977 kernel/dma/direct.c Nicolas Saenz Julienne 2019-11-21 123 u64 phys_limit;
a8463d4b0e47d1f lib/dma-noop.c Christian Borntraeger 2016-02-02 124
633d5fce78a61e8 kernel/dma/direct.c David Rientjes 2020-06-11 125 WARN_ON_ONCE(!PAGE_ALIGNED(size));
633d5fce78a61e8 kernel/dma/direct.c David Rientjes 2020-06-11 126
aea7e2a86a94b25 kernel/dma/direct.c Christoph Hellwig 2021-10-21 127 if (is_swiotlb_for_alloc(dev))
aea7e2a86a94b25 kernel/dma/direct.c Christoph Hellwig 2021-10-21 128 return dma_direct_alloc_swiotlb(dev, size);
aea7e2a86a94b25 kernel/dma/direct.c Christoph Hellwig 2021-10-21 129
25a4ce564921db0 kernel/dma/direct.c Petr Tesarik 2023-02-20 130 gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
633d5fce78a61e8 kernel/dma/direct.c David Rientjes 2020-06-11 131 page = dma_alloc_contiguous(dev, size, gfp);
92826e967535db2 kernel/dma/direct.c Christoph Hellwig 2022-04-23 132 if (page) {
92826e967535db2 kernel/dma/direct.c Christoph Hellwig 2022-04-23 133 if (!dma_coherent_ok(dev, page_to_phys(page), size) ||
92826e967535db2 kernel/dma/direct.c Christoph Hellwig 2022-04-23 134 (!allow_highmem && PageHighMem(page))) {
633d5fce78a61e8 kernel/dma/direct.c David Rientjes 2020-06-11 135 dma_free_contiguous(dev, page, size);
90ae409f9eb3bca kernel/dma/direct.c Christoph Hellwig 2019-08-20 136 page = NULL;
90ae409f9eb3bca kernel/dma/direct.c Christoph Hellwig 2019-08-20 137 }
92826e967535db2 kernel/dma/direct.c Christoph Hellwig 2022-04-23 138 }
95f183916d4b0bc lib/dma-direct.c Christoph Hellwig 2018-01-09 139 again:
90ae409f9eb3bca kernel/dma/direct.c Christoph Hellwig 2019-08-20 140 if (!page)
633d5fce78a61e8 kernel/dma/direct.c David Rientjes 2020-06-11 141 page = alloc_pages_node(node, gfp, get_order(size));
95f183916d4b0bc lib/dma-direct.c Christoph Hellwig 2018-01-09 142 if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
f689a3ab7b8ece9 kernel/dma/direct.c Chen Yu 2024-08-31 143 __free_pages(page, get_order(size));
95f183916d4b0bc lib/dma-direct.c Christoph Hellwig 2018-01-09 144 page = NULL;
95f183916d4b0bc lib/dma-direct.c Christoph Hellwig 2018-01-09 145
de7eab301de7886 lib/dma-direct.c Takashi Iwai 2018-04-16 146 if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
a7ba70f1787f977 kernel/dma/direct.c Nicolas Saenz Julienne 2019-11-21 @147 phys_limit < DMA_BIT_MASK(64) &&
de7eab301de7886 lib/dma-direct.c Takashi Iwai 2018-04-16 148 !(gfp & (GFP_DMA32 | GFP_DMA))) {
de7eab301de7886 lib/dma-direct.c Takashi Iwai 2018-04-16 149 gfp |= GFP_DMA32;
de7eab301de7886 lib/dma-direct.c Takashi Iwai 2018-04-16 150 goto again;
de7eab301de7886 lib/dma-direct.c Takashi Iwai 2018-04-16 151 }
de7eab301de7886 lib/dma-direct.c Takashi Iwai 2018-04-16 152
fbce251baa6e357 kernel/dma/direct.c Christoph Hellwig 2019-02-13 153 if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
95f183916d4b0bc lib/dma-direct.c Christoph Hellwig 2018-01-09 154 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
95f183916d4b0bc lib/dma-direct.c Christoph Hellwig 2018-01-09 155 goto again;
95f183916d4b0bc lib/dma-direct.c Christoph Hellwig 2018-01-09 156 }
95f183916d4b0bc lib/dma-direct.c Christoph Hellwig 2018-01-09 157 }
95f183916d4b0bc lib/dma-direct.c Christoph Hellwig 2018-01-09 158
b18814e767a4455 kernel/dma/direct.c Christoph Hellwig 2018-11-04 159 return page;
b18814e767a4455 kernel/dma/direct.c Christoph Hellwig 2018-11-04 160 }
b18814e767a4455 kernel/dma/direct.c Christoph Hellwig 2018-11-04 161
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH] mm: hugetlb: independent PMD page table shared count
2024-12-14 10:44 [PATCH] mm: hugetlb: independent PMD page table shared count Liu Shixin
` (2 preceding siblings ...)
2024-12-15 7:45 ` kernel test robot
@ 2024-12-16 15:34 ` David Hildenbrand
2024-12-17 2:02 ` Liu Shixin
2024-12-16 18:33 ` jane.chu
4 siblings, 1 reply; 9+ messages in thread
From: David Hildenbrand @ 2024-12-16 15:34 UTC (permalink / raw)
To: Liu Shixin, Andrew Morton, Muchun Song, Kenneth W Chen,
Kefeng Wang, Nanyong Sun
Cc: linux-mm, linux-kernel
On 14.12.24 11:44, Liu Shixin wrote:
> The folio refcount may be increased unexpectedly through try_get_folio() by
> callers such as split_huge_pages. In huge_pmd_unshare(), we use the refcount
> to check whether a PMD page table is shared. The check is incorrect if the
> refcount is increased by such a caller, and this can cause the page table
> to be leaked:
Are you sure it is "leaked"?
I assume what happens is that we end up freeing a page table without
calling its destructor. That's why the page freeing code complains about
"nonzero mapcount" (the mapcount field is overlaid by something else).
>
> BUG: Bad page state in process sh pfn:109324
> page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
> flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
> page_type: f2(table)
> raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
> raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
> page dumped because: nonzero mapcount
> ...
> CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7
> Tainted: [B]=BAD_PAGE
> Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
> Call trace:
> show_stack+0x20/0x38 (C)
> dump_stack_lvl+0x80/0xf8
> dump_stack+0x18/0x28
> bad_page+0x8c/0x130
> free_page_is_bad_report+0xa4/0xb0
> free_unref_page+0x3cc/0x620
> __folio_put+0xf4/0x158
> split_huge_pages_all+0x1e0/0x3e8
> split_huge_pages_write+0x25c/0x2d8
> full_proxy_write+0x64/0xd8
> vfs_write+0xcc/0x280
> ksys_write+0x70/0x110
> __arm64_sys_write+0x24/0x38
> invoke_syscall+0x50/0x120
> el0_svc_common.constprop.0+0xc8/0xf0
> do_el0_svc+0x24/0x38
> el0_svc+0x34/0x128
> el0t_64_sync_handler+0xc8/0xd0
> el0t_64_sync+0x190/0x198
>
> The issue may be triggered by damon, offline_page, page_idle, etc., which
> will increase the refcount of the page table.
Right, many do have a racy folio_test_lru() check in there that
prevents "most of the harm", but not all of them.
--
Cheers,
David / dhildenb
* Re: [PATCH] mm: hugetlb: independent PMD page table shared count
2024-12-14 10:44 [PATCH] mm: hugetlb: independent PMD page table shared count Liu Shixin
` (3 preceding siblings ...)
2024-12-16 15:34 ` David Hildenbrand
@ 2024-12-16 18:33 ` jane.chu
2024-12-17 11:55 ` David Hildenbrand
4 siblings, 1 reply; 9+ messages in thread
From: jane.chu @ 2024-12-16 18:33 UTC (permalink / raw)
To: Liu Shixin, Andrew Morton, Muchun Song, Kenneth W Chen,
Kefeng Wang, Nanyong Sun
Cc: linux-mm, linux-kernel
On 12/14/2024 2:44 AM, Liu Shixin wrote:
> The folio refcount may be increased unexpectedly through try_get_folio() by
> callers such as split_huge_pages. In huge_pmd_unshare(), we use the refcount
> to check whether a PMD page table is shared. The check is incorrect if the
> refcount is increased by such a caller, and this can cause the page table
> to be leaked:
hugetlb and THP don't overlap, right? How does split_huge_pages() end
up messing up huge_pmd_share()?
Am I missing something?
>
> BUG: Bad page state in process sh pfn:109324
> page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
> flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
> page_type: f2(table)
> raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
> raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
> page dumped because: nonzero mapcount
> ...
> CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7
> Tainted: [B]=BAD_PAGE
> Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
> Call trace:
> show_stack+0x20/0x38 (C)
> dump_stack_lvl+0x80/0xf8
> dump_stack+0x18/0x28
> bad_page+0x8c/0x130
> free_page_is_bad_report+0xa4/0xb0
> free_unref_page+0x3cc/0x620
> __folio_put+0xf4/0x158
> split_huge_pages_all+0x1e0/0x3e8
> split_huge_pages_write+0x25c/0x2d8
> full_proxy_write+0x64/0xd8
> vfs_write+0xcc/0x280
> ksys_write+0x70/0x110
> __arm64_sys_write+0x24/0x38
> invoke_syscall+0x50/0x120
> el0_svc_common.constprop.0+0xc8/0xf0
> do_el0_svc+0x24/0x38
> el0_svc+0x34/0x128
> el0t_64_sync_handler+0xc8/0xd0
> el0t_64_sync+0x190/0x198
>
> The issue may be triggered by damon, offline_page, page_idle, etc., which
> will increase the refcount of the page table.
>
> Fix it by introducing an independent PMD page table shared count.
>
> Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
thanks,
-jane
> ---
> include/linux/mm.h | 1 +
> include/linux/mm_types.h | 28 ++++++++++++++++++++++++++++
> mm/hugetlb.c | 16 +++++++---------
> 3 files changed, 36 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c39c4945946c..50fbf2a1b0ad 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3115,6 +3115,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
> if (!pmd_ptlock_init(ptdesc))
> return false;
> __folio_set_pgtable(folio);
> + ptdesc_pmd_pts_init(ptdesc);
> lruvec_stat_add_folio(folio, NR_PAGETABLE);
> return true;
> }
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 7361a8f3ab68..740b765ac91e 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a);
> * @pt_index: Used for s390 gmap.
> * @pt_mm: Used for x86 pgds.
> * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
> + * @pt_share_count: Used for HugeTLB PMD page table share count.
> * @_pt_pad_2: Padding to ensure proper alignment.
> * @ptl: Lock for the page table.
> * @__page_type: Same as page->page_type. Unused for page tables.
> @@ -471,6 +472,9 @@ struct ptdesc {
> pgoff_t pt_index;
> struct mm_struct *pt_mm;
> atomic_t pt_frag_refcount;
> +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
> + atomic_t pt_share_count;
> +#endif
> };
>
> union {
> @@ -516,6 +520,30 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
> const struct page *: (const struct ptdesc *)(p), \
> struct page *: (struct ptdesc *)(p)))
>
> +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
> +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
> +{
> + atomic_set(&ptdesc->pt_share_count, 0);
> +}
> +
> +static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc)
> +{
> + atomic_inc(&ptdesc->pt_share_count);
> +}
> +
> +static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
> +{
> + atomic_dec(&ptdesc->pt_share_count);
> +}
> +
> +static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
> +{
> + return atomic_read(&ptdesc->pt_share_count);
> +}
> +#else
> +#define ptdesc_pmd_pts_init NULL
> +#endif
> +
> /*
> * Used for sizing the vmemmap region on some architectures
> */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ea2ed8e301ef..60846b060b87 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -7212,7 +7212,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
> spte = hugetlb_walk(svma, saddr,
> vma_mmu_pagesize(svma));
> if (spte) {
> - get_page(virt_to_page(spte));
> + ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
> break;
> }
> }
> @@ -7227,7 +7227,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
> (pmd_t *)((unsigned long)spte & PAGE_MASK));
> mm_inc_nr_pmds(mm);
> } else {
> - put_page(virt_to_page(spte));
> + ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
> }
> spin_unlock(&mm->page_table_lock);
> out:
> @@ -7239,10 +7239,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
> /*
> * unmap huge page backed by shared pte.
> *
> - * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
> - * indicated by page_count > 1, unmap is achieved by clearing pud and
> - * decrementing the ref count. If count == 1, the pte page is not shared.
> - *
> * Called with page table lock held.
> *
> * returns: 1 successfully unmapped a shared pte page
> @@ -7251,18 +7247,20 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
> int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long addr, pte_t *ptep)
> {
> + unsigned long sz = huge_page_size(hstate_vma(vma));
> pgd_t *pgd = pgd_offset(mm, addr);
> p4d_t *p4d = p4d_offset(pgd, addr);
> pud_t *pud = pud_offset(p4d, addr);
>
> i_mmap_assert_write_locked(vma->vm_file->f_mapping);
> hugetlb_vma_assert_locked(vma);
> - BUG_ON(page_count(virt_to_page(ptep)) == 0);
> - if (page_count(virt_to_page(ptep)) == 1)
> + if (sz != PMD_SIZE)
> + return 0;
> + if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
> return 0;
>
> pud_clear(pud);
> - put_page(virt_to_page(ptep));
> + ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
> mm_dec_nr_pmds(mm);
> return 1;
> }
* Re: [PATCH] mm: hugetlb: independent PMD page table shared count
2024-12-16 15:34 ` David Hildenbrand
@ 2024-12-17 2:02 ` Liu Shixin
2024-12-17 11:56 ` David Hildenbrand
0 siblings, 1 reply; 9+ messages in thread
From: Liu Shixin @ 2024-12-17 2:02 UTC (permalink / raw)
To: David Hildenbrand, Andrew Morton, Muchun Song, Kenneth W Chen,
Kefeng Wang, Nanyong Sun
Cc: linux-mm, linux-kernel
On 2024/12/16 23:34, David Hildenbrand wrote:
> On 14.12.24 11:44, Liu Shixin wrote:
>> The folio refcount may be increased unexpectedly through try_get_folio() by
>> callers such as split_huge_pages. In huge_pmd_unshare(), we use the refcount
>> to check whether a PMD page table is shared. The check is incorrect if the
>> refcount is increased by such a caller, and this can cause the page table
>> to be leaked:
>
> Are you sure it is "leaked"?
>
> I assume what happens is that we end up freeing a page table without calling its destructor. That's why the page freeing code complains about "nonzero mapcount" (the mapcount field is overlaid by something else).
1. The page table itself will be discarded after reporting the "nonzero mapcount".
2. The HugeTLB page mapped by the page table misses being freed, since we treat
the page table as shared, and a shared page table will not be unmapped.
>
>> BUG: Bad page state in process sh pfn:109324
>> page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324
>> flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff)
>> page_type: f2(table)
>> raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000
>> raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000
>> page dumped because: nonzero mapcount
>> ...
>> CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7
>> Tainted: [B]=BAD_PAGE
>> Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
>> Call trace:
>> show_stack+0x20/0x38 (C)
>> dump_stack_lvl+0x80/0xf8
>> dump_stack+0x18/0x28
>> bad_page+0x8c/0x130
>> free_page_is_bad_report+0xa4/0xb0
>> free_unref_page+0x3cc/0x620
>> __folio_put+0xf4/0x158
>> split_huge_pages_all+0x1e0/0x3e8
>> split_huge_pages_write+0x25c/0x2d8
>> full_proxy_write+0x64/0xd8
>> vfs_write+0xcc/0x280
>> ksys_write+0x70/0x110
>> __arm64_sys_write+0x24/0x38
>> invoke_syscall+0x50/0x120
>> el0_svc_common.constprop.0+0xc8/0xf0
>> do_el0_svc+0x24/0x38
>> el0_svc+0x34/0x128
>> el0t_64_sync_handler+0xc8/0xd0
>> el0t_64_sync+0x190/0x198
>>
>> The issue may be triggered by damon, offline_page, page_idle, etc., which
>> will increase the refcount of the page table.
>
> Right, many do have a racy folio_test_lru() check in there that prevents "most of the harm", but not all of them.
Yes, this makes the problem nearly impossible to trigger for some functions, but it is not really safe.
thanks,
>
>
* Re: [PATCH] mm: hugetlb: independent PMD page table shared count
2024-12-16 18:33 ` jane.chu
@ 2024-12-17 11:55 ` David Hildenbrand
0 siblings, 0 replies; 9+ messages in thread
From: David Hildenbrand @ 2024-12-17 11:55 UTC (permalink / raw)
To: jane.chu, Liu Shixin, Andrew Morton, Muchun Song, Kenneth W Chen,
Kefeng Wang, Nanyong Sun
Cc: linux-mm, linux-kernel
On 16.12.24 19:33, jane.chu@oracle.com wrote:
> On 12/14/2024 2:44 AM, Liu Shixin wrote:
>
>> The folio refcount may be increased unexpectedly through try_get_folio() by
>> callers such as split_huge_pages. In huge_pmd_unshare(), we use the refcount
>> to check whether a PMD page table is shared. The check is incorrect if the
>> refcount is increased by such a caller, and this can cause the page table
>> to be leaked:
>
> hugetlb and THP don't overlap, right? How does split_huge_pages() end
> up messing up huge_pmd_share()?
>
> Am I missing something?
>
It first grabs a reference and only then checks if it's a THP. So we can end up
grabbing anything temporarily.
In general, we'll have to be smarter about what we try grabbing, but
handling races for now is tricky.
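For reference, the grab-then-check walk is roughly the following (a sketch
of split_huge_pages_all(), not verbatim):

        for (pfn = start; pfn < end; pfn++) {
                struct page *page = pfn_to_online_page(pfn);
                struct folio *folio;

                if (!page)
                        continue;
                folio = page_folio(page);
                if (!folio_try_get(folio))      /* reference first ...      */
                        continue;
                if (!folio_test_large(folio))   /* ... THP check only after */
                        goto next;
                /* try to split ... */
next:
                folio_put(folio);       /* transient get/put on whatever the
                                         * pfn happened to be, including PMD
                                         * page-table pages
                                         */
        }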
--
Cheers,
David / dhildenb
* Re: [PATCH] mm: hugetlb: independent PMD page table shared count
2024-12-17 2:02 ` Liu Shixin
@ 2024-12-17 11:56 ` David Hildenbrand
0 siblings, 0 replies; 9+ messages in thread
From: David Hildenbrand @ 2024-12-17 11:56 UTC (permalink / raw)
To: Liu Shixin, Andrew Morton, Muchun Song, Kenneth W Chen,
Kefeng Wang, Nanyong Sun
Cc: linux-mm, linux-kernel
On 17.12.24 03:02, Liu Shixin wrote:
>
>
> On 2024/12/16 23:34, David Hildenbrand wrote:
>> On 14.12.24 11:44, Liu Shixin wrote:
>>> The folio refcount may be increased unexpectedly through try_get_folio() by
>>> callers such as split_huge_pages. In huge_pmd_unshare(), we use the refcount
>>> to check whether a PMD page table is shared. The check is incorrect if the
>>> refcount is increased by such a caller, and this can cause the page table
>>> to be leaked:
>>
>> Are you sure it is "leaked"?
>>
>> I assume what happens is that we end up freeing a page table without calling its destructor. That's why the page freeing code complains about "nonzero mapcount" (the mapcount field is overlaid by something else).
>
> 1. The page table itself will be discarded after reporting the "nonzero mapcount".
>
> 2. The HugeTLB page mapped by the page table miss freeing since we treat the page table as shared
> and a shared page table will not be to unmap.
Ah, the page table still maps something, that makes sense. So we're
leaking that indeed.
--
Cheers,
David / dhildenb
Thread overview: 9+ messages
2024-12-14 10:44 [PATCH] mm: hugetlb: independent PMD page table shared count Liu Shixin
2024-12-15 5:11 ` kernel test robot
2024-12-15 5:11 ` kernel test robot
2024-12-15 7:45 ` kernel test robot
2024-12-16 15:34 ` David Hildenbrand
2024-12-17 2:02 ` Liu Shixin
2024-12-17 11:56 ` David Hildenbrand
2024-12-16 18:33 ` jane.chu
2024-12-17 11:55 ` David Hildenbrand