* [PATCH 1/2] mm/vmalloc: huge vmalloc backing pages should be split rather than compound
From: Nicholas Piggin @ 2022-04-22 6:01 UTC
To: Paul Menzel
Cc: Nicholas Piggin, x86, Song Liu, Edgecombe, Rick P, Torvalds,
Linus, akpm, linux-kernel, linux-mm
Huge vmalloc higher-order backing pages were allocated with __GFP_COMP
in order to allow the sub-pages to be refcounted by callers such as
"remap_vmalloc_page [sic]" (remap_vmalloc_range).
However, a similar problem exists for the other struct page fields
callers use; for example, fb_deferred_io_fault() takes a vmalloc'ed
page and not only refcounts it but also uses ->lru, ->mapping, and
->index. None of that is compatible with compound sub-pages.
The correct approach is to use split high-order pages for the huge
vmalloc backing. These allow callers to treat them in exactly the same
way as individually-allocated order-0 pages.
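
As a rough sketch of the approach (not this patch's code; the helper
name is hypothetical, while alloc_pages_node(), split_page() and
__free_pages() are the real interfaces):

	/*
	 * Back a huge vmalloc region with a split high-order allocation.
	 * After split_page(), every struct page is an independent order-0
	 * page, so a caller can refcount it or use page->mapping,
	 * page->index, etc., exactly as if the pages had been allocated
	 * individually.
	 */
	static struct page *alloc_split_backing(int nid, gfp_t gfp,
						unsigned int order)
	{
		struct page *page = alloc_pages_node(nid, gfp, order);

		if (!page)
			return NULL;
		if (order)
			split_page(page, order);
		return page;
	}

Each sub-page is then freed individually with __free_pages(page + i, 0),
which is exactly what the __vunmap() hunk below does.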
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
mm/vmalloc.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 07da85ae825b..cadfbb5155ea 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2653,15 +2653,18 @@ static void __vunmap(const void *addr, int deallocate_pages)
 	vm_remove_mappings(area, deallocate_pages);
 
 	if (deallocate_pages) {
-		unsigned int page_order = vm_area_page_order(area);
-		int i, step = 1U << page_order;
+		int i;
 
-		for (i = 0; i < area->nr_pages; i += step) {
+		for (i = 0; i < area->nr_pages; i++) {
 			struct page *page = area->pages[i];
 
 			BUG_ON(!page);
-			mod_memcg_page_state(page, MEMCG_VMALLOC, -step);
-			__free_pages(page, page_order);
+			mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+			/*
+			 * High-order allocs for huge vmallocs are split, so
+			 * can be freed as an array of order-0 allocations
+			 */
+			__free_pages(page, 0);
 			cond_resched();
 		}
 		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2914,12 +2917,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 			if (nr != nr_pages_request)
 				break;
 		}
-	} else
-		/*
-		 * Compound pages required for remap_vmalloc_page if
-		 * high-order pages.
-		 */
-		gfp |= __GFP_COMP;
+	}
 
 	/* High-order pages or fallback path if "bulk" fails. */
 
@@ -2933,6 +2931,15 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 		page = alloc_pages_node(nid, gfp, order);
 		if (unlikely(!page))
 			break;
+		/*
+		 * Higher order allocations must be able to be treated as
+		 * independent small pages by callers (as they can with
+		 * small-page vmallocs). Some drivers do their own refcounting
+		 * on vmalloc_to_page() pages, some use page->mapping,
+		 * page->lru, etc.
+		 */
+		if (order)
+			split_page(page, order);
 
 		/*
 		 * Careful, we allocate and map page-order pages, but
@@ -2992,11 +2999,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
 	if (gfp_mask & __GFP_ACCOUNT) {
-		int i, step = 1U << page_order;
+		int i;
 
-		for (i = 0; i < area->nr_pages; i += step)
-			mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC,
-					     step);
+		for (i = 0; i < area->nr_pages; i++)
+			mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
 	}
 
 	/*
--
2.35.1
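
For context, a minimal sketch of the driver-side pattern the commit
message describes (the helper name is hypothetical; vmalloc_to_page(),
get_page() and put_page() are the real interfaces):

	/*
	 * Resolve a vmalloc'ed address to its backing page and take a
	 * reference, the way fb_deferred_io-style code does. This only
	 * works if the backing page behaves as an ordinary order-0 page,
	 * which a compound tail page does not but a split page does.
	 */
	static struct page *pin_vmalloc_page(void *vaddr)
	{
		struct page *page = vmalloc_to_page(vaddr);

		if (page)
			get_page(page);
		return page;
	}

	/* ... and later drop the reference with put_page(page). */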

* [PATCH 2/2] Revert "vmalloc: replace VM_NO_HUGE_VMAP with VM_ALLOW_HUGE_VMAP"
From: Nicholas Piggin @ 2022-04-22 6:01 UTC
To: Paul Menzel
Cc: Nicholas Piggin, x86, Song Liu, Edgecombe, Rick P, Torvalds,
Linus, akpm, linux-kernel, linux-mm
This reverts commit 559089e0a93d44280ec3ab478830af319c56dbe3.

The previous commit fixes huge vmalloc for drivers that use the struct
pages returned by vmalloc_to_page(), so the VM_ALLOW_HUGE_VMAP opt-in
can be reverted back to the VM_NO_HUGE_VMAP opt-out.
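
With that fixed, huge vmalloc can again be on by default on
HAVE_ARCH_HUGE_VMALLOC architectures, with callers opting out rather
than in. A minimal sketch of the restored caller-side choice (the
wrapper name is hypothetical; vmalloc_no_huge() and VM_NO_HUGE_VMAP are
what this revert restores):

	/* Allocation that must be mapped with PAGE_SIZE ptes. */
	static void *alloc_page_mapped(unsigned long size)
	{
		return vmalloc_no_huge(size);
	}

Callers that need the flag form instead pass VM_NO_HUGE_VMAP to
__vmalloc_node_range(), as the powerpc module allocator does below.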
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/Kconfig | 6 ++++--
arch/powerpc/kernel/module.c | 2 +-
arch/s390/kvm/pv.c | 7 ++++++-
include/linux/vmalloc.h | 4 ++--
mm/vmalloc.c | 17 +++++++----------
5 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 31c4fdc4a4ba..29b0167c088b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -854,8 +854,10 @@ config HAVE_ARCH_HUGE_VMAP
 
 #
 #  Archs that select this would be capable of PMD-sized vmaps (i.e.,
-#  arch_vmap_pmd_supported() returns true). The VM_ALLOW_HUGE_VMAP flag
-#  must be used to enable allocations to use hugepages.
+#  arch_vmap_pmd_supported() returns true), and they must make no assumptions
+#  that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
+#  can be used to prohibit arch-specific allocations from using hugepages to
+#  help with this (e.g., modules may require it).
 #
 config HAVE_ARCH_HUGE_VMALLOC
 	depends on HAVE_ARCH_HUGE_VMAP
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index 97a76a8619fb..40a583e9d3c7 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -101,7 +101,7 @@ __module_alloc(unsigned long size, unsigned long start, unsigned long end, bool
 	 * too.
 	 */
 	return __vmalloc_node_range(size, 1, start, end, gfp, prot,
-				    VM_FLUSH_RESET_PERMS,
+				    VM_FLUSH_RESET_PERMS | VM_NO_HUGE_VMAP,
 				    NUMA_NO_NODE, __builtin_return_address(0));
 }
 
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index cc7c9599f43e..7f7c0d6af2ce 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -137,7 +137,12 @@ static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
 	/* Allocate variable storage */
 	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
 	vlen += uv_info.guest_virt_base_stor_len;
-	kvm->arch.pv.stor_var = vzalloc(vlen);
+	/*
+	 * The Create Secure Configuration Ultravisor Call does not support
+	 * using large pages for the virtual memory area.
+	 * This is a hardware limitation.
+	 */
+	kvm->arch.pv.stor_var = vmalloc_no_huge(vlen);
 	if (!kvm->arch.pv.stor_var)
 		goto out_err;
 	return 0;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index b159c2789961..3b1df7da402d 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -26,7 +26,7 @@ struct notifier_block;		/* in notifier.h */
 #define VM_KASAN		0x00000080	/* has allocated kasan shadow memory */
 #define VM_FLUSH_RESET_PERMS	0x00000100	/* reset direct map and flush TLB on unmap, can't be freed in atomic context */
 #define VM_MAP_PUT_PAGES	0x00000200	/* put pages and free array in vfree */
-#define VM_ALLOW_HUGE_VMAP	0x00000400	/* Allow for huge pages on archs with HAVE_ARCH_HUGE_VMALLOC */
+#define VM_NO_HUGE_VMAP		0x00000400	/* force PAGE_SIZE pte mapping */
 
 #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
 	!defined(CONFIG_KASAN_VMALLOC)
@@ -153,7 +153,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			const void *caller) __alloc_size(1);
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_no_huge(unsigned long size) __alloc_size(1);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index cadfbb5155ea..09470361dc03 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3101,7 +3101,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 		return NULL;
 	}
 
-	if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
+	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) {
 		unsigned long size_per_node;
 
 		/*
@@ -3268,24 +3268,21 @@ void *vmalloc(unsigned long size)
 EXPORT_SYMBOL(vmalloc);
 
 /**
- * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
- * @size:      allocation size
- * @gfp_mask:  flags for the page level allocator
+ * vmalloc_no_huge - allocate virtually contiguous memory using small pages
+ * @size:     allocation size
  *
- * Allocate enough pages to cover @size from the page level
+ * Allocate enough non-huge pages to cover @size from the page level
  * allocator and map them into contiguous kernel virtual space.
- * If @size is greater than or equal to PMD_SIZE, allow using
- * huge pages for the memory
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_no_huge(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-			gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+			GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
 			NUMA_NO_NODE, __builtin_return_address(0));
 }
-EXPORT_SYMBOL_GPL(vmalloc_huge);
+EXPORT_SYMBOL(vmalloc_no_huge);
 
 /**
  * vzalloc - allocate virtually contiguous memory with zero fill
--
2.35.1
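
To summarize the flag semantics restored above, a sketch of the mapping
decision (vmap_allow_huge and VM_NO_HUGE_VMAP appear in the patch; the
helper name is hypothetical):

	/* May this allocation use huge page mappings? Default-on, opt-out. */
	static bool want_huge_vmap(unsigned long vm_flags)
	{
		return vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP);
	}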