* [PATCH v3 1/3] mm: sparsemem: split the huge PMD mapping of vmemmap pages
2021-06-16 9:49 [PATCH v3 0/3] Split huge PMD mapping of vmemmap pages Muchun Song
@ 2021-06-16 9:49 ` Muchun Song
2021-06-16 9:49 ` [PATCH v3 2/3] mm: sparsemem: use huge PMD mapping for " Muchun Song
2021-06-16 9:49 ` [PATCH v3 3/3] mm: hugetlb: introduce CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON Muchun Song
2 siblings, 0 replies; 4+ messages in thread
From: Muchun Song @ 2021-06-16 9:49 UTC (permalink / raw)
To: mike.kravetz, akpm, osalvador, mhocko, song.bao.hua, david,
chenhuang5, bodeddub, corbet
Cc: duanxiongchun, fam.zheng, linux-doc, linux-kernel, linux-mm, Muchun Song
In [1], PMD mappings of vmemmap pages were disabled if the the feature
hugetlb_free_vmemmap was enabled. This was done to simplify the initial
implementation of vmmemap freeing for hugetlb pages. Now, remove this
simplification by allowing PMD mapping and switching to PTE mappings as
needed for allocated hugetlb pages.
When a hugetlb page is allocated, the vmemmap page tables are walked to
free vmemmap pages. During this walk, split huge PMD mappings to PTE
mappings as required. In the unlikely case PTE pages can not be
allocated, return error(ENOMEM) and do not optimize vmemmap of the
hugetlb page.
When HugeTLB pages are freed from the pool, we do not attempt to
coalesce and move back to a PMD mapping because it is much more complex.
[1] https://lkml.kernel.org/r/20210510030027.56044-8-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
---
include/linux/mm.h | 4 +-
mm/hugetlb_vmemmap.c | 5 +-
mm/sparse-vmemmap.c | 163 +++++++++++++++++++++++++++++++++++++++------------
3 files changed, 129 insertions(+), 43 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cadc8cc2c715..8284e8ed30c9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3055,8 +3055,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
}
#endif
-void vmemmap_remap_free(unsigned long start, unsigned long end,
- unsigned long reuse);
+int vmemmap_remap_free(unsigned long start, unsigned long end,
+ unsigned long reuse);
int vmemmap_remap_alloc(unsigned long start, unsigned long end,
unsigned long reuse, gfp_t gfp_mask);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index f9f9bb212319..06802056f296 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -258,9 +258,8 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head)
* to the page which @vmemmap_reuse is mapped to, then free the pages
* which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
*/
- vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse);
-
- SetHPageVmemmapOptimized(head);
+ if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+ SetHPageVmemmapOptimized(head);
}
void __init hugetlb_vmemmap_init(struct hstate *h)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 693de0aec7a8..b402c00bf75b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -38,6 +38,7 @@
* vmemmap_remap_walk - walk vmemmap page table
*
* @remap_pte: called for each lowest-level entry (PTE).
+ * @nr_walked: the number of walked pte.
* @reuse_page: the page which is reused for the tail vmemmap pages.
* @reuse_addr: the virtual address of the @reuse_page page.
* @vmemmap_pages: the list head of the vmemmap pages that can be freed
@@ -46,11 +47,44 @@
struct vmemmap_remap_walk {
void (*remap_pte)(pte_t *pte, unsigned long addr,
struct vmemmap_remap_walk *walk);
+ unsigned long nr_walked;
struct page *reuse_page;
unsigned long reuse_addr;
struct list_head *vmemmap_pages;
};
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
+ struct vmemmap_remap_walk *walk)
+{
+ pmd_t __pmd;
+ int i;
+ unsigned long addr = start;
+ struct page *page = pmd_page(*pmd);
+ pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
+
+ if (!pgtable)
+ return -ENOMEM;
+
+ pmd_populate_kernel(&init_mm, &__pmd, pgtable);
+
+ for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ pgprot_t pgprot = PAGE_KERNEL;
+
+ entry = mk_pte(page + i, pgprot);
+ pte = pte_offset_kernel(&__pmd, addr);
+ set_pte_at(&init_mm, addr, pte, entry);
+ }
+
+ /* Make pte visible before pmd. See comment in __pte_alloc(). */
+ smp_wmb();
+ pmd_populate_kernel(&init_mm, pmd, pgtable);
+
+ flush_tlb_kernel_range(start, start + PMD_SIZE);
+
+ return 0;
+}
+
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end,
struct vmemmap_remap_walk *walk)
@@ -69,58 +103,80 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
*/
addr += PAGE_SIZE;
pte++;
+ walk->nr_walked++;
}
- for (; addr != end; addr += PAGE_SIZE, pte++)
+ for (; addr != end; addr += PAGE_SIZE, pte++) {
walk->remap_pte(pte, addr, walk);
+ walk->nr_walked++;
+ }
}
-static void vmemmap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
+static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_offset(pud, addr);
do {
- BUG_ON(pmd_leaf(*pmd));
+ if (pmd_leaf(*pmd)) {
+ int ret;
+ ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
+ if (ret)
+ return ret;
+ }
next = pmd_addr_end(addr, end);
vmemmap_pte_range(pmd, addr, next, walk);
} while (pmd++, addr = next, addr != end);
+
+ return 0;
}
-static void vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
+static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
{
pud_t *pud;
unsigned long next;
pud = pud_offset(p4d, addr);
do {
+ int ret;
+
next = pud_addr_end(addr, end);
- vmemmap_pmd_range(pud, addr, next, walk);
+ ret = vmemmap_pmd_range(pud, addr, next, walk);
+ if (ret)
+ return ret;
} while (pud++, addr = next, addr != end);
+
+ return 0;
}
-static void vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end,
- struct vmemmap_remap_walk *walk)
+static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
{
p4d_t *p4d;
unsigned long next;
p4d = p4d_offset(pgd, addr);
do {
+ int ret;
+
next = p4d_addr_end(addr, end);
- vmemmap_pud_range(p4d, addr, next, walk);
+ ret = vmemmap_pud_range(p4d, addr, next, walk);
+ if (ret)
+ return ret;
} while (p4d++, addr = next, addr != end);
+
+ return 0;
}
-static void vmemmap_remap_range(unsigned long start, unsigned long end,
- struct vmemmap_remap_walk *walk)
+static int vmemmap_remap_range(unsigned long start, unsigned long end,
+ struct vmemmap_remap_walk *walk)
{
unsigned long addr = start;
unsigned long next;
@@ -131,8 +187,12 @@ static void vmemmap_remap_range(unsigned long start, unsigned long end,
pgd = pgd_offset_k(addr);
do {
+ int ret;
+
next = pgd_addr_end(addr, end);
- vmemmap_p4d_range(pgd, addr, next, walk);
+ ret = vmemmap_p4d_range(pgd, addr, next, walk);
+ if (ret)
+ return ret;
} while (pgd++, addr = next, addr != end);
/*
@@ -141,6 +201,8 @@ static void vmemmap_remap_range(unsigned long start, unsigned long end,
* belongs to the range.
*/
flush_tlb_kernel_range(start + PAGE_SIZE, end);
+
+ return 0;
}
/*
@@ -179,10 +241,27 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
pte_t entry = mk_pte(walk->reuse_page, pgprot);
struct page *page = pte_page(*pte);
- list_add(&page->lru, walk->vmemmap_pages);
+ list_add_tail(&page->lru, walk->vmemmap_pages);
set_pte_at(&init_mm, addr, pte, entry);
}
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ pgprot_t pgprot = PAGE_KERNEL;
+ struct page *page;
+ void *to;
+
+ BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+ page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+ list_del(&page->lru);
+ to = page_to_virt(page);
+ copy_page(to, (void *)walk->reuse_addr);
+
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
/**
* vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
* to the page which @reuse is mapped to, then free vmemmap
@@ -193,12 +272,12 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* remap.
* @reuse: reuse address.
*
- * Note: This function depends on vmemmap being base page mapped. Please make
- * sure that we disable PMD mapping of vmemmap pages when calling this function.
+ * Return: %0 on success, negative error code otherwise.
*/
-void vmemmap_remap_free(unsigned long start, unsigned long end,
- unsigned long reuse)
+int vmemmap_remap_free(unsigned long start, unsigned long end,
+ unsigned long reuse)
{
+ int ret;
LIST_HEAD(vmemmap_pages);
struct vmemmap_remap_walk walk = {
.remap_pte = vmemmap_remap_pte,
@@ -221,25 +300,31 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
*/
BUG_ON(start - reuse != PAGE_SIZE);
- vmemmap_remap_range(reuse, end, &walk);
- free_vmemmap_page_list(&vmemmap_pages);
-}
+ mmap_write_lock(&init_mm);
+ ret = vmemmap_remap_range(reuse, end, &walk);
+ mmap_write_downgrade(&init_mm);
-static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
- struct vmemmap_remap_walk *walk)
-{
- pgprot_t pgprot = PAGE_KERNEL;
- struct page *page;
- void *to;
+ if (ret && walk.nr_walked) {
+ end = reuse + walk.nr_walked * PAGE_SIZE;
+ /*
+ * vmemmap_pages contains pages from the previous
+ * vmemmap_remap_range call which failed. These
+ * are pages which were removed from the vmemmap.
+ * They will be restored in the following call.
+ */
+ walk = (struct vmemmap_remap_walk) {
+ .remap_pte = vmemmap_restore_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
- BUG_ON(pte_page(*pte) != walk->reuse_page);
+ vmemmap_remap_range(reuse, end, &walk);
+ }
+ mmap_read_unlock(&init_mm);
- page = list_first_entry(walk->vmemmap_pages, struct page, lru);
- list_del(&page->lru);
- to = page_to_virt(page);
- copy_page(to, (void *)walk->reuse_addr);
+ free_vmemmap_page_list(&vmemmap_pages);
- set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+ return ret;
}
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
@@ -273,6 +358,8 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
* remap.
* @reuse: reuse address.
* @gpf_mask: GFP flag for allocating vmemmap pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int vmemmap_remap_alloc(unsigned long start, unsigned long end,
unsigned long reuse, gfp_t gfp_mask)
@@ -287,12 +374,12 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
/* See the comment in the vmemmap_remap_free(). */
BUG_ON(start - reuse != PAGE_SIZE);
- might_sleep_if(gfpflags_allow_blocking(gfp_mask));
-
if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
return -ENOMEM;
+ mmap_read_lock(&init_mm);
vmemmap_remap_range(reuse, end, &walk);
+ mmap_read_unlock(&init_mm);
return 0;
}
--
2.11.0
^ permalink raw reply [flat|nested] 4+ messages in thread* [PATCH v3 2/3] mm: sparsemem: use huge PMD mapping for vmemmap pages
2021-06-16 9:49 [PATCH v3 0/3] Split huge PMD mapping of vmemmap pages Muchun Song
2021-06-16 9:49 ` [PATCH v3 1/3] mm: sparsemem: split the " Muchun Song
@ 2021-06-16 9:49 ` Muchun Song
2021-06-16 9:49 ` [PATCH v3 3/3] mm: hugetlb: introduce CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON Muchun Song
2 siblings, 0 replies; 4+ messages in thread
From: Muchun Song @ 2021-06-16 9:49 UTC (permalink / raw)
To: mike.kravetz, akpm, osalvador, mhocko, song.bao.hua, david,
chenhuang5, bodeddub, corbet
Cc: duanxiongchun, fam.zheng, linux-doc, linux-kernel, linux-mm, Muchun Song
The preparation of splitting huge PMD mapping of vmemmap pages is ready,
so switch the mapping from PTE to PMD.
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
---
Documentation/admin-guide/kernel-parameters.txt | 7 -------
arch/x86/mm/init_64.c | 8 ++------
include/linux/hugetlb.h | 25 ++++++-------------------
mm/memory_hotplug.c | 2 +-
4 files changed, 9 insertions(+), 33 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index db1ef6739613..a01aadafee38 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1599,13 +1599,6 @@
enabled.
Allows heavy hugetlb users to free up some more
memory (6 * PAGE_SIZE for each 2MB hugetlb page).
- This feauture is not free though. Large page
- tables are not used to back vmemmap pages which
- can lead to a performance degradation for some
- workloads. Also there will be memory allocation
- required when hugetlb pages are freed from the
- pool which can lead to corner cases under heavy
- memory pressure.
Format: { on | off (default) }
on: enable the feature
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9d9d18d0c2a1..65ea58527176 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,7 +34,6 @@
#include <linux/gfp.h>
#include <linux/kcore.h>
#include <linux/bootmem_info.h>
-#include <linux/hugetlb.h>
#include <asm/processor.h>
#include <asm/bios_ebda.h>
@@ -1610,8 +1609,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
- if ((is_hugetlb_free_vmemmap_enabled() && !altmap) ||
- end - start < PAGES_PER_SECTION * sizeof(struct page))
+ if (end - start < PAGES_PER_SECTION * sizeof(struct page))
err = vmemmap_populate_basepages(start, end, node, NULL);
else if (boot_cpu_has(X86_FEATURE_PSE))
err = vmemmap_populate_hugepages(start, end, node, altmap);
@@ -1639,8 +1637,6 @@ void register_page_bootmem_memmap(unsigned long section_nr,
pmd_t *pmd;
unsigned int nr_pmd_pages;
struct page *page;
- bool base_mapping = !boot_cpu_has(X86_FEATURE_PSE) ||
- is_hugetlb_free_vmemmap_enabled();
for (; addr < end; addr = next) {
pte_t *pte = NULL;
@@ -1666,7 +1662,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
}
get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
- if (base_mapping) {
+ if (!boot_cpu_has(X86_FEATURE_PSE)) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 03ca83db0a3e..d43565dd5fb9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -904,20 +904,6 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
}
#endif
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
-extern bool hugetlb_free_vmemmap_enabled;
-
-static inline bool is_hugetlb_free_vmemmap_enabled(void)
-{
- return hugetlb_free_vmemmap_enabled;
-}
-#else
-static inline bool is_hugetlb_free_vmemmap_enabled(void)
-{
- return false;
-}
-#endif
-
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
@@ -1077,13 +1063,14 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
pte_t *ptep, pte_t pte, unsigned long sz)
{
}
-
-static inline bool is_hugetlb_free_vmemmap_enabled(void)
-{
- return false;
-}
#endif /* CONFIG_HUGETLB_PAGE */
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+extern bool hugetlb_free_vmemmap_enabled;
+#else
+#define hugetlb_free_vmemmap_enabled false
+#endif
+
static inline spinlock_t *huge_pte_lock(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d96a3c7551c8..9d8a551c08d5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1056,7 +1056,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
* populate a single PMD.
*/
return memmap_on_memory &&
- !is_hugetlb_free_vmemmap_enabled() &&
+ !hugetlb_free_vmemmap_enabled &&
IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
size == memory_block_size_bytes() &&
IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
--
2.11.0
^ permalink raw reply [flat|nested] 4+ messages in thread