From: Muchun Song <songmuchun@bytedance.com>
To: Joao Martins <joao.m.martins@oracle.com>
Cc: Linux Memory Management List <linux-mm@kvack.org>,
Dan Williams <dan.j.williams@intel.com>,
Vishal Verma <vishal.l.verma@intel.com>,
Matthew Wilcox <willy@infradead.org>,
Jason Gunthorpe <jgg@ziepe.ca>, Jane Chu <jane.chu@oracle.com>,
Mike Kravetz <mike.kravetz@oracle.com>,
Andrew Morton <akpm@linux-foundation.org>,
Jonathan Corbet <corbet@lwn.net>, Christoph Hellwig <hch@lst.de>,
nvdimm@lists.linux.dev,
Linux Doc Mailing List <linux-doc@vger.kernel.org>
Subject: Re: [PATCH v5 4/5] mm/sparse-vmemmap: improve memory savings for compound devmaps
Date: Fri, 11 Feb 2022 15:54:36 +0800
Message-ID: <CAMZfGtUEaFg=CGLRJomyumsZzcyn8O0JE1+De2Vd3a5remcH6w@mail.gmail.com>
In-Reply-To: <20220210193345.23628-5-joao.m.martins@oracle.com>
On Fri, Feb 11, 2022 at 3:34 AM Joao Martins <joao.m.martins@oracle.com> wrote:
[...]
> pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
> - struct vmem_altmap *altmap)
> + struct vmem_altmap *altmap,
> + struct page *block)
Why not use the name "reuse" instead of "block"?
"reuse" seems clearer.
> {
> pte_t *pte = pte_offset_kernel(pmd, addr);
> if (pte_none(*pte)) {
> pte_t entry;
> void *p;
>
> - p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> - if (!p)
> - return NULL;
> + if (!block) {
> + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> + if (!p)
> + return NULL;
> + } else {
> + /*
> + * When a PTE/PMD entry is freed from the init_mm
> + * there's a free_pages() call to this page allocated
> + * above. Thus this get_page() is paired with the
> + * put_page_testzero() on the freeing path.
> + * This can only be called by certain ZONE_DEVICE paths,
> + * and through vmemmap_populate_compound_pages() when
> + * slab is available.
> + */
> + get_page(block);
> + p = page_to_virt(block);
> + }
> entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
> set_pte_at(&init_mm, addr, pte, entry);
> }
> @@ -609,7 +624,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
> }
>
> static int __meminit vmemmap_populate_address(unsigned long addr, int node,
> - struct vmem_altmap *altmap)
> + struct vmem_altmap *altmap,
> + struct page *reuse, struct page **page)
We can remove the last argument (struct page **page) if we change
the return type to "pte_t *". Simpler, don't you think?
> {
> pgd_t *pgd;
> p4d_t *p4d;
> @@ -629,11 +645,13 @@ static int __meminit vmemmap_populate_address(unsigned long addr, int node,
> pmd = vmemmap_pmd_populate(pud, addr, node);
> if (!pmd)
> return -ENOMEM;
> - pte = vmemmap_pte_populate(pmd, addr, node, altmap);
> + pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
> if (!pte)
> return -ENOMEM;
> vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
>
> + if (page)
> + *page = pte_page(*pte);
> return 0;
> }
>
> @@ -644,10 +662,120 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
> int rc;
>
> for (; addr < end; addr += PAGE_SIZE) {
> - rc = vmemmap_populate_address(addr, node, altmap);
> + rc = vmemmap_populate_address(addr, node, altmap, NULL, NULL);
> if (rc)
> return rc;
> + }
> +
> + return 0;
> +}
> +
> +static int __meminit vmemmap_populate_range(unsigned long start,
> + unsigned long end,
> + int node, struct page *page)
> +{
> + unsigned long addr = start;
> + int rc;
>
> + for (; addr < end; addr += PAGE_SIZE) {
> + rc = vmemmap_populate_address(addr, node, NULL, page, NULL);
> + if (rc)
> + return rc;
> + }
> +
> + return 0;
> +}
> +
> +static inline int __meminit vmemmap_populate_page(unsigned long addr, int node,
> + struct page **page)
> +{
> + return vmemmap_populate_address(addr, node, NULL, NULL, page);
> +}
> +
> +/*
> + * For compound pages bigger than section size (e.g. x86 1G compound
> + * pages with 2M subsection size) fill the rest of sections as tail
> + * pages.
> + *
> + * Note that memremap_pages() resets @nr_range value and will increment
> + * it after each successful range onlining. Thus the value of @nr_range
> + * at section memmap populate corresponds to the in-progress range
> + * being onlined here.
> + */
> +static bool __meminit reuse_compound_section(unsigned long start_pfn,
> + struct dev_pagemap *pgmap)
> +{
> + unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
> + unsigned long offset = start_pfn -
> + PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
> +
> + return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
> +}
> +
> +static struct page * __meminit compound_section_tail_page(unsigned long addr)
> +{
> + pte_t *ptep;
> +
> + addr -= PAGE_SIZE;
> +
> + /*
> + * Assuming sections are populated sequentially, the previous section's
> + * page data can be reused.
> + */
> + ptep = pte_offset_kernel(pmd_off_k(addr), addr);
> + if (!ptep)
> + return NULL;
> +
> + return pte_page(*ptep);
> +}
> +
> +static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
> + unsigned long start,
> + unsigned long end, int node,
> + struct dev_pagemap *pgmap)
> +{
> + unsigned long size, addr;
> +
> + if (reuse_compound_section(start_pfn, pgmap)) {
> + struct page *page;
> +
> + page = compound_section_tail_page(start);
> + if (!page)
> + return -ENOMEM;
> +
> + /*
> + * Reuse the page that was populated in the prior iteration
> + * with just tail struct pages.
> + */
> + return vmemmap_populate_range(start, end, node, page);
> + }
> +
> + size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
> + for (addr = start; addr < end; addr += size) {
> + unsigned long next = addr, last = addr + size;
> + struct page *block;
> + int rc;
> +
> + /* Populate the head page vmemmap page */
> + rc = vmemmap_populate_page(addr, node, NULL);
> + if (rc)
> + return rc;
> +
> + /* Populate the tail pages vmemmap page */
> + block = NULL;
> + next = addr + PAGE_SIZE;
> + rc = vmemmap_populate_page(next, node, &block);
> + if (rc)
> + return rc;
> +
> + /*
> + * Reuse the previous page for the rest of tail pages
> + * See layout diagram in Documentation/vm/vmemmap_dedup.rst
> + */
> + next += PAGE_SIZE;
> + rc = vmemmap_populate_range(next, last, node, block);
> + if (rc)
> + return rc;
> }
>
> return 0;
> @@ -659,12 +787,18 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
> {
> unsigned long start = (unsigned long) pfn_to_page(pfn);
> unsigned long end = start + nr_pages * sizeof(struct page);
> + int r;
>
> if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
> !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
> return NULL;
>
> - if (vmemmap_populate(start, end, nid, altmap))
> + if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
Should we add a check like "is_power_of_2(sizeof(struct page))", since
this optimization only applies when struct pages do not cross page
boundaries?
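I.e. extend the condition to something like the following (untested), with
is_power_of_2() from <linux/log2.h>:

	if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap &&
	    is_power_of_2(sizeof(struct page)))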
Thanks.