From: Muchun Song <songmuchun@bytedance.com>
To: Joao Martins <joao.m.martins@oracle.com>
Cc: Linux Memory Management List <linux-mm@kvack.org>,
	Dan Williams <dan.j.williams@intel.com>,
	 Vishal Verma <vishal.l.verma@intel.com>,
	Matthew Wilcox <willy@infradead.org>,
	 Jason Gunthorpe <jgg@ziepe.ca>, Jane Chu <jane.chu@oracle.com>,
	 Mike Kravetz <mike.kravetz@oracle.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	 Jonathan Corbet <corbet@lwn.net>, Christoph Hellwig <hch@lst.de>,
	nvdimm@lists.linux.dev,
	 Linux Doc Mailing List <linux-doc@vger.kernel.org>
Subject: Re: [PATCH v5 4/5] mm/sparse-vmemmap: improve memory savings for compound devmaps
Date: Fri, 11 Feb 2022 15:54:36 +0800
Message-ID: <CAMZfGtUEaFg=CGLRJomyumsZzcyn8O0JE1+De2Vd3a5remcH6w@mail.gmail.com>
In-Reply-To: <20220210193345.23628-5-joao.m.martins@oracle.com>

On Fri, Feb 11, 2022 at 3:34 AM Joao Martins <joao.m.martins@oracle.com> wrote:
[...]
>  pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
> -                                      struct vmem_altmap *altmap)
> +                                      struct vmem_altmap *altmap,
> +                                      struct page *block)

Why not use the name "reuse" instead of "block"?
"reuse" seems clearer.
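
E.g. (untested, just to sketch the naming I have in mind; this is the same
code as your hunk, only the parameter renamed):

pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr,
				       int node, struct vmem_altmap *altmap,
				       struct page *reuse)

and correspondingly in the body:

	if (!reuse) {
		p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
		if (!p)
			return NULL;
	} else {
		/* paired with put_page_testzero() on the freeing path */
		get_page(reuse);
		p = page_to_virt(reuse);
	}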

>  {
>         pte_t *pte = pte_offset_kernel(pmd, addr);
>         if (pte_none(*pte)) {
>                 pte_t entry;
>                 void *p;
>
> -               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> -               if (!p)
> -                       return NULL;
> +               if (!block) {
> +                       p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
> +                       if (!p)
> +                               return NULL;
> +               } else {
> +                       /*
> +                        * When a PTE/PMD entry is freed from the init_mm
> +                        * there's a free_pages() call to this page allocated
> +                        * above. Thus this get_page() is paired with the
> +                        * put_page_testzero() on the freeing path.
> +                        * This can only be called by certain ZONE_DEVICE paths,
> +                        * and through vmemmap_populate_compound_pages() when
> +                        * slab is available.
> +                        */
> +                       get_page(block);
> +                       p = page_to_virt(block);
> +               }
>                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
>                 set_pte_at(&init_mm, addr, pte, entry);
>         }
> @@ -609,7 +624,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
>  }
>
>  static int __meminit vmemmap_populate_address(unsigned long addr, int node,
> -                                             struct vmem_altmap *altmap)
> +                                             struct vmem_altmap *altmap,
> +                                             struct page *reuse, struct page **page)

We can remove the last argument (struct page **page) if we change
the return type to "pte_t *". Simpler, don't you think?
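
Something along these lines (untested, just to sketch the shape; the
pgd/p4d/pud/pmd walk is unchanged from the current code, and I'm using
the "reuse" name suggested above):

static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
						  struct vmem_altmap *altmap,
						  struct page *reuse)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = vmemmap_pgd_populate(addr, node);
	if (!pgd)
		return NULL;
	p4d = vmemmap_p4d_populate(pgd, addr, node);
	if (!p4d)
		return NULL;
	pud = vmemmap_pud_populate(p4d, addr, node);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_populate(pud, addr, node);
	if (!pmd)
		return NULL;
	pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
	if (!pte)
		return NULL;
	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

	return pte;
}

Then the one caller that actually needs the page can do pte_page(*pte)
itself, and the others just translate NULL into -ENOMEM.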

>  {
>         pgd_t *pgd;
>         p4d_t *p4d;
> @@ -629,11 +645,13 @@ static int __meminit vmemmap_populate_address(unsigned long addr, int node,
>         pmd = vmemmap_pmd_populate(pud, addr, node);
>         if (!pmd)
>                 return -ENOMEM;
> -       pte = vmemmap_pte_populate(pmd, addr, node, altmap);
> +       pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
>         if (!pte)
>                 return -ENOMEM;
>         vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
>
> +       if (page)
> +               *page = pte_page(*pte);
>         return 0;
>  }
>
> @@ -644,10 +662,120 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
>         int rc;
>
>         for (; addr < end; addr += PAGE_SIZE) {
> -               rc = vmemmap_populate_address(addr, node, altmap);
> +               rc = vmemmap_populate_address(addr, node, altmap, NULL, NULL);
>                 if (rc)
>                         return rc;
> +       }
> +
> +       return 0;
> +}
> +
> +static int __meminit vmemmap_populate_range(unsigned long start,
> +                                           unsigned long end,
> +                                           int node, struct page *page)
> +{
> +       unsigned long addr = start;
> +       int rc;
>
> +       for (; addr < end; addr += PAGE_SIZE) {
> +               rc = vmemmap_populate_address(addr, node, NULL, page, NULL);
> +               if (rc)
> +                       return rc;
> +       }
> +
> +       return 0;
> +}
> +
> +static inline int __meminit vmemmap_populate_page(unsigned long addr, int node,
> +                                                 struct page **page)
> +{
> +       return vmemmap_populate_address(addr, node, NULL, NULL, page);
> +}
> +
> +/*
> + * For compound pages bigger than section size (e.g. x86 1G compound
> + * pages with 2M subsection size) fill the rest of sections as tail
> + * pages.
> + *
> + * Note that memremap_pages() resets @nr_range value and will increment
> + * it after each successful range onlining. Thus the value of @nr_range
> + * at section memmap populate corresponds to the in-progress range
> + * being onlined here.
> + */
> +static bool __meminit reuse_compound_section(unsigned long start_pfn,
> +                                            struct dev_pagemap *pgmap)
> +{
> +       unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
> +       unsigned long offset = start_pfn -
> +               PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
> +
> +       return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
> +}
> +
> +static struct page * __meminit compound_section_tail_page(unsigned long addr)
> +{
> +       pte_t *ptep;
> +
> +       addr -= PAGE_SIZE;
> +
> +       /*
> +        * Assuming sections are populated sequentially, the previous section's
> +        * page data can be reused.
> +        */
> +       ptep = pte_offset_kernel(pmd_off_k(addr), addr);
> +       if (!ptep)
> +               return NULL;
> +
> +       return pte_page(*ptep);
> +}
> +
> +static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
> +                                                    unsigned long start,
> +                                                    unsigned long end, int node,
> +                                                    struct dev_pagemap *pgmap)
> +{
> +       unsigned long size, addr;
> +
> +       if (reuse_compound_section(start_pfn, pgmap)) {
> +               struct page *page;
> +
> +               page = compound_section_tail_page(start);
> +               if (!page)
> +                       return -ENOMEM;
> +
> +               /*
> +                * Reuse the page that was populated in the prior iteration
> +                * with just tail struct pages.
> +                */
> +               return vmemmap_populate_range(start, end, node, page);
> +       }
> +
> +       size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
> +       for (addr = start; addr < end; addr += size) {
> +               unsigned long next = addr, last = addr + size;
> +               struct page *block;
> +               int rc;
> +
> +               /* Populate the head page vmemmap page */
> +               rc = vmemmap_populate_page(addr, node, NULL);
> +               if (rc)
> +                       return rc;
> +
> +               /* Populate the tail pages vmemmap page */
> +               block = NULL;
> +               next = addr + PAGE_SIZE;
> +               rc = vmemmap_populate_page(next, node, &block);
> +               if (rc)
> +                       return rc;
> +
> +               /*
> +                * Reuse the previous page for the rest of tail pages
> +                * See layout diagram in Documentation/vm/vmemmap_dedup.rst
> +                */
> +               next += PAGE_SIZE;
> +               rc = vmemmap_populate_range(next, last, node, block);
> +               if (rc)
> +                       return rc;
>         }
>
>         return 0;
> @@ -659,12 +787,18 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
>  {
>         unsigned long start = (unsigned long) pfn_to_page(pfn);
>         unsigned long end = start + nr_pages * sizeof(struct page);
> +       int r;
>
>         if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
>                 !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
>                 return NULL;
>
> -       if (vmemmap_populate(start, end, nid, altmap))
> +       if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)

Should we add a check like "is_power_of_2(sizeof(struct page))", since
this optimization only applies when the size of struct page does not
cross page boundaries?
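
E.g. (untested; I'm assuming the check would sit next to the pgmap test in
this hunk, guessing the two branches from the names in this patch, and
is_power_of_2() comes from <linux/log2.h>):

	if (pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap &&
	    is_power_of_2(sizeof(struct page)))
		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
	else
		r = vmemmap_populate(start, end, nid, altmap);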

Thanks.


Thread overview: 18+ messages
2022-02-10 19:33 [PATCH v5 0/5] sparse-vmemmap: memory savings for compound devmaps (device-dax) Joao Martins
2022-02-10 19:33 ` [PATCH v5 1/5] mm/sparse-vmemmap: add a pgmap argument to section activation Joao Martins
2022-02-11  8:03   ` Muchun Song
2022-02-11 12:37     ` Joao Martins
2022-02-10 19:33 ` [PATCH v5 2/5] mm/sparse-vmemmap: refactor core of vmemmap_populate_basepages() to helper Joao Martins
2022-02-11  7:54   ` Muchun Song
2022-02-10 19:33 ` [PATCH v5 3/5] mm/hugetlb_vmemmap: move comment block to Documentation/vm Joao Martins
2022-02-10 19:33 ` [PATCH v5 4/5] mm/sparse-vmemmap: improve memory savings for compound devmaps Joao Martins
2022-02-11  7:54   ` Muchun Song [this message]
2022-02-11 12:37     ` Joao Martins
2022-02-12 10:08       ` Muchun Song
2022-02-12 14:49         ` Muchun Song
2022-02-14 10:57           ` Joao Martins
2022-02-14 10:55         ` Joao Martins
2022-02-10 19:33 ` [PATCH v5 5/5] mm/page_alloc: reuse tail struct pages " Joao Martins
2022-02-11  5:07   ` Muchun Song
2022-02-11 12:48     ` Joao Martins
2022-02-12 11:11       ` Muchun Song
