From: Ryan Roberts <ryan.roberts@arm.com>
To: Dev Jain <dev.jain@arm.com>,
akpm@linux-foundation.org, david@redhat.com, willy@infradead.org,
kirill.shutemov@linux.intel.com
Cc: anshuman.khandual@arm.com, catalin.marinas@arm.com,
cl@gentwo.org, vbabka@suse.cz, mhocko@suse.com,
apopple@nvidia.com, dave.hansen@linux.intel.com, will@kernel.org,
baohua@kernel.org, jack@suse.cz, srivatsa@csail.mit.edu,
haowenchao22@gmail.com, hughd@google.com,
aneesh.kumar@kernel.org, yang@os.amperecomputing.com,
peterx@redhat.com, ioworker0@gmail.com,
wangkefeng.wang@huawei.com, ziy@nvidia.com, jglisse@google.com,
surenb@google.com, vishal.moola@gmail.com, zokeefe@google.com,
zhengqi.arch@bytedance.com, jhubbard@nvidia.com,
21cnbao@gmail.com, linux-mm@kvack.org,
linux-kernel@vger.kernel.org
Subject: Re: [RFC PATCH 07/12] khugepaged: Scan PTEs order-wise
Date: Tue, 17 Dec 2024 18:15:09 +0000 [thread overview]
Message-ID: <d0e339a0-94c9-42e7-b62c-63fde6cffd71@arm.com> (raw)
In-Reply-To: <20241216165105.56185-8-dev.jain@arm.com>
On 16/12/2024 16:51, Dev Jain wrote:
> Scan the PTEs order-wise, using the mask of suitable orders for this VMA
> derived in conjunction with sysfs THP settings. Scale down the tunables; in
> case of collapse failure, we drop down to the next order. Otherwise, we try to
> jump to the highest possible order and then start a fresh scan. Note that
> madvise(MADV_COLLAPSE) has not been generalized.
Is there are reason you are not modifying MADV_COLLAPSE? It's really just a
synchonous way to do what khugepaged does asynchonously (isn't it?), so it would
behave the same way in an ideal world.
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
> mm/khugepaged.c | 84 ++++++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 69 insertions(+), 15 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 886c76816963..078794aa3335 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -20,6 +20,7 @@
> #include <linux/swapops.h>
> #include <linux/shmem_fs.h>
> #include <linux/ksm.h>
> +#include <linux/count_zeros.h>
>
> #include <asm/tlb.h>
> #include <asm/pgalloc.h>
> @@ -1111,7 +1112,7 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> }
>
> static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> - int referenced, int unmapped,
> + int referenced, int unmapped, int order,
> struct collapse_control *cc)
> {
> LIST_HEAD(compound_pagelist);
> @@ -1278,38 +1279,59 @@ static int hpage_collapse_scan_ptes(struct mm_struct *mm,
> unsigned long address, bool *mmap_locked,
> struct collapse_control *cc)
> {
> - pmd_t *pmd;
> - pte_t *pte, *_pte;
> - int result = SCAN_FAIL, referenced = 0;
> - int none_or_zero = 0, shared = 0;
> - struct page *page = NULL;
> + unsigned int max_ptes_shared, max_ptes_none, max_ptes_swap;
> + int referenced, shared, none_or_zero, unmapped;
> + unsigned long _address, org_address = address;
nit: Perhaps it's clearer to keep the original address in address and use a
variable, start, for the starting point of each scan?
> struct folio *folio = NULL;
> - unsigned long _address;
> - spinlock_t *ptl;
> - int node = NUMA_NO_NODE, unmapped = 0;
> + struct page *page = NULL;
> + int node = NUMA_NO_NODE;
> + int result = SCAN_FAIL;
> bool writable = false;
> + unsigned long orders;
> + pte_t *pte, *_pte;
> + spinlock_t *ptl;
> + pmd_t *pmd;
> + int order;
>
> VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>
> + orders = thp_vma_allowable_orders(vma, vma->vm_flags,
> + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER + 1) - 1);
Perhaps THP_ORDERS_ALL instead of "BIT(PMD_ORDER + 1) - 1"?
> + orders = thp_vma_suitable_orders(vma, address, orders);
> + order = highest_order(orders);
> +
> + /* MADV_COLLAPSE needs to work irrespective of sysfs setting */
> + if (!cc->is_khugepaged)
> + order = HPAGE_PMD_ORDER;
> +
> +scan_pte_range:
> +
> + max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
> + max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> + max_ptes_swap = khugepaged_max_ptes_swap >> (HPAGE_PMD_ORDER - order);
> + referenced = 0, shared = 0, none_or_zero = 0, unmapped = 0;
> +
> + /* Check pmd after taking mmap lock */
> result = find_pmd_or_thp_or_none(mm, address, &pmd);
> if (result != SCAN_SUCCEED)
> goto out;
>
> memset(cc->node_load, 0, sizeof(cc->node_load));
> nodes_clear(cc->alloc_nmask);
> +
> pte = pte_offset_map_lock(mm, pmd, address, &ptl);
> if (!pte) {
> result = SCAN_PMD_NULL;
> goto out;
> }
>
> - for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> + for (_address = address, _pte = pte; _pte < pte + (1UL << order);
> _pte++, _address += PAGE_SIZE) {
> pte_t pteval = ptep_get(_pte);
> if (is_swap_pte(pteval)) {
> ++unmapped;
> if (!cc->is_khugepaged ||
> - unmapped <= khugepaged_max_ptes_swap) {
> + unmapped <= max_ptes_swap) {
> /*
> * Always be strict with uffd-wp
> * enabled swap entries. Please see
> @@ -1330,7 +1352,7 @@ static int hpage_collapse_scan_ptes(struct mm_struct *mm,
> ++none_or_zero;
> if (!userfaultfd_armed(vma) &&
> (!cc->is_khugepaged ||
> - none_or_zero <= khugepaged_max_ptes_none)) {
> + none_or_zero <= max_ptes_none)) {
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> @@ -1375,7 +1397,7 @@ static int hpage_collapse_scan_ptes(struct mm_struct *mm,
> if (folio_likely_mapped_shared(folio)) {
> ++shared;
> if (cc->is_khugepaged &&
> - shared > khugepaged_max_ptes_shared) {
> + shared > max_ptes_shared) {
> result = SCAN_EXCEED_SHARED_PTE;
> count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> goto out_unmap;
> @@ -1432,7 +1454,7 @@ static int hpage_collapse_scan_ptes(struct mm_struct *mm,
> result = SCAN_PAGE_RO;
> } else if (cc->is_khugepaged &&
> (!referenced ||
> - (unmapped && referenced < HPAGE_PMD_NR / 2))) {
> + (unmapped && referenced < (1UL << order) / 2))) {
> result = SCAN_LACK_REFERENCED_PAGE;
> } else {
> result = SCAN_SUCCEED;
> @@ -1441,9 +1463,41 @@ static int hpage_collapse_scan_ptes(struct mm_struct *mm,
> pte_unmap_unlock(pte, ptl);
> if (result == SCAN_SUCCEED) {
> result = collapse_huge_page(mm, address, referenced,
> - unmapped, cc);
> + unmapped, order, cc);
> /* collapse_huge_page will return with the mmap_lock released */
> *mmap_locked = false;
> +
> + /* Immediately exit on exhaustion of range */
> + if (_address == org_address + (PAGE_SIZE << HPAGE_PMD_ORDER))
> + goto out;
Looks like this assumes this function is always asked to scan a full PTE table?
Does that mean that you can't handle collapse for VMAs that don't span a whole
PMD entry? I think we will want to support that.
> + }
> + if (result != SCAN_SUCCEED) {
> +
> + /* Go to the next order. */
> + order = next_order(&orders, order);
> + if (order < 2)
This should be:
if (!orders)
I think the return order is undefined when order is the last order in orders.
> + goto out;
> + goto maybe_mmap_lock;
> + } else {
> + address = _address;
> + pte = _pte;
> +
> +
> + /* Get highest order possible starting from address */
> + order = count_trailing_zeros(address >> PAGE_SHIFT);
> +
> + /* This needs to be present in the mask too */
> + if (!(orders & (1UL << order)))
> + order = next_order(&orders, order);
Not quite; if the exact order isn't in the bitmap, this will pick out the
highest order in the bitmap, which may be higher than count_trailing_zeros()
returned. You could do:
order = count_trailing_zeros(address >> PAGE_SHIFT);
orders &= (1UL << order + 1) - 1;
order = next_order(&orders, order);
if (!orders)
goto out;
That will mask out any orders that are bigger than the one returned by
count_trailing_zeros() then next_order() will return the highest order in the
remaining set.
But even that doesn't quite work because next_order() is destructive. Once you
arrive on a higher order address boundary, you want to be able to select a
higher order from the original orders bitmap. But you have lost them on a
previous go around the loop.
Perhaps stash orig_orders at the top of the function when you first calculate
it. Then I think this works (totally untested):
order = count_trailing_zeros(address >> PAGE_SHIFT);
orders = orig_orders & (1UL << order + 1) - 1;
order = next_order(&orders, order);
if (!orders)
goto out;
You might want to do something like this for the first go around the loop, but I
think address is currently always at the start of the PMD on entry, so not
needed until that restriction is removed.
> + if (order < 2)
> + goto out;
> +
> +maybe_mmap_lock:
> + if (!(*mmap_locked)) {
> + mmap_read_lock(mm);
Given the lock was already held in read mode on entering this function, then
released by collapse_huge_page(), is it definitely safe to retake this lock and
rerun this function? Is it possible that state that was checked before entering
this function has changed since the lock was released that would now need
re-checking?
> + *mmap_locked = true;
> + }
> + goto scan_pte_range;
> }
> out:
> trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
next prev parent reply other threads:[~2024-12-17 18:15 UTC|newest]
Thread overview: 74+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-12-16 16:50 [RFC PATCH 00/12] khugepaged: Asynchronous mTHP collapse Dev Jain
2024-12-16 16:50 ` [RFC PATCH 01/12] khugepaged: Rename hpage_collapse_scan_pmd() -> ptes() Dev Jain
2024-12-17 4:18 ` Matthew Wilcox
2024-12-17 5:52 ` Dev Jain
2024-12-17 6:43 ` Ryan Roberts
2024-12-17 18:11 ` Zi Yan
2024-12-17 19:12 ` Ryan Roberts
2024-12-16 16:50 ` [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio() Dev Jain
2024-12-17 2:51 ` Baolin Wang
2024-12-17 6:08 ` Dev Jain
2024-12-17 4:17 ` Matthew Wilcox
2024-12-17 7:09 ` Ryan Roberts
2024-12-17 13:00 ` Zi Yan
2024-12-20 17:41 ` Christoph Lameter (Ampere)
2024-12-20 17:45 ` Ryan Roberts
2024-12-20 18:47 ` Christoph Lameter (Ampere)
2025-01-02 11:21 ` Ryan Roberts
2024-12-17 6:53 ` Ryan Roberts
2024-12-17 9:06 ` Dev Jain
2024-12-16 16:50 ` [RFC PATCH 03/12] khugepaged: Generalize hugepage_vma_revalidate() Dev Jain
2024-12-17 4:21 ` Matthew Wilcox
2024-12-17 16:58 ` Ryan Roberts
2024-12-16 16:50 ` [RFC PATCH 04/12] khugepaged: Generalize __collapse_huge_page_swapin() Dev Jain
2024-12-17 4:24 ` Matthew Wilcox
2024-12-16 16:50 ` [RFC PATCH 05/12] khugepaged: Generalize __collapse_huge_page_isolate() Dev Jain
2024-12-17 4:32 ` Matthew Wilcox
2024-12-17 6:41 ` Dev Jain
2024-12-17 17:14 ` Ryan Roberts
2024-12-17 17:09 ` Ryan Roberts
2024-12-16 16:50 ` [RFC PATCH 06/12] khugepaged: Generalize __collapse_huge_page_copy_failed() Dev Jain
2024-12-17 17:22 ` Ryan Roberts
2024-12-18 8:49 ` Dev Jain
2024-12-16 16:51 ` [RFC PATCH 07/12] khugepaged: Scan PTEs order-wise Dev Jain
2024-12-17 18:15 ` Ryan Roberts [this message]
2024-12-18 9:24 ` Dev Jain
2025-01-06 10:04 ` Usama Arif
2025-01-07 7:17 ` Dev Jain
2024-12-16 16:51 ` [RFC PATCH 08/12] khugepaged: Abstract PMD-THP collapse Dev Jain
2024-12-17 19:24 ` Ryan Roberts
2024-12-18 9:26 ` Dev Jain
2024-12-16 16:51 ` [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio() Dev Jain
2024-12-16 17:06 ` David Hildenbrand
2024-12-16 19:08 ` Yang Shi
2024-12-17 10:07 ` Dev Jain
2024-12-17 10:32 ` David Hildenbrand
2024-12-18 8:35 ` Dev Jain
2025-01-02 10:08 ` Dev Jain
2025-01-02 11:33 ` David Hildenbrand
2025-01-03 8:17 ` Dev Jain
2025-01-02 11:22 ` David Hildenbrand
2024-12-18 15:59 ` Dev Jain
2025-01-06 10:17 ` Usama Arif
2025-01-07 8:12 ` Dev Jain
2024-12-16 16:51 ` [RFC PATCH 10/12] khugepaged: Skip PTE range if a larger mTHP is already mapped Dev Jain
2024-12-18 7:36 ` Ryan Roberts
2024-12-18 9:34 ` Dev Jain
2024-12-19 3:40 ` John Hubbard
2024-12-19 3:51 ` Zi Yan
2024-12-19 7:59 ` Dev Jain
2024-12-19 8:07 ` Dev Jain
2024-12-20 11:57 ` Ryan Roberts
2024-12-16 16:51 ` [RFC PATCH 11/12] khugepaged: Enable sysfs to control order of collapse Dev Jain
2024-12-16 16:51 ` [RFC PATCH 12/12] selftests/mm: khugepaged: Enlighten for mTHP collapse Dev Jain
2024-12-18 9:03 ` Ryan Roberts
2024-12-18 9:50 ` Dev Jain
2024-12-20 11:05 ` Ryan Roberts
2024-12-30 7:09 ` Dev Jain
2024-12-30 16:36 ` Zi Yan
2025-01-02 11:43 ` Ryan Roberts
2025-01-03 10:10 ` Dev Jain
2025-01-03 10:11 ` Dev Jain
2024-12-16 17:31 ` [RFC PATCH 00/12] khugepaged: Asynchronous " Dev Jain
2025-01-02 21:58 ` Nico Pache
2025-01-03 7:04 ` Dev Jain
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=d0e339a0-94c9-42e7-b62c-63fde6cffd71@arm.com \
--to=ryan.roberts@arm.com \
--cc=21cnbao@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=aneesh.kumar@kernel.org \
--cc=anshuman.khandual@arm.com \
--cc=apopple@nvidia.com \
--cc=baohua@kernel.org \
--cc=catalin.marinas@arm.com \
--cc=cl@gentwo.org \
--cc=dave.hansen@linux.intel.com \
--cc=david@redhat.com \
--cc=dev.jain@arm.com \
--cc=haowenchao22@gmail.com \
--cc=hughd@google.com \
--cc=ioworker0@gmail.com \
--cc=jack@suse.cz \
--cc=jglisse@google.com \
--cc=jhubbard@nvidia.com \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@suse.com \
--cc=peterx@redhat.com \
--cc=srivatsa@csail.mit.edu \
--cc=surenb@google.com \
--cc=vbabka@suse.cz \
--cc=vishal.moola@gmail.com \
--cc=wangkefeng.wang@huawei.com \
--cc=will@kernel.org \
--cc=willy@infradead.org \
--cc=yang@os.amperecomputing.com \
--cc=zhengqi.arch@bytedance.com \
--cc=ziy@nvidia.com \
--cc=zokeefe@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox