Re: [RFC PATCH 4/6] mm: thp: add support for split huge page to any lower order pages.

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Zi Yan <ziy@nvidia.com>
To: Ralph Campbell <rcampbell@nvidia.com>
Cc: <linux-mm@kvack.org>, Matthew Wilcox <willy@infradead.org>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	Roman Gushchin <guro@fb.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	<linux-kernel@vger.kernel.org>, <linux-kselftest@vger.kernel.org>,
	Yang Shi <shy828301@gmail.com>,
	"Michal Hocko" <mhocko@kernel.org>,
	John Hubbard <jhubbard@nvidia.com>,
	David Nellans <dnellans@nvidia.com>
Subject: Re: [RFC PATCH 4/6] mm: thp: add support for split huge page to any lower order pages.
Date: Thu, 12 Nov 2020 17:20:50 -0500	[thread overview]
Message-ID: <18B08769-FE06-4516-8426-86EB8A4AE4E5@nvidia.com> (raw)
In-Reply-To: <9b480b89-6a05-bdb7-c6ff-4518c008595a@nvidia.com>

[-- Attachment #1: Type: text/plain, Size: 11337 bytes --]

On 12 Nov 2020, at 17:01, Ralph Campbell wrote:

> On 11/11/20 12:40 PM, Zi Yan wrote:
>> From: Zi Yan <ziy@nvidia.com>
>>
>> To split a THP to any lower order pages, we need to reform THPs on
>> subpages at given order and add page refcount based on the new page
>> order. Also we need to reinitialize page_deferred_list after removing
>> the page from the split_queue, otherwise a subsequent split will see
>> list corruption when checking the page_deferred_list again.
>>
>> It has many uses, like minimizing the number of pages after
>> truncating a pagecache THP. For anonymous THPs, we can only split them
>> to order-0 like before until we add support for any size anonymous THPs.
>>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>>   include/linux/huge_mm.h |  8 +++++
>>   mm/huge_memory.c        | 78 +++++++++++++++++++++++++++++------------
>>   mm/swap.c               |  1 -
>>   3 files changed, 63 insertions(+), 24 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 60a907a19f7d..9819cd9b4619 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -189,6 +189,8 @@ bool is_transparent_hugepage(struct page *page);
>>    bool can_split_huge_page(struct page *page, int *pextra_pins);
>>   int split_huge_page_to_list(struct page *page, struct list_head *list);
>> +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>> +		unsigned int new_order);
>>   static inline int split_huge_page(struct page *page)
>>   {
>>   	return split_huge_page_to_list(page, NULL);
>> @@ -396,6 +398,12 @@ split_huge_page_to_list(struct page *page, struct list_head *list)
>>   {
>>   	return 0;
>>   }
>> +static inline int
>> +split_huge_page_to_order_to_list(struct page *page, struct list_head *list,
>> +		unsigned int new_order)
>> +{
>> +	return 0;
>> +}
>>   static inline int split_huge_page(struct page *page)
>>   {
>>   	return 0;
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 8b7d771ee962..88f50da40c9b 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2327,11 +2327,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
>>   static void unmap_page(struct page *page)
>>   {
>>   	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
>> -		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
>> +		TTU_RMAP_LOCKED;
>>   	bool unmap_success;
>>    	VM_BUG_ON_PAGE(!PageHead(page), page);
>>  +	if (thp_order(page) >= HPAGE_PMD_ORDER)
>> +		ttu_flags |= TTU_SPLIT_HUGE_PMD;
>> +
>>   	if (PageAnon(page))
>>   		ttu_flags |= TTU_SPLIT_FREEZE;
>>  @@ -2339,21 +2342,22 @@ static void unmap_page(struct page *page)
>>   	VM_BUG_ON_PAGE(!unmap_success, page);
>>   }
>>  -static void remap_page(struct page *page, unsigned int nr)
>> +static void remap_page(struct page *page, unsigned int nr, unsigned int new_nr)
>>   {
>>   	int i;
>
> Use unsigned int i?
> Maybe a blank line here and the {}'s around if/else aren't needed.
>
>> -	if (PageTransHuge(page)) {
>> +	if (thp_nr_pages(page) == nr) {
>>   		remove_migration_ptes(page, page, true);
>>   	} else {
>> -		for (i = 0; i < nr; i++)
>> +		for (i = 0; i < nr; i += new_nr)
>>   			remove_migration_ptes(page + i, page + i, true);
>>   	}
>>   }
>>    static void __split_huge_page_tail(struct page *head, int tail,
>> -		struct lruvec *lruvec, struct list_head *list)
>> +		struct lruvec *lruvec, struct list_head *list, unsigned int new_order)
>>   {
>>   	struct page *page_tail = head + tail;
>> +	unsigned long compound_head_flag = new_order ? (1L << PG_head) : 0;
>>    	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
>>  @@ -2377,6 +2381,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
>>   #ifdef CONFIG_64BIT
>>   			 (1L << PG_arch_2) |
>>   #endif
>> +			 compound_head_flag |
>>   			 (1L << PG_dirty)));
>>    	/* ->mapping in first tail page is compound_mapcount */
>> @@ -2395,10 +2400,15 @@ static void __split_huge_page_tail(struct page *head, int tail,
>>   	 * which needs correct compound_head().
>>   	 */
>>   	clear_compound_head(page_tail);
>> +	if (new_order) {
>> +		prep_compound_page(page_tail, new_order);
>> +		thp_prep(page_tail);
>> +	}
>>    	/* Finally unfreeze refcount. Additional reference from page cache. */
>> -	page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
>> -					  PageSwapCache(head)));
>> +	page_ref_unfreeze(page_tail, 1 + ((!PageAnon(head) ||
>> +					   PageSwapCache(head)) ?
>> +						thp_nr_pages(page_tail) : 0));
>>    	if (page_is_young(head))
>>   		set_page_young(page_tail);
>> @@ -2416,7 +2426,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
>>   }
>>    static void __split_huge_page(struct page *page, struct list_head *list,
>> -		pgoff_t end, unsigned long flags)
>> +		pgoff_t end, unsigned long flags, unsigned int new_order)
>>   {
>>   	struct page *head = compound_head(page);
>>   	pg_data_t *pgdat = page_pgdat(head);
>> @@ -2424,12 +2434,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>>   	struct address_space *swap_cache = NULL;
>>   	unsigned long offset = 0;
>>   	unsigned int nr = thp_nr_pages(head);
>> +	unsigned int new_nr = 1 << new_order;
>>   	int i;
>>    	lruvec = mem_cgroup_page_lruvec(head, pgdat);
>>    	/* complete memcg works before add pages to LRU */
>> -	mem_cgroup_split_huge_fixup(head, 1);
>> +	mem_cgroup_split_huge_fixup(head, new_nr);
>>    	if (PageAnon(head) && PageSwapCache(head)) {
>>   		swp_entry_t entry = { .val = page_private(head) };
>> @@ -2439,14 +2450,14 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>>   		xa_lock(&swap_cache->i_pages);
>>   	}
>>  -	for (i = nr - 1; i >= 1; i--) {
>> -		__split_huge_page_tail(head, i, lruvec, list);
>> +	for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
>> +		__split_huge_page_tail(head, i, lruvec, list, new_order);
>>   		/* Some pages can be beyond i_size: drop them from page cache */
>>   		if (head[i].index >= end) {
>>   			ClearPageDirty(head + i);
>>   			__delete_from_page_cache(head + i, NULL);
>>   			if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
>> -				shmem_uncharge(head->mapping->host, 1);
>> +				shmem_uncharge(head->mapping->host, new_nr);
>>   			put_page(head + i);
>>   		} else if (!PageAnon(page)) {
>>   			__xa_store(&head->mapping->i_pages, head[i].index,
>> @@ -2457,28 +2468,31 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>>   		}
>>   	}
>>  -	ClearPageCompound(head);
>> +	if (!new_order)
>> +		ClearPageCompound(head);
>> +	else
>> +		set_compound_order(head, new_order);
>>  -	split_page_owner(head, nr, 1);
>> +	split_page_owner(head, nr, new_nr);
>
> This needs to be "new_order" instead of "new_nr".
>
>>   	/* See comment in __split_huge_page_tail() */
>>   	if (PageAnon(head)) {
>>   		/* Additional pin to swap cache */
>>   		if (PageSwapCache(head)) {
>> -			page_ref_add(head, 2);
>> +			page_ref_add(head, 1 + new_nr);
>>   			xa_unlock(&swap_cache->i_pages);
>>   		} else {
>>   			page_ref_inc(head);
>>   		}
>>   	} else {
>>   		/* Additional pin to page cache */
>> -		page_ref_add(head, 2);
>> +		page_ref_add(head, 1 + new_nr);
>>   		xa_unlock(&head->mapping->i_pages);
>>   	}
>>    	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
>>  -	remap_page(head, nr);
>> +	remap_page(head, nr, new_nr);
>>    	if (PageSwapCache(head)) {
>>   		swp_entry_t entry = { .val = page_private(head) };
>> @@ -2486,7 +2500,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>>   		split_swap_cluster(entry);
>>   	}
>>  -	for (i = 0; i < nr; i++) {
>> +	for (i = 0; i < nr; i += new_nr) {
>>   		struct page *subpage = head + i;
>>   		if (subpage == page)
>>   			continue;
>> @@ -2620,21 +2634,39 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
>>    * us.
>>    */
>>   int split_huge_page_to_list(struct page *page, struct list_head *list)
>> +{
>> +	return split_huge_page_to_list_to_order(page, list, 0);
>> +}
>> +
>> +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>> +				     unsigned int new_order)
>>   {
>>   	struct page *head = compound_head(page);
>>   	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
>>   	struct deferred_split *ds_queue = get_deferred_split_queue(head);
>> -	XA_STATE(xas, &head->mapping->i_pages, head->index);
>> +	/* reset xarray order to new order after split */
>> +	XA_STATE_ORDER(xas, &head->mapping->i_pages, head->index, new_order);
>>   	struct anon_vma *anon_vma = NULL;
>>   	struct address_space *mapping = NULL;
>>   	int count, mapcount, extra_pins, ret;
>>   	unsigned long flags;
>>   	pgoff_t end;
>>  +	VM_BUG_ON(thp_order(head) <= new_order);
>>   	VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
>>   	VM_BUG_ON_PAGE(!PageLocked(head), head);
>>   	VM_BUG_ON_PAGE(!PageCompound(head), head);
>>  +	if (new_order == 1) {
>> +		WARN_ONCE(1, "Cannot split THP to order-1 (no order-1 THPs)");
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (PageAnon(head) && new_order) {
>> +		WARN_ONCE(1, "Split anonymous THP to non-zero order not support");
>> +		return -EINVAL;
>> +	}
>> +
>>   	if (PageWriteback(head))
>>   		return -EBUSY;
>>  @@ -2720,18 +2752,18 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>>   	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
>>   		if (!list_empty(page_deferred_list(head))) {
>>   			ds_queue->split_queue_len--;
>> -			list_del(page_deferred_list(head));
>> +			list_del_init(page_deferred_list(head));
>>   		}
>>   		spin_unlock(&ds_queue->split_queue_lock);
>>   		if (mapping) {
>>   			if (PageSwapBacked(head))
>>   				__dec_lruvec_page_state(head, NR_SHMEM_THPS);
>> -			else
>> +			else if (!new_order)
>>   				__mod_lruvec_page_state(head, NR_FILE_THPS,
>>   						-thp_nr_pages(head));
>>   		}
>>  -		__split_huge_page(page, list, end, flags);
>> +		__split_huge_page(page, list, end, flags, new_order);
>>   		ret = 0;
>>   	} else {
>>   		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
>> @@ -2746,7 +2778,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>>   fail:		if (mapping)
>>   			xas_unlock(&xas);
>>   		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
>> -		remap_page(head, thp_nr_pages(head));
>> +		remap_page(head, thp_nr_pages(head), 0);
>
> Shouldn't this be "1" instead of zero?
> remap_page() takes new_nr.
>
>>   		ret = -EBUSY;
>>   	}
>>  diff --git a/mm/swap.c b/mm/swap.c
>> index 14c3bac607a6..6c33e6165597 100644
>> --- a/mm/swap.c
>> +++ b/mm/swap.c
>> @@ -983,7 +983,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
>>   		       struct lruvec *lruvec, struct list_head *list)
>>   {
>>   	VM_BUG_ON_PAGE(!PageHead(page), page);
>> -	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
>>   	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
>>   	lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
>>

Thanks for catching all these. Will fix them in the next version.


—
Best Regards,
Yan Zi

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 854 bytes --]

next prev parent reply	other threads:[~2020-11-12 22:21 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-11 20:40 [RFC PATCH 0/6] Split huge pages " Zi Yan
2020-11-11 20:40 ` [RFC PATCH 1/6] mm: huge_memory: add new debugfs interface to trigger split huge page on any page range Zi Yan
2020-11-12 22:22   ` Ralph Campbell
2020-11-12 22:38     ` Zi Yan
2020-11-16 16:06   ` Kirill A. Shutemov
2020-11-16 17:26     ` Zi Yan
2020-11-11 20:40 ` [RFC PATCH 2/6] mm: memcg: make memcg huge page split support any order split Zi Yan
2020-11-12 17:58   ` Ralph Campbell
2020-11-12 18:00     ` Zi Yan
2020-11-14  0:23   ` Roman Gushchin
2020-11-14  0:56     ` Zi Yan
2020-11-11 20:40 ` [RFC PATCH 3/6] mm: page_owner: add support for splitting to any order in split page_owner Zi Yan
2020-11-12 17:57   ` Ralph Campbell
2020-11-12 17:59     ` Zi Yan
2020-11-14  0:15   ` Roman Gushchin
2020-11-14  1:08     ` Zi Yan
2020-11-14  1:38       ` Roman Gushchin
2020-11-17 21:05         ` Matthew Wilcox
2020-11-17 21:12           ` Zi Yan
2020-11-17 21:22             ` Matthew Wilcox
2020-11-17 21:25               ` Zi Yan
2020-11-17 21:35               ` Roman Gushchin
2020-11-17 21:43                 ` Matthew Wilcox
2020-11-16 16:25   ` Kirill A. Shutemov
2020-11-16 17:27     ` Zi Yan
2020-11-17 21:10   ` Matthew Wilcox
2020-11-17 21:13     ` Zi Yan
2020-11-11 20:40 ` [RFC PATCH 4/6] mm: thp: add support for split huge page to any lower order pages Zi Yan
2020-11-12 22:01   ` Ralph Campbell
2020-11-12 22:20     ` Zi Yan [this message]
2020-11-14  0:52   ` Roman Gushchin
2020-11-14  1:00     ` Zi Yan
2020-11-11 20:40 ` [RFC PATCH 5/6] mm: truncate: split thp to a non-zero order if possible Zi Yan
2020-11-12 22:08   ` Ralph Campbell
2020-11-12 22:37     ` Zi Yan
2020-11-11 20:40 ` [RFC PATCH 6/6] mm: huge_memory: enable debugfs to split huge pages to any order Zi Yan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=18B08769-FE06-4516-8426-86EB8A4AE4E5@nvidia.com \
    --to=ziy@nvidia.com \
    --cc=akpm@linux-foundation.org \
    --cc=dnellans@nvidia.com \
    --cc=guro@fb.com \
    --cc=jhubbard@nvidia.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=rcampbell@nvidia.com \
    --cc=shy828301@gmail.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox