linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Miaohe Lin <linmiaohe@huawei.com>
To: Zi Yan <ziy@nvidia.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>,
	Shuah Khan <shuah@kernel.org>, Yang Shi <shy828301@gmail.com>,
	Hugh Dickins <hughd@google.com>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>,
	<linux-kernel@vger.kernel.org>, <cgroups@vger.kernel.org>,
	<linux-kselftest@vger.kernel.org>,
	Matthew Wilcox <willy@infradead.org>, <linux-mm@kvack.org>,
	Yu Zhao <yuzhao@google.com>
Subject: Re: [RFC PATCH 3/5] mm: thp: split huge page to any lower order pages.
Date: Thu, 24 Mar 2022 10:02:19 +0800	[thread overview]
Message-ID: <6587a8af-8d6d-c947-2cee-11f75ceefef6@huawei.com> (raw)
In-Reply-To: <87E48455-3FDD-47FC-A953-CBAB52FD3889@nvidia.com>

On 2022/3/24 6:10, Zi Yan wrote:
> On 22 Mar 2022, at 22:31, Miaohe Lin wrote:
> 
>> On 2022/3/22 22:30, Zi Yan wrote:
>>> On 21 Mar 2022, at 23:21, Miaohe Lin wrote:
>>>
>>>> On 2022/3/21 22:21, Zi Yan wrote:
>>>>> From: Zi Yan <ziy@nvidia.com>
>>>>>
>>>>> To split a THP to any lower order pages, we need to reform THPs on
>>>>> subpages at given order and add page refcount based on the new page
>>>>> order. Also we need to reinitialize page_deferred_list after removing
>>>>> the page from the split_queue, otherwise a subsequent split will see
>>>>> list corruption when checking the page_deferred_list again.
>>>>>
>>>>> It has many uses, like minimizing the number of pages after
>>>>> truncating a pagecache THP. For anonymous THPs, we can only split them
>>>>> to order-0 like before until we add support for any size anonymous THPs.
>>>>>
>>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>>> ---
>>>>>  include/linux/huge_mm.h |   8 +++
>>>>>  mm/huge_memory.c        | 111 ++++++++++++++++++++++++++++++----------
>>>>>  2 files changed, 91 insertions(+), 28 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>> index 2999190adc22..c7153cd7e9e4 100644
>>>>> --- a/include/linux/huge_mm.h
>>>>> +++ b/include/linux/huge_mm.h
>>>>> @@ -186,6 +186,8 @@ void free_transhuge_page(struct page *page);
>>>>>
>>>>>  bool can_split_folio(struct folio *folio, int *pextra_pins);
>>>>>  int split_huge_page_to_list(struct page *page, struct list_head *list);
>>>>> +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>> +		unsigned int new_order);
>>>>>  static inline int split_huge_page(struct page *page)
>>>>>  {
>>>>>  	return split_huge_page_to_list(page, NULL);
>>>>> @@ -355,6 +357,12 @@ split_huge_page_to_list(struct page *page, struct list_head *list)
>>>>>  {
>>>>>  	return 0;
>>>>>  }
>>>>> +static inline int
>>>>> +split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>> +		unsigned int new_order)
>>>>> +{
>>>>> +	return 0;
>>>>> +}
>>>>>  static inline int split_huge_page(struct page *page)
>>>>>  {
>>>>>  	return 0;
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index fcfa46af6c4c..3617aa3ad0b1 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -2236,11 +2236,13 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
>>>>>  static void unmap_page(struct page *page)
>>>>>  {
>>>>>  	struct folio *folio = page_folio(page);
>>>>> -	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
>>>>> -		TTU_SYNC;
>>>>> +	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC;
>>>>>
>>>>>  	VM_BUG_ON_PAGE(!PageHead(page), page);
>>>>>
>>>>> +	if (folio_order(folio) >= HPAGE_PMD_ORDER)
>>>>> +		ttu_flags |= TTU_SPLIT_HUGE_PMD;
>>>>> +
>>>>>  	/*
>>>>>  	 * Anon pages need migration entries to preserve them, but file
>>>>>  	 * pages can simply be left unmapped, then faulted back on demand.
>>>>> @@ -2254,9 +2256,9 @@ static void unmap_page(struct page *page)
>>>>>  	VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
>>>>>  }
>>>>>
>>>>> -static void remap_page(struct folio *folio, unsigned long nr)
>>>>> +static void remap_page(struct folio *folio, unsigned short nr)
>>>>>  {
>>>>> -	int i = 0;
>>>>> +	unsigned int i;
>>>>>
>>>>>  	/* If unmap_page() uses try_to_migrate() on file, remove this check */
>>>>>  	if (!folio_test_anon(folio))
>>>>> @@ -2274,7 +2276,6 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
>>>>>  		struct lruvec *lruvec, struct list_head *list)
>>>>>  {
>>>>>  	VM_BUG_ON_PAGE(!PageHead(head), head);
>>>>> -	VM_BUG_ON_PAGE(PageCompound(tail), head);
>>>>>  	VM_BUG_ON_PAGE(PageLRU(tail), head);
>>>>>  	lockdep_assert_held(&lruvec->lru_lock);
>>>>>
>>>>> @@ -2295,9 +2296,10 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
>>>>>  }
>>>>>
>>>>>  static void __split_huge_page_tail(struct page *head, int tail,
>>>>> -		struct lruvec *lruvec, struct list_head *list)
>>>>> +		struct lruvec *lruvec, struct list_head *list, unsigned int new_order)
>>>>>  {
>>>>>  	struct page *page_tail = head + tail;
>>>>> +	unsigned long compound_head_flag = new_order ? (1L << PG_head) : 0;
>>>>>
>>>>>  	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
>>>>>
>>>>> @@ -2321,6 +2323,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
>>>>>  #ifdef CONFIG_64BIT
>>>>>  			 (1L << PG_arch_2) |
>>>>>  #endif
>>>>> +			 compound_head_flag |
>>>>>  			 (1L << PG_dirty)));
>>>>>
>>>>>  	/* ->mapping in first tail page is compound_mapcount */
>>>>> @@ -2329,7 +2332,10 @@ static void __split_huge_page_tail(struct page *head, int tail,
>>>>>  	page_tail->mapping = head->mapping;
>>>>>  	page_tail->index = head->index + tail;
>>>>>
>>>>> -	/* Page flags must be visible before we make the page non-compound. */
>>>>> +	/*
>>>>> +	 * Page flags must be visible before we make the page non-compound or
>>>>> +	 * a compound page in new_order.
>>>>> +	 */
>>>>>  	smp_wmb();
>>>>>
>>>>>  	/*
>>>>> @@ -2339,10 +2345,15 @@ static void __split_huge_page_tail(struct page *head, int tail,
>>>>>  	 * which needs correct compound_head().
>>>>>  	 */
>>>>>  	clear_compound_head(page_tail);
>>>>> +	if (new_order) {
>>>>> +		prep_compound_page(page_tail, new_order);
>>>>> +		prep_transhuge_page(page_tail);
>>>>> +	}
>>>>
>>>> Many thanks for your series. It looks really good. One question:
>>>> IIUC, It seems there has assumption that LRU compound_pages should
>>>> be PageTransHuge. So PageTransHuge just checks PageHead:
>>>>
>>>> static inline int PageTransHuge(struct page *page)
>>>> {
>>>> 	VM_BUG_ON_PAGE(PageTail(page), page);
>>>> 	return PageHead(page);
>>>> }
>>>>
>>>> So LRU pages with any order( > 0) will might be wrongly treated as THP which
>>>> has order = HPAGE_PMD_ORDER. We should ensure thp_nr_pages is used instead of
>>>> hard coded HPAGE_PMD_ORDER.
>>>>
>>>> Looks at the below code snippet:
>>>> mm/mempolicy.c:
>>>> static struct page *new_page(struct page *page, unsigned long start)
>>>> {
>>>> ...
>>>> 	} else if (PageTransHuge(page)) {
>>>> 		struct page *thp;
>>>>
>>>> 		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
>>>> 					 HPAGE_PMD_ORDER);
>>>> 					 ^^^^^^^^^^^^^^^^
>>>> 		if (!thp)
>>>> 			return NULL;
>>>> 		prep_transhuge_page(thp);
>>>> 		return thp;
>>>> 	}
>>>> ...
>>>> }
>>>>
>>>> HPAGE_PMD_ORDER is used instead of thp_nr_pages. So the lower order pages might be
>>>> used as if its order is HPAGE_PMD_ORDER. All of such usage might need to be fixed.
>>>> Or am I miss something ?
>>>>
>>>> Thanks again for your work. :)
>>>
>>> THP will still only have HPAGE_PMD_ORDER and will not be split into any order
>>> other than 0. This series only allows to split huge page cache folio (added by Matthew)
>>> into any lower order. I have an explicit VM_BUG_ON() to ensure new_order
>>> is only 0 when non page cache page is the input. Since there is still non-trivial
>>> amount of work to add any order THP support in the kernel. IIRC, Yu Zhao (cc’d) was
>>> planning to work on that.
>>>
>>
>> Many thanks for clarifying. I'm sorry but I haven't followed Matthew's patches. I am
>> wondering could huge page cache folio be treated as THP ? If so, how to ensure the
>> correctness of huge page cache ?
> 
> You are right. All these HPAGE_PMD_ORDRE locations should be replaced by thp_nr_pages().
> I will look into it.
> 

Many thanks for doing this. :)

> Thanks a lot.
> 
> --
> Best Regards,
> Yan, Zi
> 



  reply	other threads:[~2022-03-24  2:02 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-21 14:21 [RFC PATCH 0/5] Split a " Zi Yan
2022-03-21 14:21 ` [RFC PATCH 1/5] mm: memcg: make memcg huge page split support any order split Zi Yan
2022-03-21 18:57   ` Roman Gushchin
2022-03-21 19:07     ` Zi Yan
2022-03-21 19:54       ` Matthew Wilcox
2022-03-21 20:26         ` Zi Yan
2022-03-21 14:21 ` [RFC PATCH 2/5] mm: page_owner: add support for splitting to any order in split page_owner Zi Yan
2022-03-21 19:02   ` Roman Gushchin
2022-03-21 19:08     ` Zi Yan
2022-03-21 14:21 ` [RFC PATCH 3/5] mm: thp: split huge page to any lower order pages Zi Yan
2022-03-21 22:18   ` Roman Gushchin
2022-03-22 14:21     ` Zi Yan
2022-03-22  3:21   ` Miaohe Lin
2022-03-22 14:30     ` Zi Yan
2022-03-23  2:31       ` Miaohe Lin
2022-03-23 22:10         ` Zi Yan
2022-03-24  2:02           ` Miaohe Lin [this message]
2022-03-22 20:57   ` Yang Shi
2022-03-21 14:21 ` [RFC PATCH 4/5] mm: truncate: split huge page cache page to a non-zero order if possible Zi Yan
2022-03-21 22:32   ` Roman Gushchin
2022-03-22 14:19     ` Zi Yan
2022-03-23  6:40   ` [mm] 2757cee2d6: UBSAN:shift-out-of-bounds_in_include/linux/log2.h kernel test robot
2022-03-21 14:21 ` [RFC PATCH 5/5] mm: huge_memory: enable debugfs to split huge pages to any order Zi Yan
2022-03-21 22:23   ` Roman Gushchin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=6587a8af-8d6d-c947-2cee-11f75ceefef6@huawei.com \
    --to=linmiaohe@huawei.com \
    --cc=cgroups@vger.kernel.org \
    --cc=hughd@google.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=roman.gushchin@linux.dev \
    --cc=shuah@kernel.org \
    --cc=shy828301@gmail.com \
    --cc=willy@infradead.org \
    --cc=yuzhao@google.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox