linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Kefeng Wang <wangkefeng.wang@huawei.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Huang Ying <ying.huang@intel.com>,
	Mel Gorman <mgorman@techsingularity.net>,
	Ryan Roberts <ryan.roberts@arm.com>,
	David Hildenbrand <david@redhat.com>,
	Barry Song <v-songbaohua@oppo.com>,
	Vlastimil Babka <vbabka@suse.cz>, Zi Yan <ziy@nvidia.com>,
	"Matthew Wilcox (Oracle)" <willy@infradead.org>,
	Jonathan Corbet <corbet@lwn.net>, Yang Shi <shy828301@gmail.com>,
	Yu Zhao <yuzhao@google.com>, <linux-mm@kvack.org>,
	Kefeng Wang <wangkefeng.wang@huawei.com>
Subject: [PATCH rfc 2/3] mm: add control to allow specified high-order pages stored on PCP list
Date: Mon, 15 Apr 2024 16:12:19 +0800	[thread overview]
Message-ID: <20240415081220.3246839-3-wangkefeng.wang@huawei.com> (raw)
In-Reply-To: <20240415081220.3246839-1-wangkefeng.wang@huawei.com>

Storing high-order pages on the PCP lists is not always a win and can even
hurt some workloads, so it is disabled by default for high orders other
than PMD_ORDER. Since there are already per-supported-THP-size interfaces
to configure mTHP behaviours, add a new control, pcp_enabled, under those
interfaces to let the user enable or disable storing pages of the specified
high order on the PCP lists. It cannot change the existing behaviour for
order == PMD_ORDER and order <= PAGE_ALLOC_COSTLY_ORDER: those are always
enabled and cannot be disabled. When pcp_enabled is turned off for any
other high order, the pcplists are drained.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 11 +++++
 include/linux/gfp.h                        |  1 +
 include/linux/huge_mm.h                    |  1 +
 mm/huge_memory.c                           | 47 ++++++++++++++++++++++
 mm/page_alloc.c                            | 16 ++++++++
 5 files changed, 76 insertions(+)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 04eb45a2f940..3cb91336f81a 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -189,6 +189,17 @@ madvise
 never
 	should be self-explanatory.
 
+
+There's also a sysfs knob to control whether hugepages are stored on PCP
+lists for high orders (greater than PAGE_ALLOC_COSTLY_ORDER), which can
+reduce zone lock contention when high-order pages are allocated frequently.
+Please note that the PCP behavior of low-order and PMD-order pages cannot be
+changed. It is possible to enable storing other high-order pages on PCP
+lists by writing 1, or to disable it again by writing 0::
+
+        echo 0 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+        echo 1 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+
 By default kernel tries to use huge, PMD-mappable zero page on read
 page fault to anonymous mapping. It's possible to disable huge zero
 page by writing 0 or enable it back by writing 1::
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 450c2cbcf04b..2ae1157abd6e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -365,6 +365,7 @@ extern void page_frag_free(void *addr);
 
 void page_alloc_init_cpuhp(void);
 int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+void drain_all_zone_pages(void);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b67294d5814f..86306becfd52 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -108,6 +108,7 @@ extern unsigned long transparent_hugepage_flags;
 extern unsigned long huge_anon_orders_always;
 extern unsigned long huge_anon_orders_madvise;
 extern unsigned long huge_anon_orders_inherit;
+extern unsigned long huge_pcp_allow_orders;
 
 static inline bool hugepage_global_enabled(void)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9a1b57ef9c60..9b8a8aa36526 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -512,8 +512,49 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
 static struct kobj_attribute thpsize_enabled_attr =
 	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
 
+unsigned long huge_pcp_allow_orders __read_mostly;
+static ssize_t thpsize_pcp_enabled_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+
+	return sysfs_emit(buf, "%d\n",
+			  !!test_bit(order, &huge_pcp_allow_orders));
+}
+
+static ssize_t thpsize_pcp_enabled_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	int order = to_thpsize(kobj)->order;
+	unsigned long value;
+	int ret;
+
+	if (order <= PAGE_ALLOC_COSTLY_ORDER || order == PMD_ORDER)
+		return -EINVAL;
+
+	ret = kstrtoul(buf, 10, &value);
+	if (ret < 0)
+		return ret;
+	if (value > 1)
+		return -EINVAL;
+
+	if (value) {
+		set_bit(order, &huge_pcp_allow_orders);
+	} else {
+		if (test_and_clear_bit(order, &huge_pcp_allow_orders))
+			drain_all_zone_pages();
+	}
+
+	return count;
+}
+
+static struct kobj_attribute thpsize_pcp_enabled_attr = __ATTR(pcp_enabled,
+		0644, thpsize_pcp_enabled_show, thpsize_pcp_enabled_store);
+
 static struct attribute *thpsize_attrs[] = {
 	&thpsize_enabled_attr.attr,
+	&thpsize_pcp_enabled_attr.attr,
 	NULL,
 };
 
@@ -624,6 +665,8 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 	 */
 	huge_anon_orders_inherit = BIT(PMD_ORDER);
 
+	huge_pcp_allow_orders = BIT(PMD_ORDER);
+
 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 	if (unlikely(!*hugepage_kobj)) {
 		pr_err("failed to create transparent hugepage kobject\n");
@@ -658,6 +701,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 			err = PTR_ERR(thpsize);
 			goto remove_all;
 		}
+
+		if (order <= PAGE_ALLOC_COSTLY_ORDER)
+			huge_pcp_allow_orders |= BIT(order);
+
 		list_add(&thpsize->node, &thpsize_list);
 		order = next_order(&orders, order);
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2248afc7b73a..25fd3fe30cb0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -537,6 +537,8 @@ static inline bool pcp_allowed_order(unsigned int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order == PCP_MAX_ORDER)
 		return true;
+	if (BIT(order) & huge_pcp_allow_orders)
+		return true;
 #endif
 	return false;
 }
@@ -6705,6 +6707,20 @@ void zone_pcp_reset(struct zone *zone)
 	}
 }
 
+void drain_all_zone_pages(void)
+{
+	struct zone *zone;
+
+	mutex_lock(&pcp_batch_high_lock);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, 0, 0, 1);
+	__drain_all_pages(NULL, true);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+				zone->pageset_high_max, zone->pageset_batch);
+	mutex_unlock(&pcp_batch_high_lock);
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be in a single zone, must not contain holes,
-- 
2.27.0



  parent reply	other threads:[~2024-04-15  8:12 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-15  8:12 [PATCH rfc 0/3] mm: allow more high-order pages stored on PCP lists Kefeng Wang
2024-04-15  8:12 ` [PATCH rfc 1/3] mm: prepare more high-order pages to be stored on the per-cpu lists Kefeng Wang
2024-04-15 11:41   ` Baolin Wang
2024-04-15 12:25     ` Kefeng Wang
2024-04-15  8:12 ` Kefeng Wang [this message]
2024-04-15  8:12 ` [PATCH rfc 3/3] mm: pcp: show per-order pages count Kefeng Wang
2024-04-15  8:18 ` [PATCH rfc 0/3] mm: allow more high-order pages stored on PCP lists Barry Song
2024-04-15  8:59   ` Kefeng Wang
2024-04-15 10:52     ` David Hildenbrand
2024-04-15 11:14       ` Barry Song
2024-04-15 12:17       ` Kefeng Wang
2024-04-16  0:21         ` Barry Song
2024-04-16  4:50           ` Kefeng Wang
2024-04-16  4:58             ` Kefeng Wang
2024-04-16  5:26               ` Barry Song
2024-04-16  7:03                 ` David Hildenbrand
2024-04-16  8:06                   ` Kefeng Wang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240415081220.3246839-3-wangkefeng.wang@huawei.com \
    --to=wangkefeng.wang@huawei.com \
    --cc=akpm@linux-foundation.org \
    --cc=corbet@lwn.net \
    --cc=david@redhat.com \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@techsingularity.net \
    --cc=ryan.roberts@arm.com \
    --cc=shy828301@gmail.com \
    --cc=v-songbaohua@oppo.com \
    --cc=vbabka@suse.cz \
    --cc=willy@infradead.org \
    --cc=ying.huang@intel.com \
    --cc=yuzhao@google.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox