linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
	Rik van Riel <riel@redhat.com>, Mel Gorman <mgorman@suse.de>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Minchan Kim <minchan@kernel.org>,
	Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>,
	Zhang Yanfei <zhangyanfei@cn.fujitsu.com>,
	"Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>,
	Tang Chen <tangchen@cn.fujitsu.com>,
	Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>,
	Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>,
	Wen Congyang <wency@cn.fujitsu.com>,
	Marek Szyprowski <m.szyprowski@samsung.com>,
	Michal Nazarewicz <mina86@mina86.com>,
	Laura Abbott <lauraa@codeaurora.org>,
	Heesub Shin <heesub.shin@samsung.com>,
	"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
	Ritesh Harjani <ritesh.list@gmail.com>,
	t.stanislaws@samsung.com, Gioh Kim <gioh.kim@lge.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	Joonsoo Kim <iamjoonsoo.kim@lge.com>
Subject: [PATCH v2 3/8] mm/page_alloc: fix pcp high, batch management
Date: Wed,  6 Aug 2014 16:18:32 +0900	[thread overview]
Message-ID: <1407309517-3270-7-git-send-email-iamjoonsoo.kim@lge.com> (raw)
In-Reply-To: <1407309517-3270-1-git-send-email-iamjoonsoo.kim@lge.com>

The per-cpu pages structure, aka pcp, has high and batch values to control
how many pages we cache. These values could be updated
asynchronously, and the updater should ensure that this doesn't cause any
problem. For this purpose, pageset_update() is implemented and does some
memory synchronization. But, it turns out to be wrong when I implemented a
new feature using this. There is no corresponding smp_rmb() on the read
side, so it can't guarantee anything. Without correct updating, the system
could hang in free_pcppages_bulk() due to a batch value larger than high.
To properly update these values, we need synchronization primitives on
the read side, but that hurts the allocator's fastpath.

There is another choice for synchronization, that is, sending an IPI. This
is somewhat expensive, but this is a really rare case so I guess it is
not a problem here. However, reducing the number of IPIs is very helpful
here. The current logic handles each CPU's pcp update one by one. To reduce
the IPIs sent, we need to re-organize the code to handle all CPUs' pcp
updates in one go. This patch implements these requirements.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
---
 mm/page_alloc.c |  139 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 80 insertions(+), 59 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e6fee4b..3e1e344 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3800,7 +3800,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
  * not check if the processor is online before following the pageset pointer.
  * Other parts of the kernel may not check if the zone is available.
  */
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static void setup_pageset(struct per_cpu_pageset __percpu *pcp);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
 static void setup_zone_pageset(struct zone *zone);
 
@@ -3846,9 +3846,9 @@ static int __build_all_zonelists(void *data)
 	 * needs the percpu allocator in order to allocate its pagesets
 	 * (a chicken-egg dilemma).
 	 */
-	for_each_possible_cpu(cpu) {
-		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+	setup_pageset(&boot_pageset);
 
+	for_each_possible_cpu(cpu) {
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 		/*
 		 * We now know the "local memory node" for each node--
@@ -4230,24 +4230,59 @@ static int zone_batchsize(struct zone *zone)
  * outside of boot time (or some other assurance that no concurrent updaters
  * exist).
  */
-static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
-		unsigned long batch)
+static void pageset_update(struct zone *zone, int high, int batch)
 {
-       /* start with a fail safe value for batch */
-	pcp->batch = 1;
-	smp_wmb();
+	int cpu;
+	struct per_cpu_pages *pcp;
+
+	/* start with a fail safe value for batch */
+	for_each_possible_cpu(cpu) {
+		pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp;
+		pcp->batch = 1;
+	}
+	kick_all_cpus_sync();
+
+	/* Update high, then batch, in order */
+	for_each_possible_cpu(cpu) {
+		pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp;
+		pcp->high = high;
+	}
+	kick_all_cpus_sync();
 
-       /* Update high, then batch, in order */
-	pcp->high = high;
-	smp_wmb();
+	for_each_possible_cpu(cpu) {
+		pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp;
+		pcp->batch = batch;
+	}
+}
+
+/*
+ * pageset_get_values_by_high() gets the high water mark for
+ * hot per_cpu_pagelist to the value high for the pageset p.
+ */
+static void pageset_get_values_by_high(int input_high,
+				int *output_high, int *output_batch)
+{
+	*output_batch = max(1, input_high / 4);
+	if ((input_high / 4) > (PAGE_SHIFT * 8))
+		*output_batch = PAGE_SHIFT * 8;
+}
 
-	pcp->batch = batch;
+/* a companion to pageset_get_values_by_high() */
+static void pageset_get_values_by_batch(int input_batch,
+				int *output_high, int *output_batch)
+{
+	*output_high = 6 * input_batch;
+	*output_batch = max(1, 1 * input_batch);
 }
 
-/* a companion to pageset_set_high() */
-static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
+static void pageset_get_values(struct zone *zone, int *high, int *batch)
 {
-	pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
+	if (percpu_pagelist_fraction) {
+		pageset_get_values_by_high(
+			(zone->managed_pages / percpu_pagelist_fraction),
+			high, batch);
+	} else
+		pageset_get_values_by_batch(zone_batchsize(zone), high, batch);
 }
 
 static void pageset_init(struct per_cpu_pageset *p)
@@ -4263,51 +4298,38 @@ static void pageset_init(struct per_cpu_pageset *p)
 		INIT_LIST_HEAD(&pcp->lists[migratetype]);
 }
 
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+/* Use this only in boot time, because it doesn't do any synchronization */
+static void setup_pageset(struct per_cpu_pageset __percpu *pcp)
 {
-	pageset_init(p);
-	pageset_set_batch(p, batch);
-}
-
-/*
- * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
- */
-static void pageset_set_high(struct per_cpu_pageset *p,
-				unsigned long high)
-{
-	unsigned long batch = max(1UL, high / 4);
-	if ((high / 4) > (PAGE_SHIFT * 8))
-		batch = PAGE_SHIFT * 8;
-
-	pageset_update(&p->pcp, high, batch);
-}
-
-static void pageset_set_high_and_batch(struct zone *zone,
-				       struct per_cpu_pageset *pcp)
-{
-	if (percpu_pagelist_fraction)
-		pageset_set_high(pcp,
-			(zone->managed_pages /
-				percpu_pagelist_fraction));
-	else
-		pageset_set_batch(pcp, zone_batchsize(zone));
-}
+	int cpu;
+	int high, batch;
+	struct per_cpu_pageset *p;
 
-static void __meminit zone_pageset_init(struct zone *zone, int cpu)
-{
-	struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+	pageset_get_values_by_batch(0, &high, &batch);
 
-	pageset_init(pcp);
-	pageset_set_high_and_batch(zone, pcp);
+	for_each_possible_cpu(cpu) {
+		p = per_cpu_ptr(pcp, cpu);
+		pageset_init(p);
+		p->pcp.high = high;
+		p->pcp.batch = batch;
+	}
 }
 
 static void __meminit setup_zone_pageset(struct zone *zone)
 {
 	int cpu;
+	int high, batch;
+	struct per_cpu_pageset *p;
+
+	pageset_get_values(zone, &high, &batch);
+
 	zone->pageset = alloc_percpu(struct per_cpu_pageset);
-	for_each_possible_cpu(cpu)
-		zone_pageset_init(zone, cpu);
+	for_each_possible_cpu(cpu) {
+		p = per_cpu_ptr(zone->pageset, cpu);
+		pageset_init(p);
+		p->pcp.high = high;
+		p->pcp.batch = batch;
+	}
 }
 
 /*
@@ -5928,11 +5950,10 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 		goto out;
 
 	for_each_populated_zone(zone) {
-		unsigned int cpu;
+		int high, batch;
 
-		for_each_possible_cpu(cpu)
-			pageset_set_high_and_batch(zone,
-					per_cpu_ptr(zone->pageset, cpu));
+		pageset_get_values(zone, &high, &batch);
+		pageset_update(zone, high, batch);
 	}
 out:
 	mutex_unlock(&pcp_batch_high_lock);
@@ -6455,11 +6476,11 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
  */
 void __meminit zone_pcp_update(struct zone *zone)
 {
-	unsigned cpu;
+	int high, batch;
+
 	mutex_lock(&pcp_batch_high_lock);
-	for_each_possible_cpu(cpu)
-		pageset_set_high_and_batch(zone,
-				per_cpu_ptr(zone->pageset, cpu));
+	pageset_get_values(zone, &high, &batch);
+	pageset_update(zone, high, batch);
 	mutex_unlock(&pcp_batch_high_lock);
 }
 #endif
-- 
1.7.9.5

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

  parent reply	other threads:[~2014-08-06  7:11 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-08-06  7:18 [PATCH v2 0/8] fix freepage count problems in memory isolation Joonsoo Kim
2014-08-06  7:18 ` [PATCH v2 1/8] mm/page_alloc: correct to clear guard attribute in DEBUG_PAGEALLOC Joonsoo Kim
2014-08-07  1:46   ` Zhang Yanfei
2014-08-06  7:18 ` [PATCH v2 1/8] mm/page_alloc: fix pcp high, batch management Joonsoo Kim
2014-08-12  1:24   ` Minchan Kim
2014-08-13  8:13     ` Joonsoo Kim
2014-08-06  7:18 ` [PATCH v2 2/8] mm/isolation: remove unstable check for isolated page Joonsoo Kim
2014-08-07 13:49   ` Vlastimil Babka
2014-08-08  6:22     ` Joonsoo Kim
2014-08-11  9:23   ` Aneesh Kumar K.V
2014-08-13  8:19     ` Joonsoo Kim
2014-08-06  7:18 ` [PATCH v2 2/8] mm/page_alloc: correct to clear guard attribute in DEBUG_PAGEALLOC Joonsoo Kim
2014-08-12  1:45   ` Minchan Kim
2014-08-13  8:20     ` Joonsoo Kim
2014-08-06  7:18 ` [PATCH v2 3/8] mm/isolation: remove unstable check for isolated page Joonsoo Kim
2014-08-06  7:18 ` Joonsoo Kim [this message]
2014-08-07  2:11   ` [PATCH v2 3/8] mm/page_alloc: fix pcp high, batch management Zhang Yanfei
2014-08-07  8:23     ` Joonsoo Kim
2014-08-06  7:18 ` [PATCH v2 4/8] mm/isolation: close the two race problems related to pageblock isolation Joonsoo Kim
2014-08-07 14:34   ` Vlastimil Babka
2014-08-08  6:30     ` Joonsoo Kim
2014-08-12  5:17   ` Minchan Kim
2014-08-12  9:45     ` Vlastimil Babka
2014-08-13  8:09       ` Joonsoo Kim
2014-08-13  8:29     ` Joonsoo Kim
2014-08-06  7:18 ` [PATCH v2 5/8] mm/isolation: change pageblock isolation logic to fix freepage counting bugs Joonsoo Kim
2014-08-06 15:12   ` Vlastimil Babka
2014-08-07  8:19     ` Joonsoo Kim
2014-08-07  8:53       ` Vlastimil Babka
2014-08-07 12:26         ` Joonsoo Kim
2014-08-07 13:04           ` Vlastimil Babka
2014-08-07 13:35             ` Joonsoo Kim
2014-08-07 15:15   ` Vlastimil Babka
2014-08-08  6:45     ` Joonsoo Kim
2014-08-12  6:43   ` Minchan Kim
2014-08-12 10:58     ` Vlastimil Babka
2014-08-06  7:18 ` [PATCH v2 6/8] mm/isolation: factor out pre/post logic on set/unset_migratetype_isolate() Joonsoo Kim
2014-08-06  7:18 ` [PATCH v2 7/8] mm/isolation: fix freepage counting bug on start/undo_isolat_page_range() Joonsoo Kim
2014-08-06  7:18 ` [PATCH v2 8/8] mm/isolation: remove useless race handling related to pageblock isolation Joonsoo Kim
2014-08-06  7:25 ` [PATCH v2 0/8] fix freepage count problems in memory isolation Joonsoo Kim
2014-08-07  0:49 ` Zhang Yanfei
2014-08-07  8:20   ` Joonsoo Kim

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1407309517-3270-7-git-send-email-iamjoonsoo.kim@lge.com \
    --to=iamjoonsoo.kim@lge.com \
    --cc=akpm@linux-foundation.org \
    --cc=aneesh.kumar@linux.vnet.ibm.com \
    --cc=b.zolnierkie@samsung.com \
    --cc=gioh.kim@lge.com \
    --cc=hannes@cmpxchg.org \
    --cc=heesub.shin@samsung.com \
    --cc=isimatu.yasuaki@jp.fujitsu.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=lauraa@codeaurora.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=m.szyprowski@samsung.com \
    --cc=mgorman@suse.de \
    --cc=mina86@mina86.com \
    --cc=minchan@kernel.org \
    --cc=n-horiguchi@ah.jp.nec.com \
    --cc=riel@redhat.com \
    --cc=ritesh.list@gmail.com \
    --cc=srivatsa.bhat@linux.vnet.ibm.com \
    --cc=t.stanislaws@samsung.com \
    --cc=tangchen@cn.fujitsu.com \
    --cc=wency@cn.fujitsu.com \
    --cc=zhangyanfei@cn.fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox