* [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified @ 2017-11-15 8:55 Mel Gorman 2017-11-15 11:55 ` Michal Hocko 2017-11-15 19:49 ` Andrew Morton 0 siblings, 2 replies; 21+ messages in thread From: Mel Gorman @ 2017-11-15 8:55 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-mm, linux-kernel, mgorman, yasu.isimatu, koki.sanagi Yasuaki Ishimatsu reported a premature OOM when trace_buf_size=100m was specified on a machine with many CPUs. The kernel tried to allocate 38.4GB but only 16GB was available due to deferred memory initialisation. The allocation context is within smp_init() so there are no opportunities to do the deferred meminit earlier. Furthermore, the partial initialisation of memory occurs before the size of the trace buffers is set so there is no opportunity to adjust the amount of memory that is pre-initialised. We could potentially catch when memory is low during system boot and adjust the amount that is initialised serially but it's a little clumsy as it would require a check in the failure path of the page allocator. Given that deferred meminit is basically a minor optimisation that only benefits very large machines and trace_buf_size is somewhat specialised, it follows that the most straight-forward option is to go back to serialised meminit if trace_buf_size is specified. 
Reported-and-tested-by: YASUAKI ISHIMATSU <yasu.isimatu@gmail.com> Signed-off-by: Mel Gorman <mgorman@techsingularity.net> --- include/linux/gfp.h | 13 +++++++++++++ init/main.c | 2 ++ kernel/trace/trace.c | 7 +++++++ mm/page_alloc.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 710143741eb5..6ef0ab13f774 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -558,6 +558,19 @@ void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +extern void __init disable_deferred_meminit(void); +extern void page_alloc_init_late_prepare(void); +#else +static inline void disable_deferred_meminit(void) +{ +} + +static inline void page_alloc_init_late_prepare(void) +{ +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. Once interrupts are diff --git a/init/main.c b/init/main.c index 0ee9c6866ada..0248b8b5bc3a 100644 --- a/init/main.c +++ b/init/main.c @@ -1058,6 +1058,8 @@ static noinline void __init kernel_init_freeable(void) do_pre_smp_initcalls(); lockup_detector_init(); + page_alloc_init_late_prepare(); + smp_init(); sched_init_smp(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 752e5daf0896..cfa7175ff093 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1115,6 +1115,13 @@ static int __init set_buf_size(char *str) if (buf_size == 0) return 0; trace_buf_size = buf_size; + + /* + * The size of buffers are unpredictable so initialise all memory + * before the allocation attempt occurs. 
+ */ + disable_deferred_meminit(); + return 1; } __setup("trace_buf_size=", set_buf_size); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 77e4d3c5c57b..4dd0e153b0f2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -290,6 +290,19 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +bool __initdata deferred_meminit_disabled; + +/* + * Allow deferred meminit to be disabled by subsystems that require large + * allocations before the memory allocator is fully initialised. It should + * only be used in cases where the size of the allocation may not fit into + * the 2G per node that is allocated serially. + */ +void __init disable_deferred_meminit(void) +{ + deferred_meminit_disabled = true; +} + static inline void reset_deferred_meminit(pg_data_t *pgdat) { unsigned long max_initialise; @@ -1567,6 +1580,23 @@ static int __init deferred_init_memmap(void *data) } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +/* + * Serialised init of remaining memory if large buffers of unknown size + * are required that might fail before parallelised meminit can start + */ +void __init page_alloc_init_late_prepare(void) +{ + int nid; + + if (!deferred_meminit_disabled) + return; + + for_each_node_state(nid, N_MEMORY) + deferred_init_memmap(NODE_DATA(nid)); +} +#endif + void __init page_alloc_init_late(void) { struct zone *zone; -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-15 8:55 [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified Mel Gorman @ 2017-11-15 11:55 ` Michal Hocko 2017-11-15 14:13 ` Mel Gorman 2017-11-15 19:49 ` Andrew Morton 1 sibling, 1 reply; 21+ messages in thread From: Michal Hocko @ 2017-11-15 11:55 UTC (permalink / raw) To: Mel Gorman Cc: Andrew Morton, linux-mm, linux-kernel, yasu.isimatu, koki.sanagi On Wed 15-11-17 08:55:56, Mel Gorman wrote: > Yasuaki Ishimatsu reported a premature OOM when trace_buf_size=100m was > specified on a machine with many CPUs. The kernel tried to allocate 38.4GB > but only 16GB was available due to deferred memory initialisation. > > The allocation context is within smp_init() so there are no opportunities > to do the deferred meminit earlier. Furthermore, the partial initialisation > of memory occurs before the size of the trace buffers is set so there is > no opportunity to adjust the amount of memory that is pre-initialised. We > could potentially catch when memory is low during system boot and adjust the > amount that is initialised serially but it's a little clumsy as it would > require a check in the failure path of the page allocator. Given that > deferred meminit is basically a minor optimisation that only benefits very > large machines and trace_buf_size is somewhat specialised, it follows that > the most straight-forward option is to go back to serialised meminit if > trace_buf_size is specified. Can we instead do a smaller trace buffer in the early stage and then allocate the rest after the whole memory is initialized? The early memory init code is quite complex already without making it even more so for something that looks like a borderline useful usecase. Seriously, who is going to need a 100M trace buffer _per cpu_ during early boot?
> Reported-and-tested-by: YASUAKI ISHIMATSU <yasu.isimatu@gmail.com> > Signed-off-by: Mel Gorman <mgorman@techsingularity.net> -- Michal Hocko SUSE Labs
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-15 11:55 ` Michal Hocko @ 2017-11-15 14:13 ` Mel Gorman 2017-11-15 14:28 ` Michal Hocko 0 siblings, 1 reply; 21+ messages in thread From: Mel Gorman @ 2017-11-15 14:13 UTC (permalink / raw) To: Michal Hocko Cc: Andrew Morton, linux-mm, linux-kernel, yasu.isimatu, koki.sanagi On Wed, Nov 15, 2017 at 12:55:59PM +0100, Michal Hocko wrote: > On Wed 15-11-17 08:55:56, Mel Gorman wrote: > > Yasuaki Ishimatsu reported a premature OOM when trace_buf_size=100m was > > specified on a machine with many CPUs. The kernel tried to allocate 38.4GB > > but only 16GB was available due to deferred memory initialisation. > > > > The allocation context is within smp_init() so there are no opportunities > > to do the deferred meminit earlier. Furthermore, the partial initialisation > > of memory occurs before the size of the trace buffers is set so there is > > no opportunity to adjust the amount of memory that is pre-initialised. We > > could potentially catch when memory is low during system boot and adjust the > > amount that is initialised serially but it's a little clumsy as it would > > require a check in the failure path of the page allocator. Given that > > deferred meminit is basically a minor optimisation that only benefits very > > large machines and trace_buf_size is somewhat specialised, it follows that > > the most straight-forward option is to go back to serialised meminit if > > trace_buf_size is specified. > > Can we instead do a smaller trace buffer in the early stage and then > allocate the rest after the whole memory is initialized? Potentially yes, but it's also unnecessarily complex to setup buffers, finish init, tear them down, set them back up etc. It's not much of an improvement to allocate a small buffer and then grow them later. > The early > memory init code is quite complex to make it even more so for something > that looks like a borderline useful usecase. 
The additional complexity to memory init is marginal in comparison to playing games with how the tracing ring buffers are allocated. > Seriously, who is going > need 100M trace buffer _per cpu_ during early boot? > I doubt anyone will. Even the original reporter appeared to pick that particular value just to trigger the OOM. -- Mel Gorman SUSE Labs
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-15 14:13 ` Mel Gorman @ 2017-11-15 14:28 ` Michal Hocko 2017-11-15 14:43 ` Mel Gorman 0 siblings, 1 reply; 21+ messages in thread From: Michal Hocko @ 2017-11-15 14:28 UTC (permalink / raw) To: Mel Gorman Cc: Andrew Morton, linux-mm, linux-kernel, yasu.isimatu, koki.sanagi On Wed 15-11-17 14:13:29, Mel Gorman wrote: [...] > I doubt anyone well. Even the original reporter appeared to pick that > particular value just to trigger the OOM. Then why do we care at all? The trace buffer size can be configured from the userspace if it is not sufficiently large IIRC. -- Michal Hocko SUSE Labs
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-15 14:28 ` Michal Hocko @ 2017-11-15 14:43 ` Mel Gorman 2017-11-15 14:57 ` Michal Hocko 0 siblings, 1 reply; 21+ messages in thread From: Mel Gorman @ 2017-11-15 14:43 UTC (permalink / raw) To: Michal Hocko Cc: Andrew Morton, linux-mm, linux-kernel, yasu.isimatu, koki.sanagi On Wed, Nov 15, 2017 at 03:28:16PM +0100, Michal Hocko wrote: > On Wed 15-11-17 14:13:29, Mel Gorman wrote: > [...] > > I doubt anyone well. Even the original reporter appeared to pick that > > particular value just to trigger the OOM. > > Then why do we care at all? The trace buffer size can be configured from > the userspace if it is not sufficiently large IIRC. > I guess there is the potential that the trace buffer needs to be large enough early on in boot but I'm not sure why it would need to be that large to be honest. Bottom line, it's fairly trivial to just serialise meminit in the event that it's resized from the command line. I'm also ok with just leaving this as a "don't set the buffer that large" but I don't think spreading meminit concerns into ftrace is a good idea. -- Mel Gorman SUSE Labs
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-15 14:43 ` Mel Gorman @ 2017-11-15 14:57 ` Michal Hocko 2017-11-15 19:17 ` YASUAKI ISHIMATSU 0 siblings, 1 reply; 21+ messages in thread From: Michal Hocko @ 2017-11-15 14:57 UTC (permalink / raw) To: Mel Gorman Cc: Andrew Morton, linux-mm, linux-kernel, yasu.isimatu, koki.sanagi On Wed 15-11-17 14:43:14, Mel Gorman wrote: > On Wed, Nov 15, 2017 at 03:28:16PM +0100, Michal Hocko wrote: > > On Wed 15-11-17 14:13:29, Mel Gorman wrote: > > [...] > > > I doubt anyone well. Even the original reporter appeared to pick that > > > particular value just to trigger the OOM. > > > > Then why do we care at all? The trace buffer size can be configured from > > the userspace if it is not sufficiently large IIRC. > > > > I guess there is the potential that the trace buffer needs to be large > enough early on in boot but I'm not sure why it would need to be that large > to be honest. Bottom line, it's fairly trivial to just serialise meminit > in the event that it's resized from command line. I'm also ok with just > leaving this is as a "don't set the buffer that large" I would be reluctant to touch the code just because of an insane kernel command line option. That being said, I will not object to or block the patch, it just seems unnecessary for most reasonable setups I can think of. If there is a legitimate usage of such a large trace buffer then I wouldn't oppose. -- Michal Hocko SUSE Labs
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-15 14:57 ` Michal Hocko @ 2017-11-15 19:17 ` YASUAKI ISHIMATSU 2017-11-16 8:54 ` Michal Hocko 0 siblings, 1 reply; 21+ messages in thread From: YASUAKI ISHIMATSU @ 2017-11-15 19:17 UTC (permalink / raw) To: Michal Hocko, Mel Gorman Cc: Andrew Morton, linux-mm, linux-kernel, koki.sanagi, yasu.isimatu Hi Michal and Mel, To reproduce the issue, I specified a large trace buffer. The issue also occurs with trace_buf_size=12M and movable_node on 4.14.0. In my system, there are 384 CPUs and 8 nodes. So when not using the movable_node boot option, the kernel can use about 16GB of memory for the trace buffer, and it boots up with trace_buf_size=12M. But when using movable_node, 6 nodes are managed as MOVABLE_ZONE in my system and the kernel can use only about 4GB of memory for the trace buffer. So the trace buffer allocation fails with trace_buf_size=12M and movable_node. I don't know whether you still think 12M is large. But the latest Fujitsu server supports 448 CPUs, so the issue may occur with trace_buf_size=10M on that system. Additionally, the number of CPUs in a server is increasing year by year, so the issue will occur even if we don't specify a large trace buffer. Thanks, Yasuaki Ishimatsu On 11/15/2017 09:57 AM, Michal Hocko wrote: > On Wed 15-11-17 14:43:14, Mel Gorman wrote: >> On Wed, Nov 15, 2017 at 03:28:16PM +0100, Michal Hocko wrote: >>> On Wed 15-11-17 14:13:29, Mel Gorman wrote: >>> [...] >>>> I doubt anyone well. Even the original reporter appeared to pick that >>>> particular value just to trigger the OOM. >>> >>> Then why do we care at all? The trace buffer size can be configured from >>> the userspace if it is not sufficiently large IIRC. >>> >> >> I guess there is the potential that the trace buffer needs to be large >> enough early on in boot but I'm not sure why it would need to be that large >> to be honest.
Bottom line, it's fairly trivial to just serialise meminit >> in the event that it's resized from command line. I'm also ok with just >> leaving this is as a "don't set the buffer that large" > > I would be reluctant to touch the code just because of insane kernel > command line option. > > That being said, I will not object or block the patch it just seems > unnecessary for most reasonable setups I can think of. If there is a > legitimate usage of such a large trace buffer then I wouldn't oppose. >
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-15 19:17 ` YASUAKI ISHIMATSU @ 2017-11-16 8:54 ` Michal Hocko 2017-11-16 10:06 ` Mel Gorman 0 siblings, 1 reply; 21+ messages in thread From: Michal Hocko @ 2017-11-16 8:54 UTC (permalink / raw) To: YASUAKI ISHIMATSU Cc: Mel Gorman, Andrew Morton, linux-mm, linux-kernel, koki.sanagi On Wed 15-11-17 14:17:52, YASUAKI ISHIMATSU wrote: > Hi Michal and Mel, > > To reproduce the issue, I specified the large trace buffer. The issue also occurs with > trace_buf_size=12M and movable_node on 4.14.0. This is still 10x more than the default. Why do you need it in the first place? You can of course find a size that will not fit into the initial memory but I am questioning why you would want something like that during early boot in the first place. The whole deferred struct page allocation operates under the assumption that there are no large page allocator consumers that early during the boot process. If this assumption is not correct then we probably need a generic way to describe this. An ad-hoc trace-specific thing is far from ideal, imho. If anything, the first approach of disabling the deferred initialization via a kernel command line option sounds much more appropriate and simpler to me. -- Michal Hocko SUSE Labs
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-16 8:54 ` Michal Hocko @ 2017-11-16 10:06 ` Mel Gorman 2017-11-17 18:19 ` Pavel Tatashin 0 siblings, 1 reply; 21+ messages in thread From: Mel Gorman @ 2017-11-16 10:06 UTC (permalink / raw) To: Michal Hocko Cc: YASUAKI ISHIMATSU, Andrew Morton, linux-mm, linux-kernel, koki.sanagi On Thu, Nov 16, 2017 at 09:54:33AM +0100, Michal Hocko wrote: > On Wed 15-11-17 14:17:52, YASUAKI ISHIMATSU wrote: > > Hi Michal and Mel, > > > > To reproduce the issue, I specified the large trace buffer. The issue also occurs with > > trace_buf_size=12M and movable_node on 4.14.0. > > This is still 10x more than the default. Why do you need it in the first > place? You can of course find a size that will not fit into the initial > memory but I am questioning why do you want something like that during > early boot in the first place. > This confused me as well. I couldn't think of a sensible use-case for increasing the buffer other than stuffing trace_printk in multiple places for debugging purposes. Even in such cases, it would be feasible to disable the option in Kconfig just to have the large buffer. Otherwise, just wait until the system is booted and set it from userspace. The lack of a sensible use-case is why I took a fairly blunt approach to the problem. Keep it (relatively) simple and all that. > The whole deferred struct page allocation operates under assumption > that there are no large page allocator consumers that early during > the boot process. Yes. > If this assumption is not correct then we probably > need a generic way to describe this. Add-hoc trace specific thing is > far from idea, imho. If anything the first approach to disable the > deferred initialization via kernel command line option sounds much more > appropriate and simpler to me. So while the first approach was blunt, there are multiple other options. 1.
Parse trace_buf_size in __setup to record the information before page alloc init starts. Take that into account in reset_deferred_meminit to increase the amount of memory that is serially initialised. 2. Have tracing init with a small buffer and then resize it after page_alloc_init_late. This modifies tracing a bit but most of the helpers that are required are there. It would be more complex but it's doable. 3. Add a kernel command line parameter that explicitly disables deferred meminit. We used to have something like this but it was never merged as we should be able to estimate how much memory is needed to boot. 4. Put a check into the page allocator slowpath that triggers serialised init if the system is booting and an allocation is about to fail. It would be such a cold path that it would never be noticeable, although it would leave dead code in the kernel image once boot had completed. However, it would be preferable by far to have a sensible use-case as to why trace_buf_size= would be specified with a large buffer. It's hard to know what level of complexity is justified without it. -- Mel Gorman SUSE Labs
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-16 10:06 ` Mel Gorman @ 2017-11-17 18:19 ` Pavel Tatashin 2017-11-17 21:32 ` Mel Gorman 2017-11-21 1:04 ` Andrew Morton 0 siblings, 2 replies; 21+ messages in thread From: Pavel Tatashin @ 2017-11-17 18:19 UTC (permalink / raw) To: Mel Gorman Cc: Michal Hocko, YASUAKI ISHIMATSU, Andrew Morton, Linux Memory Management List, linux-kernel, koki.sanagi, Steve Sistare On Thu, Nov 16, 2017 at 5:06 AM, Mel Gorman <mgorman@techsingularity.net> wrote: > 4. Put a check into the page allocator slowpath that triggers serialised > init if the system is booting and an allocation is about to fail. It > would be such a cold path that it would never be noticable although it > would leave dead code in the kernel image once boot had completed Hi Mel, The fourth approach is the best as it is seamless for admins and engineers, and it will also work on any system configuration with any parameters without any special involvement. This approach will also address the following problem: reset_deferred_meminit() makes some assumptions about how much memory we will need beforehand, which may break periodically as kernel requirements change. For instance, I recently reduced the amount of memory system hash tables take on large machines [1], so the comment in that function is already outdated: /* * Initialise at least 2G of a node but also take into account that * two large system hashes that can take up 1GB for 0.25TB/node. */ With this approach we could always init a very small amount of struct pages, and allow the rest to be initialized on demand as boot requires until deferred struct pages are initialized. Since having the deferred pages feature assumes that the machine is large, there is no drawback in some extra bytes of dead code, especially as all the checks can be permanently switched off via static branches once deferred init is complete.
The second benefit that this approach may bring is the following: it may enable adding a new feature which would initialize struct pages on demand later, when needed by applications. This feature would be configurable or enabled via a kernel parameter (not sure which is better). if (allocation is failing) if (uninit struct pages available) init enough to finish alloc Again, once all pages are initialized, the checks will be turned off via static branching, so I think the code can be shared. Here is the rationale for this feature: Each physical machine may run a very large number of linux containers. Steve Sistare (CCed) recently studied how much memory each instance of clear container is taking, and it turns out to be about 125 MB, when containers are booted with 2G of memory and 1 CPU. Out of those 125 MB, 32 MB is consumed by the struct page array as we use 64-bytes per page. Admins tend to be protective in the amount of memory that is configured, therefore they may over-commit the amount of memory that is actually required by the container. So, by allowing struct pages to be initialized only on demand, we can save around 25% of the memory that is consumed by a fresh instance of a container. Now that struct pages are not zeroed during boot [2], if we implement the fourth option we can get closer to implementing complete on-demand struct page initialization. I can volunteer to work on these projects. [1] https://patchwork.kernel.org/patch/9599545/ [2] https://lwn.net/Articles/734374 Thank you, Pavel
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-17 18:19 ` Pavel Tatashin @ 2017-11-17 21:32 ` Mel Gorman 2017-11-30 3:41 ` Pavel Tatashin 2017-11-21 1:04 ` Andrew Morton 1 sibling, 1 reply; 21+ messages in thread From: Mel Gorman @ 2017-11-17 21:32 UTC (permalink / raw) To: Pavel Tatashin Cc: Michal Hocko, YASUAKI ISHIMATSU, Andrew Morton, Linux Memory Management List, linux-kernel, koki.sanagi, Steve Sistare On Fri, Nov 17, 2017 at 01:19:56PM -0500, Pavel Tatashin wrote: > On Thu, Nov 16, 2017 at 5:06 AM, Mel Gorman <mgorman@techsingularity.net> wrote: > > 4. Put a check into the page allocator slowpath that triggers serialised > > init if the system is booting and an allocation is about to fail. It > > would be such a cold path that it would never be noticable although it > > would leave dead code in the kernel image once boot had completed > > Hi Mel, > Hi Pavel, > The forth approach is the best as it is seamless for admins and > engineers, it will also work on any system configuration with any > parameters without any special involvement. > A lack of involvement from admins is indeed desirable. For example, while I might concede on using a disable-everything-switch, I would not be happy to introduce a switch that specified how much memory per node to initialise. For the fourth approach, I really would be only thinking of a blunt "initialise everything instead of going OOM". I was wary of making things too complicated and I worried about some side-effects I'll cover later. > This approach will also address the following problem: > reset_deferred_meminit() has some assumptions about how much memory we > will need beforehand may break periodically as kernel requirements > change.
> For, instance, I recently reduced amount of memory system hash > tables take on large machines [1], so the comment in that function is > already outdated: > /* > * Initialise at least 2G of a node but also take into account that > * two large system hashes that can take up 1GB for 0.25TB/node. > */ > True, that could be updated although I would not necessarily alter the value to minimise the memory requirements either. I would simply make the comment a bit more general. More on this in a bit. > With this approach we could always init a very small amount of struct > pages, and allow the rest to be initialized on demand as boot requires > until deferred struct pages are initialized. Since, having deferred > pages feature assumes that the machine is large, there is no drawback > of having some extra byte of dead code, especially that all the checks > can be permanently switched of via static branches once deferred init > is complete. > This is where I fear there may be dragons. If we minimise the number of struct pages and initialise serially as necessary, there is a danger that we'll allocate remote memory in cases where local memory would have done because a remote node had enough memory. To offset that risk, it would be necessary at boot-time to force allocations from the local node where possible and initialise more memory as necessary. That starts getting complicated because we'd need to adjust gfp-flags in the fast path with init-and-retry logic in the slow path and that could be a constant penalty. We could offset that in the fast path by using static branches but it's getting more and more complex for what is a minor optimisation -- shorter boot times on large machines where userspace itself could take a *long* time to get up and running (think database reading in 1TB of data from disk as it warms up).
> The second benefit that this approach may bring is the following: it > may enable to add a new feature which would initialize struct pages on > demand later, when needed by applications. This feature would be > configurable or enabled via kernel parameter (not sure which is > better). > > if (allocation is failing) > if (uninit struct pages available) > init enought to finish alloc > There is a hidden hazard with this as well -- benchmarking. Early in the lifetime of the system, it's going to be slower because we're initialising memory while measuring performance when previously no such work would be necessary. While it's somewhat of a corner-case, it's still real and it would generate reports. For example, I got burned once by a "regression" that was due to ext4's lazy_init feature because IO benchmarks appeared slower when in reality, it was only due to a fresh filesystem initialising. It was necessary to turn off the feature at mkfs time to get accurate measurements. I think the same could happen with memory and we'd have to special case some things. We'd want to be *very* sure there was a substantial benefit to the complexity. For benchmarking a system, we'd also need to be able to turn it off. > Again, once all pages are initialized, the checks will be turned off > via static branching, so I think the code can be shared. > > Here is the rationale for this feature: > > Each physical machine may run a very large number of linux containers. > Steve Sistare (CCed), recently studied how much memory each instance > of clear container is taking, and it turns out to be about 125 MB, > when containers are booted with 2G of memory and 1 CPU. Out of those > 125 MB, 32 MB is consumed by struct page array as we use 64-bytes per > page. Admins tend to be protective in the amount of memory that is > configured, therefore they may over-commit the amount of memory that > is actually required by the container. 
> So, by allowing struct pages to > be initialized only on demand, we can save around 25% of the memory > that is consumed by fresh instance of container. Now, that struct > pages are not zeroed during boot [2], and if we will implement the > forth option, we can get closer to implementing a complete on demand > struct page initialization. > > I can volunteer to work on these projects. > I accept the potential for packing more containers into the system. While I commend the work you've done so far, I'd be wary and warn you of going too far down this path. I wouldn't NAK patches going in this direction as long as they would eventually be behind static branches, but I don't feel it's the most urgent problem to work on either. This is why, even if I took the fourth option, it would be a blunt "init everything if we're going OOM" approach. However, that is my opinion and it's partially based on a lack of sensible use cases. I suspect you have better justification that would be included in changelogs. -- Mel Gorman SUSE Labs
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-17 21:32 ` Mel Gorman @ 2017-11-30 3:41 ` Pavel Tatashin 2017-12-06 10:50 ` Mel Gorman 0 siblings, 1 reply; 21+ messages in thread From: Pavel Tatashin @ 2017-11-30 3:41 UTC (permalink / raw) To: Mel Gorman Cc: Michal Hocko, YASUAKI ISHIMATSU, Andrew Morton, Linux Memory Management List, linux-kernel, koki.sanagi, Steve Sistare Hi Mel, Thank you very much for your feedback, my replies below: > A lack of involvement from admins is indeed desirable. For example, > while I might concede on using a disable-everything-switch, I would not > be happy to introduce a switch that specified how much memory per node > to initialise. > > For the fourth approach, I really would be only thinking of a blunt > "initialise everything instead of going OOM". I was wary of making things > too complicated and I worried about some side-effects I'll cover later. I see, I misunderstood your suggestion. Switching to serial initialization on OOM works; however, boot time becomes unpredictable: with some configurations boot is fast, with others it is slow. All of that depends on whether the predictions in reset_deferred_meminit() were good or not, which is not easy to debug for users. Also, over time the predictions in reset_deferred_meminit() can become way off, and I do not think that we want to continuously adjust this function. >> With this approach we could always init a very small amount of struct >> pages, and allow the rest to be initialized on demand as boot requires >> until deferred struct pages are initialized. Since having the deferred >> pages feature assumes that the machine is large, there is no drawback >> of having some extra bytes of dead code, especially as all the checks >> can be permanently switched off via static branches once deferred init >> is complete. >> > > This is where I fear there may be dragons.
If we minimise the number of > struct pages and initialise serially as necessary, there is a danger that > we'll allocate remote memory in cases where local memory would have done > because a remote node had enough memory. True, but doesn't what we have now have the same issue as well? If one node runs out of memory we start using memory from another node before deferred pages are initialized. To offset that risk, it would be > necessary at boot-time to force allocations from local node where possible > and initialise more memory as necessary. That starts getting complicated > because we'd need to adjust gfp-flags in the fast path with init-and-retry > logic in the slow path and that could be a constant penalty. We could offset > that in the fast path by using static branches I will try to implement this and see how complicated the patch will be; if it gets too complicated for the problem I am trying to solve we can return to one of your suggestions. I was thinking of doing something like this: Start with a very small amount of initialized pages in every node. If allocation fails, initialize enough struct pages to cover this particular allocation, with struct pages rounded up to section size, but in every single node. > but it's getting more and > more complex for what is a minor optimisation -- shorter boot times on > large machines where userspace itself could take a *long* time to get up > and running (think database reading in 1TB of data from disk as it warms up). On M6-32 with 32T [1] of memory it saves over 4 minutes of boot time, and this is on SPARC with 8K pages; on x86 it would be around 8 minutes because of twice as many pages. This feature improves availability for larger machines quite a bit. Over time, systems are growing, so I expect this feature to become a default configuration in the next several years on server configs.
> >> The second benefit that this approach may bring is the following: it >> may enable adding a new feature which would initialize struct pages on >> demand later, when needed by applications. This feature would be >> configurable or enabled via a kernel parameter (not sure which is >> better). >> >> if (allocation is failing) >> if (uninit struct pages available) >> init enough to finish alloc >> > > There is a hidden hazard with this as well -- benchmarking. Early in the > lifetime of the system, it's going to be slower because we're initialising > memory while measuring performance when previously no such work would be > necessary. While it's somewhat of a corner-case, it's still real and it would > generate reports. For example, I got burned once by a "regression" that was > due to ext4's lazy_init feature because IO benchmarks appeared slower when, in > reality, it was only due to a fresh filesystem initialising. It was necessary > to turn off the feature at mkfs time to get accurate measurements. I think > the same could happen with memory and we'd have to special-case some things. > > We'd want to be *very* sure there was a substantial benefit to the > complexity. For benchmarking a system, we'd also need to be able to turn > it off. > >> Again, once all pages are initialized, the checks will be turned off >> via static branching, so I think the code can be shared. >> >> Here is the rationale for this feature: >> >> Each physical machine may run a very large number of Linux containers. >> Steve Sistare (CCed) recently studied how much memory each instance >> of a Clear Container is taking, and it turns out to be about 125 MB, >> when containers are booted with 2G of memory and 1 CPU. Out of those >> 125 MB, 32 MB is consumed by the struct page array, as we use 64 bytes per >> page. Admins tend to be protective of the amount of memory that is >> configured, therefore they may over-commit the amount of memory that >> is actually required by the container.
So, by allowing struct pages to >> be initialized only on demand, we can save around 25% of the memory >> that is consumed by a fresh instance of a container. Now that struct >> pages are not zeroed during boot [2], and if we implement the >> fourth option, we can get closer to implementing a complete on-demand >> struct page initialization. >> >> I can volunteer to work on these projects. >> > > I accept the potential for packing more containers into the system, but while > I commend the work you've done so far, I'd warn you about going > too far down this path. I wouldn't NAK patches going in this direction as > long as they would eventually be behind static branches, but I don't feel > it's the most urgent problem to work on either. This is why, even if I took > the fourth option, it would be a blunt "init everything if we're going > OOM" approach. However, that is my opinion and it's partially based on a > lack of sensible use cases. I suspect you have better justification that > would be included in changelogs. Thank you for providing your opinion. I agree, the benefit of on-demand page initialization after boot must considerably outweigh the extra complexity and potential slowdown. At the moment I do not see it as a critical issue, so I won't be working on this second proposal. [1] http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/o13-066-sparc-m6-32-architecture-2016053.pdf Thank you, Pavel ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-11-30 3:41 ` Pavel Tatashin @ 2017-12-06 10:50 ` Mel Gorman 2018-01-31 17:28 ` Koki.Sanagi 0 siblings, 1 reply; 21+ messages in thread From: Mel Gorman @ 2017-12-06 10:50 UTC (permalink / raw) To: Pavel Tatashin Cc: Michal Hocko, YASUAKI ISHIMATSU, Andrew Morton, Linux Memory Management List, linux-kernel, koki.sanagi, Steve Sistare On Wed, Nov 29, 2017 at 10:41:59PM -0500, Pavel Tatashin wrote: > Hi Mel, > > Thank you very much for your feedback, my replies below: > > > A lack of involvement from admins is indeed desirable. For example, > > while I might concede on using a disable-everything-switch, I would not > > be happy to introduce a switch that specified how much memory per node > > to initialise. > > > > For the fourth approach, I really would be only thinking of a blunt > > "initialise everything instead of going OOM". I was wary of making things > > too complicated and I worried about some side-effects I'll cover later. > > I see, I misunderstood your suggestion. Switching to serial > initialization on OOM works; however, boot time becomes > unpredictable: with some configurations boot is fast, with others it is > slow. All of that depends on whether the predictions in > reset_deferred_meminit() were good or not, which is not easy to debug > for users. Also, over time the predictions in reset_deferred_meminit() can > become way off, and I do not think that we want to continuously > adjust this function. > You could increase the probability of a report by doing a WARN_ON_ONCE if the serialised meminit is used. > >> With this approach we could always init a very small amount of struct > >> pages, and allow the rest to be initialized on demand as boot requires > >> until deferred struct pages are initialized.
Since having the deferred > >> pages feature assumes that the machine is large, there is no drawback > >> of having some extra bytes of dead code, especially as all the checks > >> can be permanently switched off via static branches once deferred init > >> is complete. > >> > > > > This is where I fear there may be dragons. If we minimise the number of > > struct pages and initialise serially as necessary, there is a danger that > > we'll allocate remote memory in cases where local memory would have done > > because a remote node had enough memory. > > True, but doesn't what we have now have the same issue as well? If one > node runs out of memory we start using memory from another node > before deferred pages are initialized. > It's possible but I'm not aware of it happening currently. > To offset that risk, it would be > > necessary at boot-time to force allocations from local node where possible > > and initialise more memory as necessary. That starts getting complicated > > because we'd need to adjust gfp-flags in the fast path with init-and-retry > > logic in the slow path and that could be a constant penalty. We could offset > > that in the fast path by using static branches > > I will try to implement this and see how complicated the patch will > be; if it gets too complicated for the problem I am trying to solve we > can return to one of your suggestions. > > I was thinking of doing something like this: > > Start with a very small amount of initialized pages in every node. > If allocation fails, initialize enough struct pages to cover this > particular allocation, with struct pages rounded up to section size, but > in every single node. > Ok, just make sure it's all in the slow paths of the allocator when the alternative is to fail the allocation.
> > but it's getting more and > > more complex for what is a minor optimisation -- shorter boot times on > > large machines where userspace itself could take a *long* time to get up > > and running (think database reading in 1TB of data from disk as it warms up). > > On M6-32 with 32T [1] of memory it saves over 4 minutes of boot time, > and this is on SPARC with 8K pages; on x86 it would be around 8 > minutes because of twice as many pages. This feature improves > availability for larger machines quite a bit. Over time, systems are > growing, so I expect this feature to become a default configuration in > the next several years on server configs. > Ok, when developing the series originally, I had no machine even close to 32T of memory. -- Mel Gorman SUSE Labs ^ permalink raw reply [flat|nested] 21+ messages in thread
* RE: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2017-12-06 10:50 ` Mel Gorman @ 2018-01-31 17:28 ` Koki.Sanagi 2018-01-31 18:24 ` Pavel Tatashin 0 siblings, 1 reply; 21+ messages in thread From: Koki.Sanagi @ 2018-01-31 17:28 UTC (permalink / raw) To: Mel Gorman, Pavel Tatashin Cc: Michal Hocko, YASUAKI ISHIMATSU, Andrew Morton, Linux Memory Management List, linux-kernel, Steve Sistare, msys.mizuma Pavel, I assume you are working on the fix. Do you have any progress ? Koki ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2018-01-31 17:28 ` Koki.Sanagi @ 2018-01-31 18:24 ` Pavel Tatashin 2018-02-05 14:14 ` Masayoshi Mizuma 0 siblings, 1 reply; 21+ messages in thread From: Pavel Tatashin @ 2018-01-31 18:24 UTC (permalink / raw) To: Koki.Sanagi Cc: Mel Gorman, Michal Hocko, YASUAKI ISHIMATSU, Andrew Morton, Linux Memory Management List, linux-kernel, Steve Sistare, msys.mizuma Hi Koki, Yes, the patch is here: https://lkml.org/lkml/2018/1/12/600 It has not been reviewed yet. Pavel On Wed, Jan 31, 2018 at 12:28 PM, Koki.Sanagi@us.fujitsu.com <Koki.Sanagi@us.fujitsu.com> wrote: > Pavel, > > I assume you are working on the fix. > Do you have any progress ? > > Koki ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2018-01-31 18:24 ` Pavel Tatashin @ 2018-02-05 14:14 ` Masayoshi Mizuma 2018-02-05 15:26 ` Pavel Tatashin 0 siblings, 1 reply; 21+ messages in thread From: Masayoshi Mizuma @ 2018-02-05 14:14 UTC (permalink / raw) To: pasha.tatashin, Koki.Sanagi Cc: mgorman, mhocko, yasu.isimatu, akpm, linux-mm, linux-kernel, steven.sistare Hello Pavel, > Yes, the patch is here: > https://lkml.org/lkml/2018/1/12/600 I tested your patch in my box and it worked well. Please feel free to add the following. Tested-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com> You may repost the patch after adding your reply for Andrew's comment as [PATCH 0/1]... - Masayoshi ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified 2018-02-05 14:14 ` Masayoshi Mizuma @ 2018-02-05 15:26 ` Pavel Tatashin 0 siblings, 0 replies; 21+ messages in thread From: Pavel Tatashin @ 2018-02-05 15:26 UTC (permalink / raw) To: Masayoshi Mizuma Cc: Koki.Sanagi, Mel Gorman, Michal Hocko, YASUAKI ISHIMATSU, Andrew Morton, Linux Memory Management List, Linux Kernel Mailing List, Steve Sistare Thank you Masayoshi for verifying this work. I will submit it as you suggested. Pavel On Mon, Feb 5, 2018 at 9:14 AM, Masayoshi Mizuma <msys.mizuma@gmail.com> wrote: > Hello Pavel, > >> Yes, the patch is here: >> https://lkml.org/lkml/2018/1/12/600 > > I tested your patch in my box and it worked well. > Please feel free to add the following. > > Tested-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com> > > You may repost the patch after adding your reply for > Andrew's comment as [PATCH 0/1]... > > - Masayoshi > > Wed, 31 Jan 2018 13:24:55 -0500 Pavel Tatashin wrote: >> Hi Koki, >> >> Yes, the patch is here: >> https://lkml.org/lkml/2018/1/12/600 >> >> It has not been reviewed yet. >> >> Pavel >> >> On Wed, Jan 31, 2018 at 12:28 PM, Koki.Sanagi@us.fujitsu.com >> <Koki.Sanagi@us.fujitsu.com> wrote: >>> Pavel, >>> >>> I assume you are working on the fix. >>> Do you have any progress ? 
>>> >>> Koki >>> >>>>> -----Original Message----- >>>>> From: Mel Gorman [mailto:mgorman@techsingularity.net] >>>>> Sent: Wednesday, December 06, 2017 5:50 AM >>>>> To: Pavel Tatashin <pasha.tatashin@oracle.com> >>>>> Cc: Michal Hocko <mhocko@kernel.org>; YASUAKI ISHIMATSU >>>>> <yasu.isimatu@gmail.com>; Andrew Morton <akpm@linux-foundation.org>; >>>>> Linux Memory Management List <linux-mm@kvack.org>; linux- >>>>> kernel@vger.kernel.org; Sanagi, Koki <Koki.Sanagi@us.fujitsu.com>; Steve >>>>> Sistare <steven.sistare@oracle.com> >>>>> Subject: Re: [PATCH] mm, meminit: Serially initialise deferred memory if >>>>> trace_buf_size is specified >>>>> >>>>> On Wed, Nov 29, 2017 at 10:41:59PM -0500, Pavel Tatashin wrote: >>>>>> Hi Mel, >>>>>> >>>>>> Thank you very much for your feedback, my replies below: >>>>>> >>>>>>> A lack of involvement from admins is indeed desirable. For example, >>>>>>> while I might concede on using a disable-everything-switch, I would >>>>>>> not be happy to introduce a switch that specified how much memory >>>>>>> per node to initialise. >>>>>>> >>>>>>> For the forth approach, I really would be only thinking of a blunt >>>>>>> "initialise everything instead of going OOM". I was wary of making >>>>>>> things too complicated and I worried about some side-effects I'll cover later. >>>>>> >>>>>> I see, I misunderstood your suggestion. Switching to serial >>>>>> initialization when OOM works, however, boot time becomes >>>>>> unpredictable, with some configurations boot is fast with others it is >>>>>> slow. All of that depends on whether predictions in >>>>>> reset_deferred_meminit() were good or not which is not easy to debug >>>>>> for users. Also, overtime predictions in reset_deferred_meminit() can >>>>>> become very off, and I do not think that we want to continuously >>>>>> adjust this function. >>>>>> >>>>> >>>>> You could increase the probabilty of a report by doing a WARN_ON_ONCE if the >>>>> serialised meminit is used. 
>>>>>
>>>>>>>> With this approach we could always init a very small amount of
>>>>>>>> struct pages, and allow the rest to be initialized on demand as
>>>>>>>> boot requires until deferred struct pages are initialized. Since
>>>>>>>> having the deferred pages feature assumes that the machine is large,
>>>>>>>> there is no drawback in having some extra bytes of dead code,
>>>>>>>> especially as all the checks can be permanently switched off via
>>>>>>>> static branches once deferred init is complete.
>>>>>>>>
>>>>>>>
>>>>>>> This is where I fear there may be dragons. If we minimise the number
>>>>>>> of struct pages and initialise serially as necessary, there is a
>>>>>>> danger that we'll allocate remote memory in cases where local memory
>>>>>>> would have done because a remote node had enough memory.
>>>>>>
>>>>>> True, but doesn't what we have now have the same issue as well? If one
>>>>>> node runs out of memory we start using memory from another node,
>>>>>> before deferred pages are initialized?
>>>>>>
>>>>>
>>>>> It's possible but I'm not aware of it happening currently.
>>>>>
>>>>>>> To offset that risk, it would be
>>>>>>> necessary at boot time to force allocations from the local node where
>>>>>>> possible and initialise more memory as necessary. That starts
>>>>>>> getting complicated because we'd need to adjust GFP flags in the
>>>>>>> fast path with init-and-retry logic in the slow path, and that could
>>>>>>> be a constant penalty. We could offset that in the fast path by
>>>>>>> using static branches.
>>>>>>
>>>>>> I will try to implement this and see how complicated the patch will
>>>>>> be; if it gets too complicated for the problem I am trying to solve we
>>>>>> can return to one of your suggestions.
>>>>>>
>>>>>> I was thinking to do something like this:
>>>>>>
>>>>>> Start with a very small amount of initialized pages in every node.
>>>>>> If allocation fails, initialize enough struct pages to cover this
>>>>>> particular allocation, with struct pages rounded up to section size, but
>>>>>> in every single node.
>>>>>>
>>>>>
>>>>> Ok, just make sure it's all in the slow paths of the allocator when the alternative
>>>>> is to fail the allocation.
>>>>>
>>>>>>> but it's getting more and
>>>>>>> more complex for what is a minor optimisation -- shorter boot times
>>>>>>> on large machines where userspace itself could take a *long* time to
>>>>>>> get up and running (think database reading in 1TB of data from disk as it
>>>>>>> warms up).
>>>>>>
>>>>>> On M6-32 with 32T [1] of memory it saves over 4 minutes of boot time,
>>>>>> and this is on SPARC with 8K pages; on x86 it would be around 8
>>>>>> minutes because of twice as many pages. This feature improves
>>>>>> availability for larger machines quite a bit. Over time, systems are
>>>>>> growing, so I expect this feature to become a default configuration in
>>>>>> the next several years on server configs.
>>>>>>
>>>>>
>>>>> Ok, when developing the series originally, I had no machine even close to 32T of
>>>>> memory.
>>>>>
>>>>> --
>>>>> Mel Gorman
>>>>> SUSE Labs
>>>
>>> --
>>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>>> the body to majordomo@kvack.org. For more info on Linux MM,
>>> see: http://www.linux-mm.org/ .
>>> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified
  2017-11-17 18:19 ` Pavel Tatashin
  2017-11-17 21:32 ` Mel Gorman
@ 2017-11-21 1:04 ` Andrew Morton
  2017-11-30 3:49 ` Pavel Tatashin
  1 sibling, 1 reply; 21+ messages in thread
From: Andrew Morton @ 2017-11-21 1:04 UTC (permalink / raw)
To: Pavel Tatashin
Cc: Mel Gorman, Michal Hocko, YASUAKI ISHIMATSU,
    Linux Memory Management List, linux-kernel, koki.sanagi, Steve Sistare

On Fri, 17 Nov 2017 13:19:56 -0500 Pavel Tatashin <pasha.tatashin@oracle.com> wrote:

> On Thu, Nov 16, 2017 at 5:06 AM, Mel Gorman <mgorman@techsingularity.net> wrote:
> > 4. Put a check into the page allocator slowpath that triggers serialised
> >    init if the system is booting and an allocation is about to fail. It
> >    would be such a cold path that it would never be noticeable, although it
> >    would leave dead code in the kernel image once boot had completed
>
> Hi Mel,
>
> The fourth approach is the best as it is seamless for admins and
> engineers; it will also work on any system configuration with any
> parameters without any special involvement.

Apart from what-mel-said, I'd be concerned that this failsafe would
almost never get tested. We should find some way to ensure that this
code gets exercised in some people's kernels on a permanent basis and
I'm not sure how to do that.

One option might be to ask Fengguang to add the occasional
test_pavels_stuff=1 to the kernel boot commandline. That's better
than nothing but 0-day only runs on a small number of machine types.

^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified
  2017-11-21 1:04 ` Andrew Morton
@ 2017-11-30 3:49 ` Pavel Tatashin
  0 siblings, 0 replies; 21+ messages in thread
From: Pavel Tatashin @ 2017-11-30 3:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Michal Hocko, YASUAKI ISHIMATSU,
    Linux Memory Management List, linux-kernel, koki.sanagi, Steve Sistare

>> Hi Mel,
>>
>> The fourth approach is the best as it is seamless for admins and
>> engineers; it will also work on any system configuration with any
>> parameters without any special involvement.
>
> Apart from what-mel-said, I'd be concerned that this failsafe would
> almost never get tested. We should find some way to ensure that this
> code gets exercised in some people's kernels on a permanent basis and
> I'm not sure how to do that.
>
> One option might be to ask Fengguang to add the occasional
> test_pavels_stuff=1 to the kernel boot commandline. That's better
> than nothing but 0-day only runs on a small number of machine types.
>

Hi Andrew,

Excellent point about testing. I think that if I implement it the way
I proposed in the previous e-mail:

1. initialize very few struct pages initially
2. initialize more as the kernel needs them, in every node
3. finally initialize all the rest when the other CPUs are started

we will have coverage for my code every time a machine boots (with the
deferred page init feature configured), as the initial very few struct
pages are not going to be enough on any machine. Potentially, we will
also see some small boot time improvement, because we will initialize
serially only as many pages as needed, and not do upper-bound guessing
about how many pages are needed beforehand.

Thank you,
Pavel

^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified
  2017-11-15 8:55 [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified Mel Gorman
  2017-11-15 11:55 ` Michal Hocko
@ 2017-11-15 19:49 ` Andrew Morton
  2017-11-16 8:39 ` Mel Gorman
  1 sibling, 1 reply; 21+ messages in thread
From: Andrew Morton @ 2017-11-15 19:49 UTC (permalink / raw)
To: Mel Gorman; +Cc: linux-mm, linux-kernel, yasu.isimatu, koki.sanagi

On Wed, 15 Nov 2017 08:55:56 +0000 Mel Gorman <mgorman@techsingularity.net> wrote:

> Yasuaki Ishimatsu reported a premature OOM when trace_buf_size=100m was
> specified on a machine with many CPUs. The kernel tried to allocate 38.4GB
> but only 16GB was available due to deferred memory initialisation.
>
> The allocation context is within smp_init() so there are no opportunities
> to do the deferred meminit earlier. Furthermore, the partial initialisation
> of memory occurs before the size of the trace buffers is set so there is
> no opportunity to adjust the amount of memory that is pre-initialised. We
> could potentially catch when memory is low during system boot and adjust the
> amount that is initialised serially but it's a little clumsy as it would
> require a check in the failure path of the page allocator. Given that
> deferred meminit is basically a minor optimisation that only benefits very
> large machines and trace_buf_size is somewhat specialised, it follows that
> the most straight-forward option is to go back to serialised meminit if
> trace_buf_size is specified.

Patch is rather messy.

I went cross-eyed trying to work out how tracing allocates that buffer,
but I assume it ends up somewhere in the page allocator. If the page
allocator is about to fail an allocation request and sees that memory
initialization is still ongoing, surely the page allocator should just
wait? That seems to be the most general fix?
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified
  2017-11-15 19:49 ` Andrew Morton
@ 2017-11-16 8:39 ` Mel Gorman
  0 siblings, 0 replies; 21+ messages in thread
From: Mel Gorman @ 2017-11-16 8:39 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-mm, linux-kernel, yasu.isimatu, koki.sanagi

On Wed, Nov 15, 2017 at 11:49:19AM -0800, Andrew Morton wrote:
> On Wed, 15 Nov 2017 08:55:56 +0000 Mel Gorman <mgorman@techsingularity.net> wrote:
>
> > Yasuaki Ishimatsu reported a premature OOM when trace_buf_size=100m was
> > specified on a machine with many CPUs. The kernel tried to allocate 38.4GB
> > but only 16GB was available due to deferred memory initialisation.
> >
> > The allocation context is within smp_init() so there are no opportunities
> > to do the deferred meminit earlier. Furthermore, the partial initialisation
> > of memory occurs before the size of the trace buffers is set so there is
> > no opportunity to adjust the amount of memory that is pre-initialised. We
> > could potentially catch when memory is low during system boot and adjust the
> > amount that is initialised serially but it's a little clumsy as it would
> > require a check in the failure path of the page allocator. Given that
> > deferred meminit is basically a minor optimisation that only benefits very
> > large machines and trace_buf_size is somewhat specialised, it follows that
> > the most straight-forward option is to go back to serialised meminit if
> > trace_buf_size is specified.
>
> Patch is rather messy.
>
> I went cross-eyed trying to work out how tracing allocates that buffer,
> but I assume it ends up somewhere in the page allocator.
Basic path is

[ ] __alloc_pages_slowpath+0x9a6/0xba7
[ ] __alloc_pages_nodemask+0x26a/0x290
[ ] new_slab+0x297/0x500
[ ] ___slab_alloc+0x335/0x4a0
[ ] __slab_alloc+0x40/0x66
[ ] __kmalloc_node+0xbd/0x270
[ ] __rb_allocate_pages+0xae/0x180
[ ] rb_allocate_cpu_buffer+0x204/0x2f0
[ ] trace_rb_cpu_prepare+0x7e/0xc5
[ ] cpuhp_invoke_callback+0x3ea/0x5c0
[ ] _cpu_up+0xbc/0x190
[ ] do_cpu_up+0x87/0xb0
[ ] cpu_up+0x13/0x20
[ ] smp_init+0x69/0xca
[ ] kernel_init_freeable+0x115/0x244

Note that it's during smp_init() and part of the CPU onlining, which is
before deferred meminit can start.

> If the page
> allocator is about to fail an allocation request and sees that memory
> initialization is still ongoing, surely the page allocator should just
> wait? That seems to be the most general fix?
>

In other contexts yes, but as deferred meminit has not started, there is
nothing to wait for yet.

--
Mel Gorman
SUSE Labs

^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2018-02-05 15:27 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-15 8:55 [PATCH] mm, meminit: Serially initialise deferred memory if trace_buf_size is specified Mel Gorman
2017-11-15 11:55 ` Michal Hocko
2017-11-15 14:13   ` Mel Gorman
2017-11-15 14:28     ` Michal Hocko
2017-11-15 14:43       ` Mel Gorman
2017-11-15 14:57         ` Michal Hocko
2017-11-15 19:17           ` YASUAKI ISHIMATSU
2017-11-16 8:54             ` Michal Hocko
2017-11-16 10:06               ` Mel Gorman
2017-11-17 18:19                 ` Pavel Tatashin
2017-11-17 21:32                   ` Mel Gorman
2017-11-30 3:41                     ` Pavel Tatashin
2017-12-06 10:50                       ` Mel Gorman
2018-01-31 17:28                         ` Koki.Sanagi
2018-01-31 18:24                           ` Pavel Tatashin
2018-02-05 14:14                             ` Masayoshi Mizuma
2018-02-05 15:26                               ` Pavel Tatashin
2017-11-21 1:04                   ` Andrew Morton
2017-11-30 3:49                     ` Pavel Tatashin
2017-11-15 19:49 ` Andrew Morton
2017-11-16 8:39   ` Mel Gorman