* [RFC][PATCH] tunable zone watermarks
@ 2005-03-28 6:43 Levent Serinol
2005-03-28 19:30 ` Martin J. Bligh
0 siblings, 1 reply; 6+ messages in thread
From: Levent Serinol @ 2005-03-28 6:43 UTC (permalink / raw)
To: linux-mm; +Cc: marcelo.tosatti
===========================================================
--- linux-2.6.11.4/include/linux/sysctl.h.org 2005-03-16
02:09:07.000000000 +0200
+++ linux-2.6.11.4/include/linux/sysctl.h 2005-03-27
20:33:17.000000000 +0300
@@ -169,6 +169,7 @@ enum
VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual
address space layout */
VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
+ VM_ZONE_WATERMARKS=29, /* zone watermarks */
};
--- linux-2.6.11.4/include/linux/mmzone.h.org 2005-03-16
02:09:07.000000000 +0200
+++ linux-2.6.11.4/include/linux/mmzone.h 2005-03-27
20:33:17.000000000 +0300
@@ -27,6 +27,12 @@ struct free_area {
struct pglist_data;
+typedef struct zone_watermarks_vals {
+ unsigned long pages_min;
+ unsigned long pages_low;
+ unsigned long pages_high;
+ }zone_watermarks_vals_t;
+
/*
* zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
* So add a wild amount of padding here to ensure that they fall into separate
@@ -364,6 +370,8 @@ struct ctl_table;
struct file;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
+int zone_watermarks_sysctl_handler(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
--- linux-2.6.11.4/mm/page_alloc.c.org 2005-03-16 02:09:27.000000000 +0200
+++ linux-2.6.11.4/mm/page_alloc.c 2005-03-27 20:33:53.000000000 +0300
@@ -66,7 +66,11 @@ EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
-
+#ifdef CONFIG_NUMA
+zone_watermarks_vals_t zone_watermarks_sysctl[num_online_nodes() *
MAX_NR_ZONES];
+#else
+zone_watermarks_vals_t zone_watermarks_sysctl[MAX_NUMNODES * MAX_NR_ZONES];
+#endif
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
@@ -1911,6 +1915,24 @@ void __init page_alloc_init(void)
hotcpu_notifier(page_alloc_cpu_notify, 0);
}
+static void setup_zone_watermarks_vals(void)
+{
+ pg_data_t *pgdat;
+ unsigned int j,i;
+
+ j=0;
+ for_each_pgdat(pgdat) {
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ zone_watermarks_sysctl[j].pages_min = K(zone->pages_min);
+ zone_watermarks_sysctl[j].pages_low = K(zone->pages_low);
+ zone_watermarks_sysctl[j].pages_high = K(zone->pages_high);
+ j++;
+ }
+ }
+}
+
/*
* setup_per_zone_lowmem_reserve - called whenever
* sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
@@ -1990,6 +2012,7 @@ static void setup_per_zone_pages_min(voi
zone->pages_high = (zone->pages_min * 6) / 4;
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
+ setup_zone_watermarks_vals();
}
/*
@@ -2029,6 +2052,7 @@ static int __init init_per_zone_pages_mi
min_free_kbytes = 65536;
setup_per_zone_pages_min();
setup_per_zone_lowmem_reserve();
+ setup_zone_watermarks_vals();
return 0;
}
module_init(init_per_zone_pages_min)
@@ -2046,6 +2070,66 @@ int min_free_kbytes_sysctl_handler(ctl_t
return 0;
}
+int zone_watermarks_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t
*length, loff_t *ppos)
+{
+ unsigned long flags;
+ unsigned long zone_pages = 0;
+ unsigned long lowmem_pages = 0;
+ pg_data_t *pgdat;
+ unsigned int j,i;
+ int err;
+
+
+ err = proc_dointvec(table, write, file, buffer, length, ppos);
+
+ if ((err >= 0) && write) {
+ j=0;
+ for_each_pgdat(pgdat) {
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+ if (!is_highmem(zone))
+ lowmem_pages += zone->present_pages;
+ }
+ }
+ for_each_pgdat(pgdat) {
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+ unsigned long lowmem_min;
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ zone_pages =
(zone_watermarks_sysctl[j].pages_min >> (PAGE_SHIFT - 10));
+ if (is_highmem(zone)) {
+ if (zone_pages < SWAP_CLUSTER_MAX)
+ zone_pages = SWAP_CLUSTER_MAX;
+ if (zone_pages >= zone->present_pages)
+ zone_pages =
zone->present_pages;
+ zone->pages_min = zone_pages;
+
+ } else {
+ lowmem_min = (zone_pages *
zone->present_pages) /
+ lowmem_pages;
+ if (lowmem_min > zone_pages)
+ zone_pages = lowmem_min;
+ zone->pages_min = zone_pages;
+ }
+ zone_pages =
(zone_watermarks_sysctl[j].pages_low >> (PAGE_SHIFT - 10));
+ if (zone_pages >= zone->present_pages)
+ zone_pages = zone->present_pages;
+ zone->pages_low = zone_pages;
+ zone_pages =
(zone_watermarks_sysctl[j].pages_high >> (PAGE_SHIFT - 10));
+ if (zone_pages >= zone->present_pages)
+ zone_pages = zone->present_pages;
+ zone->pages_high = zone_pages;
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ j++;
+ }
+ }
+ }
+
+ return 0;
+}
+
/*
* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
--- linux-2.6.11.4/kernel/sysctl.c.org 2005-03-16 02:09:00.000000000 +0200
+++ linux-2.6.11.4/kernel/sysctl.c 2005-03-27 20:33:17.000000000 +0300
@@ -62,6 +62,11 @@ extern char core_pattern[];
extern int cad_pid;
extern int pid_max;
extern int min_free_kbytes;
+#ifdef CONFIG_NUMA
+extern zone_watermarks_vals_t
zone_watermarks_sysctl[num_online_nodes() * MAX_NR_ZONES];
+#else
+extern zone_watermarks_vals_t zone_watermarks_sysctl[MAX_NUMNODES *
MAX_NR_ZONES];
+#endif
extern int printk_ratelimit_jiffies;
extern int printk_ratelimit_burst;
extern int pid_max_min, pid_max_max;
@@ -825,6 +830,15 @@ static ctl_table vm_table[] = {
.strategy = &sysctl_jiffies,
},
#endif
+ {
+ .ctl_name = VM_ZONE_WATERMARKS,
+ .procname = "zone_watermarks",
+ .data = &zone_watermarks_sysctl,
+ .maxlen = sizeof(zone_watermarks_sysctl),
+ .mode = 0644,
+ .proc_handler = &zone_watermarks_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ },
{ .ctl_name = 0 }
};
===========================================================
--
Stay out of the road, if you want to grow old.
~ Pink Floyd ~.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread* Re: [RFC][PATCH] tunable zone watermarks
2005-03-28 6:43 [RFC][PATCH] tunable zone watermarks Levent Serinol
@ 2005-03-28 19:30 ` Martin J. Bligh
2005-03-28 19:51 ` Marcelo Tosatti
0 siblings, 1 reply; 6+ messages in thread
From: Martin J. Bligh @ 2005-03-28 19:30 UTC (permalink / raw)
To: Levent Serinol, linux-mm; +Cc: marcelo.tosatti
What situations do you want to tune them in? Would be nicer to do this
automagically ...
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC][PATCH] tunable zone watermarks
2005-03-28 19:30 ` Martin J. Bligh
@ 2005-03-28 19:51 ` Marcelo Tosatti
2005-03-29 0:45 ` Martin J. Bligh
0 siblings, 1 reply; 6+ messages in thread
From: Marcelo Tosatti @ 2005-03-28 19:51 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: Levent Serinol, linux-mm
Hi Martin,
On Mon, Mar 28, 2005 at 11:30:32AM -0800, Martin J. Bligh wrote:
> What situations do you want to tune them in? Would be nicer to do this
> automagically ...
We do it automagically right now. What do you mean?
It is useful for testing purpose - but then you need an understanding of
MM internals to make good use of it.
The only tweak available now is lowmem_reserve.
I'm sure there are loads where the default watermark values are
not optimal.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC][PATCH] tunable zone watermarks
2005-03-28 19:51 ` Marcelo Tosatti
@ 2005-03-29 0:45 ` Martin J. Bligh
2005-03-29 2:27 ` Nick Piggin
0 siblings, 1 reply; 6+ messages in thread
From: Martin J. Bligh @ 2005-03-29 0:45 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Levent Serinol, linux-mm
> On Mon, Mar 28, 2005 at 11:30:32AM -0800, Martin J. Bligh wrote:
>> What situations do you want to tune them in? Would be nicer to do this
>> automagically ...
>
> We do it automagically right now. What do you mean?
>
> It is useful for testing purpose - but then you need an understanding of
> MM internals to make good use of it.
>
> The only tweak available now is lowmem_reserve.
>
> I'm sure there are loads where the default watermark values are
> not optimal.
Yeah, I'm just not at all convinced that the solution to these problems
is to make everything tunable up the wazoo ... all that seems to do is
1) Encourage customers to break their systems in new and interesting ways
2) Line the pockets of "tuning consultants".
If there are loads where the default watermark values are not optimal
(and I agree there probably are) then what we really need is to auto
recognise those, and fix them in the OS ... rather than provide a tweakable.
I see that being able to poke those is useful in diagnosing the above ...
just not sure we want those in mainline. Perhaps we should have
CONFIG_TWEAK_EVERYTHING_UP_THE_WAZOO and not enable it in distros,
or by default. But as an IBM employee, I can assure you IBM would
whine mercilessly at the distros until they turned it on, so I'm not
sure it helps ;-)
I've been in customer situations dealing with 10 billion tunables before,
it makes life impossible ;-(
M.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC][PATCH] tunable zone watermarks
2005-03-29 0:45 ` Martin J. Bligh
@ 2005-03-29 2:27 ` Nick Piggin
2005-03-29 9:10 ` Levent Serinol
0 siblings, 1 reply; 6+ messages in thread
From: Nick Piggin @ 2005-03-29 2:27 UTC (permalink / raw)
To: Martin J. Bligh; +Cc: Marcelo Tosatti, Levent Serinol, linux-mm
Martin J. Bligh wrote:
>>On Mon, Mar 28, 2005 at 11:30:32AM -0800, Martin J. Bligh wrote:
>>
>>>What situations do you want to tune them in? Would be nicer to do this
>>>automagically ...
>>>
>>We do it automagically right now. What do you mean?
>>
>>It is useful for testing purpose - but then you need an understanding of
>>MM internals to make good use of it.
>>
>>The only tweak available now is lowmem_reserve.
>>
>>
min_free_kbytes is closer to what you want (sorry if it has already
been mentioned).
>>I'm sure there are loads where the default watermark values are
>>not optimal.
>>
>
>Yeah, I'm just not at all convinced that the solution to these problems
>is to make everything tunable up the wazoo ... all that seems to do is
>
>1) Encourage customers to break their systems in new and interesting ways
>2) Line the pockets of "tuning consultants".
>
>If there are loads where the default watermark values are not optimal
>(and I agree there probably are) then what we really need is to auto
>recognise those, and fix them in the OS ... rather than provide a tweakable.
>
>I see that being able to poke those is useful in diagnosing the above ...
>just not sure we want those in mainline. Perhaps we should have
>CONFIG_TWEAK_EVERYTHING_UP_THE_WAZOO and not enable it in distros,
>or by default. But as an IBM employee, I can assure you IBM would
>whine mercilessly at the distros until they turned it on, so I'm not
>sure it helps ;-)
>
>I've been in customer situations dealing with 10 billion tunables before,
>it makes life impossible ;-(
>
>
I agree, FWIW. The *first* barrier to make something tunable in the
kernel.org
kernel should be a real world(ish) case where current heuristics fall
down (if
I do this, then setting "blah" to X gives a 200% improvement, wheras if
I do that,
then X is bad and Y gives a 200% improvement).
The second condition should be that attempts to make the heuristic
automatically
handle those cases fails or results in too intrusive / complex code.
And I guess thirdly, it should be documented and understandable to
(usable by)
non kernel hackers.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC][PATCH] tunable zone watermarks
2005-03-29 2:27 ` Nick Piggin
@ 2005-03-29 9:10 ` Levent Serinol
0 siblings, 0 replies; 6+ messages in thread
From: Levent Serinol @ 2005-03-29 9:10 UTC (permalink / raw)
To: Nick Piggin; +Cc: Martin J. Bligh, Marcelo Tosatti, linux-mm
Hi Nick,
On Tue, 29 Mar 2005 12:27:09 +1000, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Martin J. Bligh wrote:
>
> >>On Mon, Mar 28, 2005 at 11:30:32AM -0800, Martin J. Bligh wrote:
> >>
> >>>What situations do you want to tune them in? Would be nicer to do this
> >>>automagically ...
> >>>
> >>We do it automagically right now. What do you mean?
> >>
> >>It is useful for testing purpose - but then you need an understanding of
> >>MM internals to make good use of it.
> >>
> >>The only tweak available now is lowmem_reserve.
> >>
> >>
>
> min_free_kbytes is closer to what you want (sorry if it has already
> been mentioned).
Yes, closer but has a fixed ratio between min,low,high limits. For
example if you like to kick kswapd earlier and let it to make more
work, you have no chance with min_free_kbytes. You can only kick
kswapd earlier but you cannot tell it to free how much pages you
want due to fixed ratio. Therefore, tuning zone watermarks makes it
more customizable.
Also, as Mr Tosatti mentioned you can use this tunables for debugging.
Also, as you know There're many tunable things in kernel that can
vanish your system If you don't know what u're doing ;-)
For example on Solaris, you have an option to tell mm how much memory
you want it to try to free (desfree). It's default value is lotsfree /
2, but you have such an option to define desfree besides it default
value. with min_free_kbytes you don't have such an option :-(
Documentation is no problem. But if you still think that, this
tunables are very dangerous it
can go into CONFIG_DEBUG_KERNEL stuff.
>
> >>I'm sure there are loads where the default watermark values are
> >>not optimal.
> >>
> >
> >Yeah, I'm just not at all convinced that the solution to these problems
> >is to make everything tunable up the wazoo ... all that seems to do is
> >
> >1) Encourage customers to break their systems in new and interesting ways
> >2) Line the pockets of "tuning consultants".
> >
> >If there are loads where the default watermark values are not optimal
> >(and I agree there probably are) then what we really need is to auto
> >recognise those, and fix them in the OS ... rather than provide a tweakable.
> >
> >I see that being able to poke those is useful in diagnosing the above ...
> >just not sure we want those in mainline. Perhaps we should have
> >CONFIG_TWEAK_EVERYTHING_UP_THE_WAZOO and not enable it in distros,
> >or by default. But as an IBM employee, I can assure you IBM would
> >whine mercilessly at the distros until they turned it on, so I'm not
> >sure it helps ;-)
> >
> >I've been in customer situations dealing with 10 billion tunables before,
> >it makes life impossible ;-(
> >
> >
>
> I agree, FWIW. The *first* barrier to make something tunable in the
> kernel.org
> kernel should be a real world(ish) case where current heuristics fall
> down (if
> I do this, then setting "blah" to X gives a 200% improvement, wheras if
> I do that,
> then X is bad and Y gives a 200% improvement).
>
> The second condition should be that attempts to make the heuristic
> automatically
> handle those cases fails or results in too intrusive / complex code.
>
> And I guess thirdly, it should be documented and understandable to
> (usable by)
> non kernel hackers.
>
>
--
Stay out of the road, if you want to grow old.
~ Pink Floyd ~.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2005-03-29 9:10 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-03-28 6:43 [RFC][PATCH] tunable zone watermarks Levent Serinol
2005-03-28 19:30 ` Martin J. Bligh
2005-03-28 19:51 ` Marcelo Tosatti
2005-03-29 0:45 ` Martin J. Bligh
2005-03-29 2:27 ` Nick Piggin
2005-03-29 9:10 ` Levent Serinol
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox