* [RFC 2/7] cpuset write pdflush nodemask
2007-04-23 23:20 [RFC 1/7] cpuset write dirty map Ethan Solomita
@ 2007-04-23 23:30 ` Ethan Solomita
2007-04-23 23:31 ` [RFC 3/7] cpuset write throttle Ethan Solomita
` (4 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Ethan Solomita @ 2007-04-23 23:30 UTC (permalink / raw)
To: linux-mm
If we want to support nodeset-specific writeout then we need a way
to communicate the set of nodes that an operation should affect.
So add a nodemask_t parameter to the pdflush functions, and store
the nodemask in the pdflush control structure.
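As an illustrative sketch (not part of the patch): callers that do not
care about placement pass NULL and keep the old behaviour, while
node-aware callers hand in an explicit mask.

	/* Sketch only: old behaviour, writeout unrestricted by node. */
	wakeup_pdflush(1024, NULL);

	/* Sketch only: restrict writeout to the current cpuset's nodes. */
	wakeup_pdflush(1024, &cpuset_current_mems_allowed);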
Originally by Christoph Lameter <clameter@sgi.com>
Signed-off-by: Ethan Solomita <solo@google.com>
---
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 1/fs/buffer.c 2/fs/buffer.c
--- 1/fs/buffer.c 2007-04-23 14:59:52.000000000 -0700
+++ 2/fs/buffer.c 2007-04-23 15:00:03.000000000 -0700
@@ -366,7 +366,7 @@ static void free_more_memory(void)
struct zone **zones;
pg_data_t *pgdat;
- wakeup_pdflush(1024);
+ wakeup_pdflush(1024, NULL);
yield();
for_each_online_pgdat(pgdat) {
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 1/fs/super.c 2/fs/super.c
--- 1/fs/super.c 2007-04-23 14:22:35.000000000 -0700
+++ 2/fs/super.c 2007-04-23 14:37:06.000000000 -0700
@@ -619,7 +619,7 @@ int do_remount_sb(struct super_block *sb
return 0;
}
-static void do_emergency_remount(unsigned long foo)
+static void do_emergency_remount(unsigned long foo, nodemask_t *bar)
{
struct super_block *sb;
@@ -647,7 +647,7 @@ static void do_emergency_remount(unsigne
void emergency_remount(void)
{
- pdflush_operation(do_emergency_remount, 0);
+ pdflush_operation(do_emergency_remount, 0, NULL);
}
/*
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 1/fs/sync.c 2/fs/sync.c
--- 1/fs/sync.c 2007-04-23 14:22:35.000000000 -0700
+++ 2/fs/sync.c 2007-04-23 14:37:06.000000000 -0700
@@ -21,9 +21,9 @@
* sync everything. Start out by waking pdflush, because that writes back
* all queues in parallel.
*/
-static void do_sync(unsigned long wait)
+static void do_sync(unsigned long wait, nodemask_t *unused)
{
- wakeup_pdflush(0);
+ wakeup_pdflush(0, NULL);
sync_inodes(0); /* All mappings, inodes and their blockdevs */
DQUOT_SYNC(NULL);
sync_supers(); /* Write the superblocks */
@@ -38,13 +38,13 @@ static void do_sync(unsigned long wait)
asmlinkage long sys_sync(void)
{
- do_sync(1);
+ do_sync(1, NULL);
return 0;
}
void emergency_sync(void)
{
- pdflush_operation(do_sync, 0);
+ pdflush_operation(do_sync, 0, NULL);
}
/*
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 1/include/linux/writeback.h 2/include/linux/writeback.h
--- 1/include/linux/writeback.h 2007-04-23 14:28:13.000000000 -0700
+++ 2/include/linux/writeback.h 2007-04-23 14:37:06.000000000 -0700
@@ -82,7 +82,7 @@ static inline void wait_on_inode(struct
/*
* mm/page-writeback.c
*/
-int wakeup_pdflush(long nr_pages);
+int wakeup_pdflush(long nr_pages, nodemask_t *nodes);
void laptop_io_completion(void);
void laptop_sync_completion(void);
void throttle_vm_writeout(gfp_t gfp_mask);
@@ -119,7 +119,8 @@ balance_dirty_pages_ratelimited(struct a
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
void *data);
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
+int pdflush_operation(void (*fn)(unsigned long, nodemask_t *nodes),
+ unsigned long arg0, nodemask_t *nodes);
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int write_cache_pages(struct address_space *mapping,
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 1/mm/page-writeback.c 2/mm/page-writeback.c
--- 1/mm/page-writeback.c 2007-04-23 14:28:13.000000000 -0700
+++ 2/mm/page-writeback.c 2007-04-23 14:37:06.000000000 -0700
@@ -101,7 +101,7 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
-static void background_writeout(unsigned long _min_pages);
+static void background_writeout(unsigned long _min_pages, nodemask_t *nodes);
/*
* Work out the current dirty-memory clamping and background writeout
@@ -272,7 +272,7 @@ static void balance_dirty_pages(struct a
*/
if ((laptop_mode && pages_written) ||
(!laptop_mode && (nr_reclaimable > background_thresh)))
- pdflush_operation(background_writeout, 0);
+ pdflush_operation(background_writeout, 0, NULL);
}
void set_page_dirty_balance(struct page *page)
@@ -362,7 +362,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
* writeback at least _min_pages, and keep writing until the amount of dirty
* memory is less than the background threshold, or until we're all clean.
*/
-static void background_writeout(unsigned long _min_pages)
+static void background_writeout(unsigned long _min_pages, nodemask_t *unused)
{
long min_pages = _min_pages;
struct writeback_control wbc = {
@@ -402,12 +402,12 @@ static void background_writeout(unsigned
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
* -1 if all pdflush threads were busy.
*/
-int wakeup_pdflush(long nr_pages)
+int wakeup_pdflush(long nr_pages, nodemask_t *nodes)
{
if (nr_pages == 0)
nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- return pdflush_operation(background_writeout, nr_pages);
+ return pdflush_operation(background_writeout, nr_pages, nodes);
}
static void wb_timer_fn(unsigned long unused);
@@ -431,7 +431,7 @@ DEFINE_TIMER(laptop_mode_wb_timer, lapto
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
-static void wb_kupdate(unsigned long arg)
+static void wb_kupdate(unsigned long arg, nodemask_t *unused)
{
unsigned long oldest_jif;
unsigned long start_jif;
@@ -491,18 +491,18 @@ int dirty_writeback_centisecs_handler(ct
static void wb_timer_fn(unsigned long unused)
{
- if (pdflush_operation(wb_kupdate, 0) < 0)
+ if (pdflush_operation(wb_kupdate, 0, NULL) < 0)
mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
}
-static void laptop_flush(unsigned long unused)
+static void laptop_flush(unsigned long unused, nodemask_t *unused2)
{
sys_sync();
}
static void laptop_timer_fn(unsigned long unused)
{
- pdflush_operation(laptop_flush, 0);
+ pdflush_operation(laptop_flush, 0, NULL);
}
/*
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 1/mm/pdflush.c 2/mm/pdflush.c
--- 1/mm/pdflush.c 2007-04-23 14:22:36.000000000 -0700
+++ 2/mm/pdflush.c 2007-04-23 14:37:06.000000000 -0700
@@ -83,10 +83,12 @@ static unsigned long last_empty_jifs;
*/
struct pdflush_work {
struct task_struct *who; /* The thread */
- void (*fn)(unsigned long); /* A callback function */
+ void (*fn)(unsigned long, nodemask_t *); /* A callback function */
unsigned long arg0; /* An argument to the callback */
struct list_head list; /* On pdflush_list, when idle */
unsigned long when_i_went_to_sleep;
+ int have_nodes; /* Nodes were specified */
+ nodemask_t nodes; /* Nodes of interest */
};
static int __pdflush(struct pdflush_work *my_work)
@@ -123,7 +125,8 @@ static int __pdflush(struct pdflush_work
}
spin_unlock_irq(&pdflush_lock);
- (*my_work->fn)(my_work->arg0);
+ (*my_work->fn)(my_work->arg0,
+ my_work->have_nodes ? &my_work->nodes : NULL);
/*
* Thread creation: For how long have there been zero
@@ -197,7 +200,8 @@ static int pdflush(void *dummy)
* Returns zero if it indeed managed to find a worker thread, and passed your
* payload to it.
*/
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
+int pdflush_operation(void (*fn)(unsigned long, nodemask_t *),
+ unsigned long arg0, nodemask_t *nodes)
{
unsigned long flags;
int ret = 0;
@@ -217,6 +221,11 @@ int pdflush_operation(void (*fn)(unsigne
last_empty_jifs = jiffies;
pdf->fn = fn;
pdf->arg0 = arg0;
+ if (nodes) {
+ pdf->nodes = *nodes;
+ pdf->have_nodes = 1;
+ } else
+ pdf->have_nodes = 0;
wake_up_process(pdf->who);
spin_unlock_irqrestore(&pdflush_lock, flags);
}
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 1/mm/vmscan.c 2/mm/vmscan.c
--- 1/mm/vmscan.c 2007-04-23 14:22:36.000000000 -0700
+++ 2/mm/vmscan.c 2007-04-23 14:37:06.000000000 -0700
@@ -1174,7 +1174,7 @@ unsigned long try_to_free_pages(struct z
*/
if (total_scanned > sc.swap_cluster_max +
sc.swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned, NULL);
sc.may_writepage = 1;
}
--
* [RFC 3/7] cpuset write throttle
2007-04-23 23:20 [RFC 1/7] cpuset write dirty map Ethan Solomita
2007-04-23 23:30 ` [RFC 2/7] cpuset write pdflush nodemask Ethan Solomita
@ 2007-04-23 23:31 ` Ethan Solomita
2007-04-23 23:31 ` [RFC 4/7] cpuset write vmscan Ethan Solomita
` (3 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Ethan Solomita @ 2007-04-23 23:31 UTC (permalink / raw)
To: linux-mm
Make page writeback obey cpuset constraints
Currently dirty throttling does not work properly in a cpuset.
If, for example, a cpuset contains only 1/10th of available memory, then all
of the cpuset's memory can be dirtied without any writes being triggered.
If all of the cpuset's memory is dirty then only 10% of total memory is dirty.
The background writeback threshold is usually set at 10% and the synchronous
threshold at 40%, so we are still below the global limits while the dirty
ratio in the cpuset is 100%! Writeback throttling and background writeout
do not work at all in such scenarios.
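To make those numbers concrete: on a machine with 100GB of memory and a
cpuset limited to 10GB, a task can dirty all 10GB of its cpuset. Globally
that is only 10% dirty — right at the background threshold and nowhere near
the 40% synchronous threshold — yet locally the cpuset is 100% dirty and
completely unthrottled.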
This patch makes dirty writeout cpuset aware. When determining the
dirty limits in get_dirty_limits() we calculate values based on the
nodes that are reachable from the current process (that has been
dirtying the page). Then we can trigger writeout based on the
dirty ratio of the memory in the cpuset.
We trigger writeout in a cpuset-specific way. We go through the dirty
inodes and search for inodes that have dirty pages on the nodes of the
active cpuset. If an inode fulfills that requirement then we begin writeout
of its dirty pages.
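A minimal sketch of that check as it might appear in the writeback loop
(illustrative only; mapping->dirty_nodes stands in for the per-inode dirty
map added in patch 1/7 of this series):

	/*
	 * Sketch only: skip inodes that have no dirty pages on the
	 * nodes of interest. wbc->nodes is the field added by this
	 * series; mapping->dirty_nodes is an assumed name.
	 */
	if (wbc->nodes &&
	    !nodes_intersects(*wbc->nodes, mapping->dirty_nodes))
		continue;	/* nothing to write for this cpuset */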
Adding up all the counters for each node in a cpuset may seem to be quite
an expensive operation (in particular for large cpusets with hundreds of
nodes) compared to just accessing the global counters if we do not have
a cpuset. However, the global counters were only introduced recently:
before 2.6.18 we added up per-processor counters for each processor on
every invocation of get_dirty_limits(). We now add up per-node information
instead, which should be equal or lesser effort since there are fewer
nodes than processors.
Originally by Christoph Lameter <clameter@sgi.com>
Signed-off-by: Ethan Solomita <solo@google.com>
---
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 2/mm/page-writeback.c 3/mm/page-writeback.c
--- 2/mm/page-writeback.c 2007-04-23 14:37:06.000000000 -0700
+++ 3/mm/page-writeback.c 2007-04-23 15:10:46.000000000 -0700
@@ -103,6 +103,14 @@ EXPORT_SYMBOL(laptop_mode);
static void background_writeout(unsigned long _min_pages, nodemask_t *nodes);
+struct dirty_limits {
+ long thresh_background;
+ long thresh_dirty;
+ unsigned long nr_dirty;
+ unsigned long nr_unstable;
+ unsigned long nr_writeback;
+};
+
/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
@@ -121,13 +129,15 @@ static void background_writeout(unsigned
* clamping level.
*/
-static unsigned long highmem_dirtyable_memory(unsigned long total)
+static unsigned long highmem_dirtyable_memory(nodemask_t *nodes, unsigned long total)
{
#ifdef CONFIG_HIGHMEM
int node;
unsigned long x = 0;
- for_each_online_node(node) {
+ if (nodes == NULL)
+ nodes = &node_online_map;
+ for_each_node_mask(node, *nodes) {
struct zone *z =
&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
@@ -154,13 +164,13 @@ static unsigned long determine_dirtyable
x = global_page_state(NR_FREE_PAGES)
+ global_page_state(NR_INACTIVE)
+ global_page_state(NR_ACTIVE);
- x -= highmem_dirtyable_memory(x);
+ x -= highmem_dirtyable_memory(NULL, x);
return x + 1; /* Ensure that we never return 0 */
}
-static void
-get_dirty_limits(long *pbackground, long *pdirty,
- struct address_space *mapping)
+static int
+get_dirty_limits(struct dirty_limits *dl, struct address_space *mapping,
+ nodemask_t *nodes)
{
int background_ratio; /* Percentages */
int dirty_ratio;
@@ -168,12 +178,60 @@ get_dirty_limits(long *pbackground, long
long background;
long dirty;
unsigned long available_memory = determine_dirtyable_memory();
+ unsigned long dirtyable_memory;
+ unsigned long nr_mapped;
struct task_struct *tsk;
+ int is_subset = 0;
+
+#ifdef CONFIG_CPUSETS
+ if (unlikely(nodes &&
+ !nodes_subset(node_online_map, *nodes))) {
+ int node;
+
+ /*
+ * Calculate the limits relative to the current cpuset.
+ *
+ * We do not disregard highmem because all nodes (except
+ * maybe node 0) have either all memory in HIGHMEM (32 bit) or
+ * all memory in non HIGHMEM (64 bit). If we would disregard
+ * highmem then cpuset throttling would not work on 32 bit.
+ */
+ is_subset = 1;
+ memset(dl, 0, sizeof(struct dirty_limits));
+ dirtyable_memory = 0;
+ nr_mapped = 0;
+ for_each_node_mask(node, *nodes) {
+ if (!node_online(node))
+ continue;
+ dl->nr_dirty += node_page_state(node, NR_FILE_DIRTY);
+ dl->nr_unstable +=
+ node_page_state(node, NR_UNSTABLE_NFS);
+ dl->nr_writeback +=
+ node_page_state(node, NR_WRITEBACK);
+ dirtyable_memory +=
+ node_page_state(node, NR_ACTIVE) +
+ node_page_state(node, NR_INACTIVE) +
+ node_page_state(node, NR_FREE_PAGES);
+ nr_mapped += node_page_state(node, NR_FILE_MAPPED) +
+ node_page_state(node, NR_ANON_PAGES);
+ }
+ dirtyable_memory -= highmem_dirtyable_memory(nodes,
+ dirtyable_memory);
+ } else
+#endif
+ {
+ /* Global limits */
+ dl->nr_dirty = global_page_state(NR_FILE_DIRTY);
+ dl->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ dl->nr_writeback = global_page_state(NR_WRITEBACK);
+ dirtyable_memory = determine_dirtyable_memory();
+ nr_mapped = global_page_state(NR_FILE_MAPPED) +
+ global_page_state(NR_ANON_PAGES);
+ }
unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES)) * 100) /
- available_memory;
-
+ vm_total_pages;
dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
dirty_ratio = unmapped_ratio / 2;
@@ -185,15 +243,16 @@ get_dirty_limits(long *pbackground, long
if (background_ratio >= dirty_ratio)
background_ratio = dirty_ratio / 2;
- background = (background_ratio * available_memory) / 100;
- dirty = (dirty_ratio * available_memory) / 100;
+ background = (background_ratio * dirtyable_memory) / 100;
+ dirty = (dirty_ratio * dirtyable_memory) / 100;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
background += background / 4;
dirty += dirty / 4;
}
- *pbackground = background;
- *pdirty = dirty;
+ dl->thresh_background = background;
+ dl->thresh_dirty = dirty;
+ return is_subset;
}
/*
@@ -206,8 +265,7 @@ get_dirty_limits(long *pbackground, long
static void balance_dirty_pages(struct address_space *mapping)
{
long nr_reclaimable;
- long background_thresh;
- long dirty_thresh;
+ struct dirty_limits dl;
unsigned long pages_written = 0;
unsigned long write_chunk = sync_writeback_pages();
@@ -222,11 +280,12 @@ static void balance_dirty_pages(struct a
.range_cyclic = 1,
};
- get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
- dirty_thresh)
+ if (get_dirty_limits(&dl, mapping,
+ &cpuset_current_mems_allowed))
+ wbc.nodes = &cpuset_current_mems_allowed;
+ nr_reclaimable = dl.nr_dirty + dl.nr_unstable;
+ if (nr_reclaimable + dl.nr_writeback <=
+ dl.thresh_dirty)
break;
if (!dirty_exceeded)
@@ -240,13 +299,10 @@ static void balance_dirty_pages(struct a
*/
if (nr_reclaimable) {
writeback_inodes(&wbc);
- get_dirty_limits(&background_thresh,
- &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable +
- global_page_state(NR_WRITEBACK)
- <= dirty_thresh)
+ get_dirty_limits(&dl, mapping,
+ &cpuset_current_mems_allowed);
+ nr_reclaimable = dl.nr_dirty + dl.nr_unstable;
+ if (nr_reclaimable + dl.nr_writeback <= dl.thresh_dirty)
break;
pages_written += write_chunk - wbc.nr_to_write;
if (pages_written >= write_chunk)
@@ -255,8 +311,8 @@ static void balance_dirty_pages(struct a
congestion_wait(WRITE, HZ/10);
}
- if (nr_reclaimable + global_page_state(NR_WRITEBACK)
- <= dirty_thresh && dirty_exceeded)
+ if (nr_reclaimable + dl.nr_writeback
+ <= dl.thresh_dirty && dirty_exceeded)
dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -271,8 +327,9 @@ static void balance_dirty_pages(struct a
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && (nr_reclaimable > background_thresh)))
- pdflush_operation(background_writeout, 0, NULL);
+ (!laptop_mode && (nr_reclaimable > dl.thresh_background)))
+ pdflush_operation(background_writeout, 0,
+ &cpuset_current_mems_allowed);
}
void set_page_dirty_balance(struct page *page)
@@ -329,8 +386,7 @@ EXPORT_SYMBOL(balance_dirty_pages_rateli
void throttle_vm_writeout(gfp_t gfp_mask)
{
- long background_thresh;
- long dirty_thresh;
+ struct dirty_limits dl;
if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
/*
@@ -342,27 +398,26 @@ void throttle_vm_writeout(gfp_t gfp_mask
return;
}
- for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ for ( ; ; ) {
+ get_dirty_limits(&dl, NULL, &node_online_map);
- /*
- * Boost the allowable dirty threshold a bit for page
- * allocators so they don't get DoS'ed by heavy writers
- */
- dirty_thresh += dirty_thresh / 10; /* wheeee... */
-
- if (global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK) <= dirty_thresh)
- break;
- congestion_wait(WRITE, HZ/10);
- }
+ /*
+ * Boost the allowable dirty threshold a bit for page
+ * allocators so they don't get DoS'ed by heavy writers
+ */
+ dl.thresh_dirty += dl.thresh_dirty / 10; /* wheeee... */
+
+ if (dl.nr_unstable + dl.nr_writeback <= dl.thresh_dirty)
+ break;
+ congestion_wait(WRITE, HZ/10);
+ }
}
/*
* writeback at least _min_pages, and keep writing until the amount of dirty
* memory is less than the background threshold, or until we're all clean.
*/
-static void background_writeout(unsigned long _min_pages, nodemask_t *unused)
+static void background_writeout(unsigned long _min_pages, nodemask_t *nodes)
{
long min_pages = _min_pages;
struct writeback_control wbc = {
@@ -375,12 +430,11 @@ static void background_writeout(unsigned
};
for ( ; ; ) {
- long background_thresh;
- long dirty_thresh;
+ struct dirty_limits dl;
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
+ if (get_dirty_limits(&dl, NULL, nodes))
+ wbc.nodes = nodes;
+ if (dl.nr_dirty + dl.nr_unstable < dl.thresh_background
&& min_pages <= 0)
break;
wbc.encountered_congestion = 0;
--
* [RFC 4/7] cpuset write vmscan
2007-04-23 23:20 [RFC 1/7] cpuset write dirty map Ethan Solomita
2007-04-23 23:30 ` [RFC 2/7] cpuset write pdflush nodemask Ethan Solomita
2007-04-23 23:31 ` [RFC 3/7] cpuset write throttle Ethan Solomita
@ 2007-04-23 23:31 ` Ethan Solomita
2007-04-23 23:35 ` [Corrected] " Ethan Solomita
2007-04-23 23:32 ` [RFC 5/7] cpuset write vm writeout Ethan Solomita
` (2 subsequent siblings)
5 siblings, 1 reply; 8+ messages in thread
From: Ethan Solomita @ 2007-04-23 23:31 UTC (permalink / raw)
To: linux-mm
Direct reclaim: cpuset aware writeout
During direct reclaim we traverse down a zonelist, carefully checking
whether each zone is a member of the active cpuset. But then we call
pdflush without enforcing the same restrictions. On a larger system this
may have the effect of a massive amount of pages being dirtied, after
which either
A. No writeout occurs because global dirty limits have not been reached
or
B. Writeout starts randomly for some dirty inode in the system. Pdflush
may just write out data for nodes in another cpuset and miss doing
proper dirty handling for the current cpuset.
In both cases dirty pages in the zones of interest may not be affected
and writeout may not occur as necessary.
Fix that by restricting pdflush to the active cpuset. Writeout will occur
from direct reclaim the same way as without a cpuset.
Originally by Christoph Lameter <clameter@sgi.com>
Signed-off-by: Ethan Solomita <solo@google.com>
---
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 3/mm/vmscan.c 4/mm/vmscan.c
--- 3/mm/vmscan.c 2007-04-23 14:37:28.000000000 -0700
+++ 4/mm/vmscan.c 2007-04-23 14:37:32.000000000 -0700
@@ -1174,7 +1174,8 @@ unsigned long try_to_free_pages(struct z
*/
if (total_scanned > sc.swap_cluster_max +
sc.swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned, NULL);
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned,
+ &cpuset_current_mems_allowed);
sc.may_writepage = 1;
}
--
* [Corrected] [RFC 4/7] cpuset write vmscan
2007-04-23 23:31 ` [RFC 4/7] cpuset write vmscan Ethan Solomita
@ 2007-04-23 23:35 ` Ethan Solomita
0 siblings, 0 replies; 8+ messages in thread
From: Ethan Solomita @ 2007-04-23 23:35 UTC (permalink / raw)
To: linux-mm
Direct reclaim: cpuset aware writeout
During direct reclaim we traverse down a zonelist, carefully checking
whether each zone is a member of the active cpuset. But then we call
pdflush without enforcing the same restrictions. On a larger system this
may have the effect of a massive amount of pages being dirtied, after
which either
A. No writeout occurs because global dirty limits have not been reached
or
B. Writeout starts randomly for some dirty inode in the system. Pdflush
may just write out data for nodes in another cpuset and miss doing
proper dirty handling for the current cpuset.
In both cases dirty pages in the zones of interest may not be affected
and writeout may not occur as necessary.
Fix that by restricting pdflush to the active cpuset. Writeout will occur
from direct reclaim the same way as without a cpuset.
Originally by Christoph Lameter <clameter@sgi.com>
Signed-off-by: Ethan Solomita <solo@google.com>
---
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 3/mm/vmscan.c 4/mm/vmscan.c
--- 3/mm/vmscan.c 2007-04-23 14:37:28.000000000 -0700
+++ 4/mm/vmscan.c 2007-04-23 14:37:32.000000000 -0700
@@ -1174,7 +1174,8 @@ unsigned long try_to_free_pages(struct z
*/
if (total_scanned > sc.swap_cluster_max +
sc.swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned, NULL);
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned,
+ &cpuset_current_mems_allowed);
sc.may_writepage = 1;
}
--
* [RFC 5/7] cpuset write vm writeout
2007-04-23 23:20 [RFC 1/7] cpuset write dirty map Ethan Solomita
` (2 preceding siblings ...)
2007-04-23 23:31 ` [RFC 4/7] cpuset write vmscan Ethan Solomita
@ 2007-04-23 23:32 ` Ethan Solomita
2007-04-23 23:33 ` [RFC 6/7] cpuset write fixes Ethan Solomita
2007-04-23 23:33 ` [RFC 7/7] cpuset dirty limits Ethan Solomita
5 siblings, 0 replies; 8+ messages in thread
From: Ethan Solomita @ 2007-04-23 23:32 UTC (permalink / raw)
To: linux-mm
Throttle VM writeout in a cpuset aware way
This bases the VM throttling in the reclaim path on the dirty ratio
of the cpuset. Note that the cpuset is only effective if shrink_zone is
called from direct reclaim; kswapd has a cpuset context that includes the
whole machine, so VM throttling will only work during synchronous reclaim
and not from kswapd.
Originally by Christoph Lameter <clameter@sgi.com>
Signed-off-by: Ethan Solomita <solo@google.com>
---
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 4/include/linux/writeback.h 5/include/linux/writeback.h
--- 4/include/linux/writeback.h 2007-04-23 14:37:31.000000000 -0700
+++ 5/include/linux/writeback.h 2007-04-23 14:37:51.000000000 -0700
@@ -85,7 +85,7 @@ static inline void wait_on_inode(struct
int wakeup_pdflush(long nr_pages, nodemask_t *nodes);
void laptop_io_completion(void);
void laptop_sync_completion(void);
-void throttle_vm_writeout(gfp_t gfp_mask);
+void throttle_vm_writeout(nodemask_t *nodes, gfp_t gfp_mask);
extern struct timer_list laptop_mode_wb_timer;
static inline int laptop_spinned_down(void)
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 4/mm/page-writeback.c 5/mm/page-writeback.c
--- 4/mm/page-writeback.c 2007-04-23 15:12:27.000000000 -0700
+++ 5/mm/page-writeback.c 2007-04-23 15:13:15.000000000 -0700
@@ -384,7 +384,7 @@ void balance_dirty_pages_ratelimited_nr(
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
-void throttle_vm_writeout(gfp_t gfp_mask)
+void throttle_vm_writeout(nodemask_t *nodes, gfp_t gfp_mask)
{
struct dirty_limits dl;
@@ -399,7 +399,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
}
for ( ; ; ) {
- get_dirty_limits(&dl, NULL, &node_online_map);
+ get_dirty_limits(&dl, NULL, nodes);
/*
* Boost the allowable dirty threshold a bit for page
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 4/mm/vmscan.c 5/mm/vmscan.c
--- 4/mm/vmscan.c 2007-04-23 14:37:32.000000000 -0700
+++ 5/mm/vmscan.c 2007-04-23 14:37:51.000000000 -0700
@@ -1055,7 +1055,7 @@ static unsigned long shrink_zone(int pri
}
}
- throttle_vm_writeout(sc->gfp_mask);
+ throttle_vm_writeout(&cpuset_current_mems_allowed, sc->gfp_mask);
atomic_dec(&zone->reclaim_in_progress);
return nr_reclaimed;
--
* [RFC 6/7] cpuset write fixes
2007-04-23 23:20 [RFC 1/7] cpuset write dirty map Ethan Solomita
` (3 preceding siblings ...)
2007-04-23 23:32 ` [RFC 5/7] cpuset write vm writeout Ethan Solomita
@ 2007-04-23 23:33 ` Ethan Solomita
2007-04-23 23:33 ` [RFC 7/7] cpuset dirty limits Ethan Solomita
5 siblings, 0 replies; 8+ messages in thread
From: Ethan Solomita @ 2007-04-23 23:33 UTC (permalink / raw)
To: linux-mm
Remove unneeded local variable.
Originally by Christoph Lameter <clameter@sgi.com>
Signed-off-by: Ethan Solomita <solo@google.com>
---
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 5/mm/page-writeback.c 6/mm/page-writeback.c
--- 5/mm/page-writeback.c 2007-04-23 15:13:15.000000000 -0700
+++ 6/mm/page-writeback.c 2007-04-23 15:14:25.000000000 -0700
@@ -177,7 +177,6 @@ get_dirty_limits(struct dirty_limits *dl
int unmapped_ratio;
long background;
long dirty;
- unsigned long available_memory = determine_dirtyable_memory();
unsigned long dirtyable_memory;
unsigned long nr_mapped;
struct task_struct *tsk;
--
* [RFC 7/7] cpuset dirty limits
2007-04-23 23:20 [RFC 1/7] cpuset write dirty map Ethan Solomita
` (4 preceding siblings ...)
2007-04-23 23:33 ` [RFC 6/7] cpuset write fixes Ethan Solomita
@ 2007-04-23 23:33 ` Ethan Solomita
5 siblings, 0 replies; 8+ messages in thread
From: Ethan Solomita @ 2007-04-23 23:33 UTC (permalink / raw)
To: linux-mm
Per cpuset dirty ratios
This implements dirty ratios per cpuset. Two new files are added
to the cpuset directories:
background_dirty_ratio  Percentage at which background writeback starts
throttle_dirty_ratio    Percentage at which the application is throttled
                        and we start synchronous writeout.
Both variables are set to -1 by default, which means that the global
limits (/proc/sys/vm/dirty_ratio and /proc/sys/vm/dirty_background_ratio)
are used for a cpuset.
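For example, with the cpuset filesystem mounted at /dev/cpuset and a
hypothetical cpuset named "cs" (both the mount point and the name are
illustrative):

	echo 5  > /dev/cpuset/cs/background_dirty_ratio
	echo 20 > /dev/cpuset/cs/throttle_dirty_ratio
	echo -1 > /dev/cpuset/cs/throttle_dirty_ratio   # fall back to the global limit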
Originally by Christoph Lameter <clameter@sgi.com>
Signed-off-by: Ethan Solomita <solo@google.com>
---
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 6/include/linux/cpuset.h 7/include/linux/cpuset.h
--- 6/include/linux/cpuset.h 2007-04-23 14:38:07.000000000 -0700
+++ 7/include/linux/cpuset.h 2007-04-23 14:38:39.000000000 -0700
@@ -75,6 +75,7 @@ static inline int cpuset_do_slab_mem_spr
extern void cpuset_track_online_nodes(void);
+extern void cpuset_get_current_ratios(int *background, int *ratio);
/*
* We need macros since struct address_space is not defined yet
*/
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 6/kernel/cpuset.c 7/kernel/cpuset.c
--- 6/kernel/cpuset.c 2007-04-23 14:38:08.000000000 -0700
+++ 7/kernel/cpuset.c 2007-04-23 14:38:39.000000000 -0700
@@ -50,6 +50,7 @@
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#include <linux/writeback.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>
@@ -100,6 +101,9 @@ struct cpuset {
int mems_generation;
struct fmeter fmeter; /* memory_pressure filter */
+
+ int background_dirty_ratio;
+ int throttle_dirty_ratio;
};
/* bits in struct cpuset flags field */
@@ -177,6 +181,8 @@ static struct cpuset top_cpuset = {
.count = ATOMIC_INIT(0),
.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
.children = LIST_HEAD_INIT(top_cpuset.children),
+ .background_dirty_ratio = -1,
+ .throttle_dirty_ratio = -1,
};
static struct vfsmount *cpuset_mount;
@@ -1009,6 +1015,21 @@ static int update_flag(cpuset_flagbits_t
return 0;
}
+static int update_int(int *cs_int, char *buf, int min, int max)
+{
+ char *endp;
+ int val;
+
+ val = simple_strtol(buf, &endp, 10);
+ if (val < min || val > max)
+ return -EINVAL;
+
+ mutex_lock(&callback_mutex);
+ *cs_int = val;
+ mutex_unlock(&callback_mutex);
+ return 0;
+}
+
/*
* Frequency meter - How fast is some event occurring?
*
@@ -1217,6 +1238,8 @@ typedef enum {
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
FILE_TASKLIST,
+ FILE_THROTTLE_DIRTY_RATIO,
+ FILE_BACKGROUND_DIRTY_RATIO,
} cpuset_filetype_t;
static ssize_t cpuset_common_file_write(struct file *file,
@@ -1287,6 +1310,12 @@ static ssize_t cpuset_common_file_write(
case FILE_TASKLIST:
retval = attach_task(cs, buffer, &pathbuf);
break;
+ case FILE_BACKGROUND_DIRTY_RATIO:
+ retval = update_int(&cs->background_dirty_ratio, buffer, -1, 100);
+ break;
+ case FILE_THROTTLE_DIRTY_RATIO:
+ retval = update_int(&cs->throttle_dirty_ratio, buffer, -1, 100);
+ break;
default:
retval = -EINVAL;
goto out2;
@@ -1399,6 +1428,12 @@ static ssize_t cpuset_common_file_read(s
case FILE_SPREAD_SLAB:
*s++ = is_spread_slab(cs) ? '1' : '0';
break;
+ case FILE_BACKGROUND_DIRTY_RATIO:
+ s += sprintf(s, "%d", cs->background_dirty_ratio);
+ break;
+ case FILE_THROTTLE_DIRTY_RATIO:
+ s += sprintf(s, "%d", cs->throttle_dirty_ratio);
+ break;
default:
retval = -EINVAL;
goto out;
@@ -1772,6 +1807,16 @@ static struct cftype cft_spread_slab = {
.private = FILE_SPREAD_SLAB,
};
+static struct cftype cft_background_dirty_ratio = {
+ .name = "background_dirty_ratio",
+ .private = FILE_BACKGROUND_DIRTY_RATIO,
+};
+
+static struct cftype cft_throttle_dirty_ratio = {
+ .name = "throttle_dirty_ratio",
+ .private = FILE_THROTTLE_DIRTY_RATIO,
+};
+
static int cpuset_populate_dir(struct dentry *cs_dentry)
{
int err;
@@ -1794,6 +1839,10 @@ static int cpuset_populate_dir(struct de
return err;
if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_background_dirty_ratio)) < 0)
+ return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_throttle_dirty_ratio)) < 0)
+ return err;
if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
return err;
return 0;
@@ -1833,6 +1882,8 @@ static long cpuset_create(struct cpuset
INIT_LIST_HEAD(&cs->children);
cs->mems_generation = cpuset_mems_generation++;
fmeter_init(&cs->fmeter);
+ cs->background_dirty_ratio = parent->background_dirty_ratio;
+ cs->throttle_dirty_ratio = parent->throttle_dirty_ratio;
cs->parent = parent;
@@ -2451,8 +2502,30 @@ int cpuset_mem_spread_node(void)
}
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
-#if MAX_NUMNODES > BITS_PER_LONG
+/*
+ * Determine the dirty ratios for the currently active cpuset
+ */
+void cpuset_get_current_ratios(int *background_ratio, int *throttle_ratio)
+{
+ int background = -1;
+ int throttle = -1;
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+ background = tsk->cpuset->background_dirty_ratio;
+ throttle = tsk->cpuset->throttle_dirty_ratio;
+ task_unlock(tsk);
+ if (background == -1)
+ background = dirty_background_ratio;
+ if (throttle == -1)
+ throttle = vm_dirty_ratio;
+
+ *background_ratio = background;
+ *throttle_ratio = throttle;
+}
+
+#if MAX_NUMNODES > BITS_PER_LONG
/*
* Special functions for NUMA systems with a large number of nodes.
* The nodemask is pointed to from the address space structures.
diff -uprN -X linux-2.6.21-rc4-mm1/Documentation/dontdiff 6/mm/page-writeback.c 7/mm/page-writeback.c
--- 6/mm/page-writeback.c 2007-04-23 15:14:25.000000000 -0700
+++ 7/mm/page-writeback.c 2007-04-23 15:16:54.000000000 -0700
@@ -216,6 +216,7 @@ get_dirty_limits(struct dirty_limits *dl
}
dirtyable_memory -= highmem_dirtyable_memory(nodes,
dirtyable_memory);
+ cpuset_get_current_ratios(&background_ratio, &dirty_ratio);
} else
#endif
{
@@ -226,19 +227,19 @@ get_dirty_limits(struct dirty_limits *dl
dirtyable_memory = determine_dirtyable_memory();
nr_mapped = global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES);
+ dirty_ratio = vm_dirty_ratio;
+ background_ratio = dirty_background_ratio;
}
unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES)) * 100) /
vm_total_pages;
- dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
dirty_ratio = unmapped_ratio / 2;
if (dirty_ratio < 5)
dirty_ratio = 5;
- background_ratio = dirty_background_ratio;
if (background_ratio >= dirty_ratio)
background_ratio = dirty_ratio / 2;
--