* [PATCH 1/6] mm: scalable bdi statistics counters.
2007-04-03 14:40 [PATCH 0/6] per device dirty throttling -V2 Peter Zijlstra
@ 2007-04-03 14:40 ` Peter Zijlstra
2007-04-04 9:20 ` Miklos Szeredi
2007-04-03 14:40 ` [PATCH 2/6] mm: count dirty pages per BDI Peter Zijlstra
` (4 subsequent siblings)
5 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-03 14:40 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra
[-- Attachment #1: bdi_stat.patch --]
[-- Type: text/plain, Size: 9439 bytes --]
Provide scalable per backing_dev_info statistics counters modeled on the ZVC
code.
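
As an illustration only (not part of the patch), a driver owning a
backing_dev_info would use the new API roughly as follows; note that
BDI_DIRTY is only introduced in patch 2/6, so the item name is an
assumption here:

	/* illustrative sketch; BDI_DIRTY is added by patch 2/6 */
	static void example_setup(struct backing_dev_info *bdi)
	{
		bdi_stat_init(bdi);	/* zero counters, set per-CPU thresholds */
	}

	static void example_account_dirty(struct backing_dev_info *bdi)
	{
		inc_bdi_stat(bdi, BDI_DIRTY);	/* IRQ-safe increment */
		/* bdi_stat() reads the approximate value, clipped at 0 */
		printk(KERN_DEBUG "bdi dirty ~ %lu\n",
				bdi_stat(bdi, BDI_DIRTY));
	}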
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
block/ll_rw_blk.c | 1
drivers/block/rd.c | 2
drivers/char/mem.c | 2
fs/char_dev.c | 1
fs/fuse/inode.c | 1
fs/nfs/client.c | 1
include/linux/backing-dev.h | 98 +++++++++++++++++++++++++++++++++++++++++
mm/backing-dev.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
8 files changed, 209 insertions(+)
Index: linux-2.6/block/ll_rw_blk.c
===================================================================
--- linux-2.6.orig/block/ll_rw_blk.c
+++ linux-2.6/block/ll_rw_blk.c
@@ -208,6 +208,7 @@ void blk_queue_make_request(request_queu
blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
q->make_request_fn = mfn;
+ bdi_stat_init(&q->backing_dev_info);
blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
blk_queue_hardsect_size(q, 512);
blk_queue_dma_alignment(q, 511);
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -8,6 +8,7 @@
#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H
+#include <linux/spinlock.h>
#include <asm/atomic.h>
struct page;
@@ -22,6 +23,17 @@ enum bdi_state {
BDI_unused, /* Available bits start here */
};
+enum bdi_stat_item {
+ NR_BDI_STAT_ITEMS
+};
+
+#ifdef CONFIG_SMP
+struct bdi_per_cpu_data {
+ s8 stat_threshold;
+ s8 bdi_stat_diff[NR_BDI_STAT_ITEMS];
+} ____cacheline_aligned_in_smp;
+#endif
+
typedef int (congested_fn)(void *, int);
struct backing_dev_info {
@@ -34,8 +46,94 @@ struct backing_dev_info {
void *congested_data; /* Pointer to aux data for congested func */
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
+
+ atomic_long_t bdi_stats[NR_BDI_STAT_ITEMS];
+#ifdef CONFIG_SMP
+ struct bdi_per_cpu_data pcd[NR_CPUS];
+#endif
};
+extern atomic_long_t bdi_stats[NR_BDI_STAT_ITEMS];
+
+static inline void bdi_stat_add(long x, struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ atomic_long_add(x, &bdi->bdi_stats[item]);
+ atomic_long_add(x, &bdi_stats[item]);
+}
+
+static inline unsigned long __global_bdi_stat(enum bdi_stat_item item)
+{
+ return atomic_long_read(&bdi_stats[item]);
+}
+
+static inline unsigned long __bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ return atomic_long_read(&bdi->bdi_stats[item]);
+}
+
+/*
+ * The per-CPU deltas can push the counter slightly below 0, so this
+ * cannot just return the unsigned long; read the signed value and clip
+ * negative results to 0.
+ */
+static inline unsigned long global_bdi_stat(enum bdi_stat_item item)
+{
+ long x = atomic_long_read(&bdi_stats[item]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
+static inline unsigned long bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ long x = atomic_long_read(&bdi->bdi_stats[item]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
+#ifdef CONFIG_SMP
+void __mod_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item, int delta);
+void __inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
+void __dec_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
+
+void mod_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item, int delta);
+void inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
+void dec_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
+
+#else /* CONFIG_SMP */
+
+static inline void __mod_bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item, int delta)
+{
+ bdi_stat_add(delta, bdi, item);
+}
+
+static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ atomic_long_inc(&bdi->bdi_stats[item]);
+ atomic_long_inc(&bdi_stats[item]);
+}
+
+static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ atomic_long_dec(&bdi->bdi_stats[item]);
+ atomic_long_dec(&bdi_stats[item]);
+}
+
+#define mod_bdi_stat __mod_bdi_stat
+#define inc_bdi_stat __inc_bdi_stat
+#define dec_bdi_stat __dec_bdi_stat
+#endif
+
+void bdi_stat_init(struct backing_dev_info *bdi);
/*
* Flags in backing_dev_info::capability
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c
+++ linux-2.6/mm/backing-dev.c
@@ -83,3 +83,106 @@ void congestion_end(int rw)
wake_up(wqh);
}
EXPORT_SYMBOL(congestion_end);
+
+atomic_long_t bdi_stats[NR_BDI_STAT_ITEMS];
+EXPORT_SYMBOL(bdi_stats);
+
+void bdi_stat_init(struct backing_dev_info *bdi)
+{
+ int i;
+
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+ atomic_long_set(&bdi->bdi_stats[i], 0);
+
+#ifdef CONFIG_SMP
+ for (i = 0; i < NR_CPUS; i++) {
+ int j;
+ for (j = 0; j < NR_BDI_STAT_ITEMS; j++)
+ bdi->pcd[i].bdi_stat_diff[j] = 0;
+ bdi->pcd[i].stat_threshold = 8 * ilog2(num_online_cpus());
+ }
+#endif
+}
+EXPORT_SYMBOL(bdi_stat_init);
+
+#ifdef CONFIG_SMP
+void __mod_bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item, int delta)
+{
+ struct bdi_per_cpu_data *pcd = &bdi->pcd[smp_processor_id()];
+ s8 *p = pcd->bdi_stat_diff + item;
+ long x;
+
+ x = delta + *p;
+
+ if (unlikely(x > pcd->stat_threshold || x < -pcd->stat_threshold)) {
+ bdi_stat_add(x, bdi, item);
+ x = 0;
+ }
+ *p = x;
+}
+EXPORT_SYMBOL(__mod_bdi_stat);
+
+void mod_bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item, int delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_bdi_stat(bdi, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_bdi_stat);
+
+void __inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item)
+{
+ struct bdi_per_cpu_data *pcd = &bdi->pcd[smp_processor_id()];
+ s8 *p = pcd->bdi_stat_diff + item;
+
+ (*p)++;
+
+ if (unlikely(*p > pcd->stat_threshold)) {
+ int overstep = pcd->stat_threshold / 2;
+
+ bdi_stat_add(*p + overstep, bdi, item);
+ *p = -overstep;
+ }
+}
+EXPORT_SYMBOL(__inc_bdi_stat);
+
+void inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __inc_bdi_stat(bdi, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_bdi_stat);
+
+void __dec_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item)
+{
+ struct bdi_per_cpu_data *pcd = &bdi->pcd[smp_processor_id()];
+ s8 *p = pcd->bdi_stat_diff + item;
+
+ (*p)--;
+
+ if (unlikely(*p < -pcd->stat_threshold)) {
+ int overstep = pcd->stat_threshold / 2;
+
+ bdi_stat_add(*p - overstep, bdi, item);
+ *p = overstep;
+ }
+}
+EXPORT_SYMBOL(__dec_bdi_stat);
+
+void dec_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __dec_bdi_stat(bdi, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dec_bdi_stat);
+#endif
Index: linux-2.6/drivers/block/rd.c
===================================================================
--- linux-2.6.orig/drivers/block/rd.c
+++ linux-2.6/drivers/block/rd.c
@@ -421,6 +421,8 @@ static int __init rd_init(void)
int i;
int err = -ENOMEM;
+ bdi_stat_init(&rd_file_backing_dev_info);
+
if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 ||
(rd_blocksize & (rd_blocksize-1))) {
printk("RAMDISK: wrong blocksize %d, reverting to defaults\n",
Index: linux-2.6/drivers/char/mem.c
===================================================================
--- linux-2.6.orig/drivers/char/mem.c
+++ linux-2.6/drivers/char/mem.c
@@ -987,6 +987,8 @@ static int __init chr_dev_init(void)
MKDEV(MEM_MAJOR, devlist[i].minor),
devlist[i].name);
+ bdi_stat_init(&zero_bdi);
+
return 0;
}
Index: linux-2.6/fs/char_dev.c
===================================================================
--- linux-2.6.orig/fs/char_dev.c
+++ linux-2.6/fs/char_dev.c
@@ -548,6 +548,7 @@ static struct kobject *base_probe(dev_t
void __init chrdev_init(void)
{
cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
+ bdi_stat_init(&directly_mappable_cdev_bdi);
}
Index: linux-2.6/fs/fuse/inode.c
===================================================================
--- linux-2.6.orig/fs/fuse/inode.c
+++ linux-2.6/fs/fuse/inode.c
@@ -413,6 +413,7 @@ static struct fuse_conn *new_conn(void)
atomic_set(&fc->num_waiting, 0);
fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
fc->bdi.unplug_io_fn = default_unplug_io_fn;
+ bdi_stat_init(&fc->bdi);
fc->reqctr = 0;
fc->blocked = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
Index: linux-2.6/fs/nfs/client.c
===================================================================
--- linux-2.6.orig/fs/nfs/client.c
+++ linux-2.6/fs/nfs/client.c
@@ -661,6 +661,7 @@ static void nfs_server_set_fsinfo(struct
server->backing_dev_info.ra_pages0 = min_t(unsigned, server->rpages,
VM_MIN_READAHEAD >> (PAGE_CACHE_SHIFT - 10));
server->backing_dev_info.ra_thrash_bytes = server->rsize * NFS_MAX_READAHEAD;
+ bdi_stat_init(&server->backing_dev_info);
if (server->wsize > max_rpc_payload)
server->wsize = max_rpc_payload;
--
* Re: [PATCH 1/6] mm: scalable bdi statistics counters.
2007-04-03 14:40 ` [PATCH 1/6] mm: scalable bdi statistics counters Peter Zijlstra
@ 2007-04-04 9:20 ` Miklos Szeredi
2007-04-04 9:25 ` Peter Zijlstra
0 siblings, 1 reply; 18+ messages in thread
From: Miklos Szeredi @ 2007-04-04 9:20 UTC (permalink / raw)
To: a.p.zijlstra; +Cc: linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
> Provide scalable per backing_dev_info statistics counters modeled on the ZVC
> code.
Why do we need global_bdi_stat()? It should give approximately the
same numbers as global_page_state(), no?
Thanks,
Miklos
* Re: [PATCH 1/6] mm: scalable bdi statistics counters.
2007-04-04 9:20 ` Miklos Szeredi
@ 2007-04-04 9:25 ` Peter Zijlstra
0 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-04 9:25 UTC (permalink / raw)
To: Miklos Szeredi
Cc: linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
On Wed, 2007-04-04 at 11:20 +0200, Miklos Szeredi wrote:
> > Provide scalable per backing_dev_info statistics counters modeled on the ZVC
> > code.
>
> Why do we need global_bdi_stat()? It should give approximately the
> same numbers as global_page_state(), no?
For those counters that are shared, yes. However I find it not obvious
that all BDI counters will always be mirrored in the page stats, and I
actually use it in 6/6, which introduces counters that are not mirrored
in the page stats.
* [PATCH 2/6] mm: count dirty pages per BDI
2007-04-03 14:40 [PATCH 0/6] per device dirty throttling -V2 Peter Zijlstra
2007-04-03 14:40 ` [PATCH 1/6] mm: scalable bdi statistics counters Peter Zijlstra
@ 2007-04-03 14:40 ` Peter Zijlstra
2007-04-03 14:40 ` [PATCH 3/6] mm: count writeback " Peter Zijlstra
` (3 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-03 14:40 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra
[-- Attachment #1: bdi_stat_dirty.patch --]
[-- Type: text/plain, Size: 2581 bytes --]
Count per BDI dirty pages.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
fs/buffer.c | 1 +
include/linux/backing-dev.h | 1 +
mm/page-writeback.c | 2 ++
mm/truncate.c | 1 +
4 files changed, 5 insertions(+)
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -740,6 +740,7 @@ int __set_page_dirty_buffers(struct page
if (page->mapping) { /* Race with truncate? */
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
task_io_account_write(PAGE_CACHE_SIZE);
}
radix_tree_tag_set(&mapping->page_tree,
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -828,6 +828,7 @@ int __set_page_dirty_nobuffers(struct pa
BUG_ON(mapping2 != mapping);
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
task_io_account_write(PAGE_CACHE_SIZE);
}
radix_tree_tag_set(&mapping->page_tree,
@@ -961,6 +962,7 @@ int clear_page_dirty_for_io(struct page
*/
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
return 1;
}
return 0;
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c
+++ linux-2.6/mm/truncate.c
@@ -71,6 +71,7 @@ void cancel_dirty_page(struct page *page
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
if (account_size)
task_io_account_cancelled_write(account_size);
}
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -24,6 +24,7 @@ enum bdi_state {
};
enum bdi_stat_item {
+ BDI_DIRTY,
NR_BDI_STAT_ITEMS
};
--
* [PATCH 3/6] mm: count writeback pages per BDI
2007-04-03 14:40 [PATCH 0/6] per device dirty throttling -V2 Peter Zijlstra
2007-04-03 14:40 ` [PATCH 1/6] mm: scalable bdi statistics counters Peter Zijlstra
2007-04-03 14:40 ` [PATCH 2/6] mm: count dirty pages per BDI Peter Zijlstra
@ 2007-04-03 14:40 ` Peter Zijlstra
2007-04-03 14:40 ` [PATCH 4/6] mm: count unstable " Peter Zijlstra
` (2 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-03 14:40 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra
[-- Attachment #1: bdi_stat_writeback.patch --]
[-- Type: text/plain, Size: 1857 bytes --]
Count per BDI writeback pages.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/backing-dev.h | 1 +
mm/page-writeback.c | 8 ++++++--
2 files changed, 7 insertions(+), 2 deletions(-)
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -981,10 +981,12 @@ int test_clear_page_writeback(struct pag
write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestClearPageWriteback(page);
- if (ret)
+ if (ret) {
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
+ __dec_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+ }
write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
@@ -1004,10 +1006,12 @@ int test_set_page_writeback(struct page
write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
- if (!ret)
+ if (!ret) {
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
+ __inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+ }
if (!PageDirty(page))
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -25,6 +25,7 @@ enum bdi_state {
enum bdi_stat_item {
BDI_DIRTY,
+ BDI_WRITEBACK,
NR_BDI_STAT_ITEMS
};
--
* [PATCH 4/6] mm: count unstable pages per BDI
2007-04-03 14:40 [PATCH 0/6] per device dirty throttling -V2 Peter Zijlstra
` (2 preceding siblings ...)
2007-04-03 14:40 ` [PATCH 3/6] mm: count writeback " Peter Zijlstra
@ 2007-04-03 14:40 ` Peter Zijlstra
2007-04-03 14:40 ` [PATCH 5/6] mm: expose BDI statistics in sysfs Peter Zijlstra
2007-04-03 14:40 ` [PATCH 6/6] mm: per device dirty threshold Peter Zijlstra
5 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-03 14:40 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra
[-- Attachment #1: bdi_stat_unstable.patch --]
[-- Type: text/plain, Size: 2245 bytes --]
Count per BDI unstable pages.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
fs/nfs/write.c | 4 ++++
include/linux/backing-dev.h | 1 +
2 files changed, 5 insertions(+)
Index: linux-2.6/fs/nfs/write.c
===================================================================
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -451,6 +451,7 @@ nfs_mark_request_commit(struct nfs_page
nfsi->ncommit++;
spin_unlock(&nfsi->req_lock);
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
#endif
@@ -511,6 +512,7 @@ static void nfs_cancel_commit_list(struc
while(!list_empty(head)) {
req = nfs_list_entry(head->next);
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
nfs_list_remove_request(req);
nfs_inode_remove_request(req);
nfs_unlock_request(req);
@@ -1236,6 +1238,7 @@ nfs_commit_list(struct inode *inode, str
nfs_list_remove_request(req);
nfs_mark_request_commit(req);
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
nfs_clear_page_writeback(req);
}
return -ENOMEM;
@@ -1260,6 +1263,7 @@ static void nfs_commit_done(struct rpc_t
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
dprintk("NFS: commit (%s/%Ld %d@%Ld)",
req->wb_context->dentry->d_inode->i_sb->s_id,
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -26,6 +26,7 @@ enum bdi_state {
enum bdi_stat_item {
BDI_DIRTY,
BDI_WRITEBACK,
+ BDI_UNSTABLE,
NR_BDI_STAT_ITEMS
};
--
* [PATCH 5/6] mm: expose BDI statistics in sysfs.
2007-04-03 14:40 [PATCH 0/6] per device dirty throttling -V2 Peter Zijlstra
` (3 preceding siblings ...)
2007-04-03 14:40 ` [PATCH 4/6] mm: count unstable " Peter Zijlstra
@ 2007-04-03 14:40 ` Peter Zijlstra
2007-04-03 14:40 ` [PATCH 6/6] mm: per device dirty threshold Peter Zijlstra
5 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-03 14:40 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra
[-- Attachment #1: bdi_stat_sysfs.patch --]
[-- Type: text/plain, Size: 2407 bytes --]
Expose the per BDI stats in /sys/block/<dev>/queue/*
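
For example (device name illustrative), reading
/sys/block/sda/queue/dirty_pages would then print the clipped per-BDI
dirty page count; writeback_pages and unstable_pages behave the same
way.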
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
block/ll_rw_blk.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++
mm/page-writeback.c | 2 -
2 files changed, 82 insertions(+), 1 deletion(-)
Index: linux-2.6/block/ll_rw_blk.c
===================================================================
--- linux-2.6.orig/block/ll_rw_blk.c
+++ linux-2.6/block/ll_rw_blk.c
@@ -3975,6 +3975,20 @@ static ssize_t queue_max_hw_sectors_show
return queue_var_show(max_hw_sectors_kb, (page));
}
+static ssize_t queue_nr_dirty_show(struct request_queue *q, char *page)
+{
+ return sprintf(page, "%lu\n", bdi_stat(&q->backing_dev_info, BDI_DIRTY));
+}
+
+static ssize_t queue_nr_writeback_show(struct request_queue *q, char *page)
+{
+ return sprintf(page, "%lu\n", bdi_stat(&q->backing_dev_info, BDI_WRITEBACK));
+}
+
+static ssize_t queue_nr_unstable_show(struct request_queue *q, char *page)
+{
+ return sprintf(page, "%lu\n", bdi_stat(&q->backing_dev_info, BDI_UNSTABLE));
+}
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -4005,6 +4019,21 @@ static struct queue_sysfs_entry queue_ma
.show = queue_max_hw_sectors_show,
};
+static struct queue_sysfs_entry queue_dirty_entry = {
+ .attr = {.name = "dirty_pages", .mode = S_IRUGO },
+ .show = queue_nr_dirty_show,
+};
+
+static struct queue_sysfs_entry queue_writeback_entry = {
+ .attr = {.name = "writeback_pages", .mode = S_IRUGO },
+ .show = queue_nr_writeback_show,
+};
+
+static struct queue_sysfs_entry queue_unstable_entry = {
+ .attr = {.name = "unstable_pages", .mode = S_IRUGO },
+ .show = queue_nr_unstable_show,
+};
+
static struct queue_sysfs_entry queue_iosched_entry = {
.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
.show = elv_iosched_show,
@@ -4017,6 +4046,9 @@ static struct attribute *default_attrs[]
&queue_initial_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
+ &queue_dirty_entry.attr,
+ &queue_writeback_entry.attr,
+ &queue_unstable_entry.attr,
&queue_iosched_entry.attr,
NULL,
};
--
* [PATCH 6/6] mm: per device dirty threshold
2007-04-03 14:40 [PATCH 0/6] per device dirty throttling -V2 Peter Zijlstra
` (4 preceding siblings ...)
2007-04-03 14:40 ` [PATCH 5/6] mm: expose BDI statistics in sysfs Peter Zijlstra
@ 2007-04-03 14:40 ` Peter Zijlstra
2007-04-04 9:34 ` Miklos Szeredi
5 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-03 14:40 UTC (permalink / raw)
To: linux-mm, linux-kernel
Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra
[-- Attachment #1: writeback-balance-per-backing_dev.patch --]
[-- Type: text/plain, Size: 11075 bytes --]
Scale writeback cache per backing device, proportional to its writeout speed.
akpm sayeth:
> Which problem are we trying to solve here? afaik our two uppermost
> problems are:
>
> a) Heavy write to queue A causes light writer to queue B to block for a long
> time in balance_dirty_pages(). Even if the devices have the same speed.
This one; especially when the speeds differ: the 'my usb stick makes my
computer suck' problem. But even at similar speeds, the separation of
devices should avoid blocking dev B when dev A is being throttled.
The writeout speed is measured dynamically, so when a device hasn't had
anything to write out for a while, its writeback cache size goes to 0.
Conversely, when starting up it will initially act almost
synchronously, but will quickly build up a 'fair' share of the writeback
cache.
> b) heavy write to device A causes light write to device A to block for a
> long time in balance_dirty_pages(), occasionally. Harder to fix.
This will indeed take more work. I've thought about it, but one
quickly ends up with per-task state.
How it all works:
We pick a 2^n value based on the total vm size to act as a period -
vm_cycle_shift. This period measures 'time' in writeout events.
Each writeout increases time and adds to a per bdi counter. This counter is
halved when a period expires. So per bdi speed is:
0.5 * (previous cycle speed) + this cycle's events.
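A worked example with invented numbers: if a period is 1024 writeout
events, a bdi credited with 300 events at the end of the previous period
keeps 150 after the halving; 100 writeouts in the current period bring
it to 250 events, so its share of the dirty limit starts near 250/512
and decays towards 250/1024 as the global writeout total advances
through the period (this matches the scale/div pair computed by
get_writeout_scale() below).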
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/backing-dev.h | 8 ++
mm/backing-dev.c | 3
mm/page-writeback.c | 166 +++++++++++++++++++++++++++++++++++---------
3 files changed, 145 insertions(+), 32 deletions(-)
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -27,6 +27,8 @@ enum bdi_stat_item {
BDI_DIRTY,
BDI_WRITEBACK,
BDI_UNSTABLE,
+ BDI_WRITEOUT,
+ BDI_WRITEOUT_TOTAL,
NR_BDI_STAT_ITEMS
};
@@ -50,6 +52,12 @@ struct backing_dev_info {
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
+ /*
+ * data used for scaling the writeback cache
+ */
+ spinlock_t lock; /* protect the cycle count */
+ unsigned long cycles; /* writeout cycles */
+
atomic_long_t bdi_stats[NR_BDI_STAT_ITEMS];
#ifdef CONFIG_SMP
struct bdi_per_cpu_data pcd[NR_CPUS];
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -49,8 +49,6 @@
*/
static long ratelimit_pages = 32;
-static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
-
/*
* When balance_dirty_pages decides that the caller needs to perform some
* non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +101,87 @@ EXPORT_SYMBOL(laptop_mode);
static void background_writeout(unsigned long _min_pages);
/*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by tracking a floating average per BDI and a global floating
+ * average. We optimize away the '/= 2' for the global average by noting that:
+ *
+ * if (++i > thresh) i /= 2:
+ *
+ * Can be approximated by:
+ *
+ * thresh/2 + (++i % thresh/2)
+ *
+ * Furthermore, when we choose thresh to be 2^n it can be written in terms of
+ * binary operations and wraparound artifacts disappear.
+ *
+ * Also note that this yields a natural counter of the elapsed periods:
+ *
+ * i / thresh
+ *
+ * Its monotonically increasing property can be used to mitigate the wrap-
+ * around issue.
+ */
+static int vm_cycle_shift __read_mostly;
+
+/*
+ * Sync up the per BDI average to the global cycle.
+ */
+static void bdi_writeout_norm(struct backing_dev_info *bdi)
+{
+ int bits = vm_cycle_shift;
+ unsigned long cycle = 1UL << bits;
+ unsigned long mask = ~(cycle - 1);
+ unsigned long global_cycle =
+ (__global_bdi_stat(BDI_WRITEOUT_TOTAL) << 1) & mask;
+ unsigned long flags;
+
+ if ((bdi->cycles & mask) == global_cycle)
+ return;
+
+ spin_lock_irqsave(&bdi->lock, flags);
+ while ((bdi->cycles & mask) != global_cycle) {
+ unsigned long val = __bdi_stat(bdi, BDI_WRITEOUT);
+ unsigned long half = (val + 1) >> 1;
+
+ if (!val)
+ break;
+
+ mod_bdi_stat(bdi, BDI_WRITEOUT, -half);
+ bdi->cycles += cycle;
+ }
+ bdi->cycles = global_cycle;
+ spin_unlock_irqrestore(&bdi->lock, flags);
+}
+
+static void bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+ if (!bdi_cap_writeback_dirty(bdi))
+ return;
+
+ bdi_writeout_norm(bdi);
+
+ __inc_bdi_stat(bdi, BDI_WRITEOUT);
+ __inc_bdi_stat(bdi, BDI_WRITEOUT_TOTAL);
+}
+
+void get_writeout_scale(struct backing_dev_info *bdi, int *scale, int *div)
+{
+ int bits = vm_cycle_shift - 1;
+ unsigned long total = __global_bdi_stat(BDI_WRITEOUT_TOTAL);
+ unsigned long cycle = 1UL << bits;
+ unsigned long mask = cycle - 1;
+
+ if (bdi_cap_writeback_dirty(bdi)) {
+ bdi_writeout_norm(bdi);
+ *scale = __bdi_stat(bdi, BDI_WRITEOUT);
+ } else
+ *scale = 0;
+
+ *div = cycle + (total & mask);
+}
+
+/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
*
@@ -158,8 +237,8 @@ static unsigned long determine_dirtyable
}
static void
-get_dirty_limits(long *pbackground, long *pdirty,
- struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+ struct backing_dev_info *bdi)
{
int background_ratio; /* Percentages */
int dirty_ratio;
@@ -193,6 +272,31 @@ get_dirty_limits(long *pbackground, long
}
*pbackground = background;
*pdirty = dirty;
+
+ if (bdi) {
+ long long tmp = dirty;
+ long reserve;
+ int scale, div;
+
+ get_writeout_scale(bdi, &scale, &div);
+
+ tmp *= scale;
+ do_div(tmp, div);
+
+ reserve = dirty -
+ (global_bdi_stat(BDI_DIRTY) +
+ global_bdi_stat(BDI_WRITEBACK) +
+ global_bdi_stat(BDI_UNSTABLE));
+
+ if (reserve < 0)
+ reserve = 0;
+
+ reserve += bdi_stat(bdi, BDI_DIRTY) +
+ bdi_stat(bdi, BDI_WRITEBACK) +
+ bdi_stat(bdi, BDI_UNSTABLE);
+
+ *pbdi_dirty = min((long)tmp, reserve);
+ }
}
/*
@@ -204,9 +308,10 @@ get_dirty_limits(long *pbackground, long
*/
static void balance_dirty_pages(struct address_space *mapping)
{
- long nr_reclaimable;
+ long bdi_nr_reclaimable;
long background_thresh;
long dirty_thresh;
+ long bdi_thresh;
unsigned long pages_written = 0;
unsigned long write_chunk = sync_writeback_pages();
@@ -221,32 +326,31 @@ static void balance_dirty_pages(struct a
.range_cyclic = 1,
};
- get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
- dirty_thresh)
+ get_dirty_limits(&background_thresh, &dirty_thresh,
+ &bdi_thresh, bdi);
+ bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+ bdi_stat(bdi, BDI_UNSTABLE);
+ if (bdi_nr_reclaimable + bdi_stat(bdi, BDI_WRITEBACK) <=
+ bdi_thresh)
break;
- if (!dirty_exceeded)
- dirty_exceeded = 1;
-
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
* filesystems (i.e. NFS) in which data may have been
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
- if (nr_reclaimable) {
+ if (bdi_nr_reclaimable) {
writeback_inodes(&wbc);
- get_dirty_limits(&background_thresh,
- &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable +
- global_page_state(NR_WRITEBACK)
- <= dirty_thresh)
- break;
+
+ get_dirty_limits(&background_thresh, &dirty_thresh,
+ &bdi_thresh, bdi);
+ bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+ bdi_stat(bdi, BDI_UNSTABLE);
+ if (bdi_nr_reclaimable + bdi_stat(bdi, BDI_WRITEBACK) <=
+ bdi_thresh)
+ break;
+
pages_written += write_chunk - wbc.nr_to_write;
if (pages_written >= write_chunk)
break; /* We've done our duty */
@@ -254,10 +358,6 @@ static void balance_dirty_pages(struct a
congestion_wait(WRITE, HZ/10);
}
- if (nr_reclaimable + global_page_state(NR_WRITEBACK)
- <= dirty_thresh && dirty_exceeded)
- dirty_exceeded = 0;
-
if (writeback_in_progress(bdi))
return; /* pdflush is already working this queue */
@@ -270,7 +370,9 @@ static void balance_dirty_pages(struct a
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && (nr_reclaimable > background_thresh)))
+ (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+ + global_page_state(NR_UNSTABLE_NFS)
+ > background_thresh)))
pdflush_operation(background_writeout, 0);
}
@@ -305,9 +407,7 @@ void balance_dirty_pages_ratelimited_nr(
unsigned long ratelimit;
unsigned long *p;
- ratelimit = ratelimit_pages;
- if (dirty_exceeded)
- ratelimit = 8;
+ ratelimit = 8;
/*
* Check the rate limiting. Also, we do not want to throttle real-time
@@ -342,7 +442,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
}
for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
/*
* Boost the allowable dirty threshold a bit for page
@@ -377,7 +477,7 @@ static void background_writeout(unsigned
long background_thresh;
long dirty_thresh;
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
if (global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) < background_thresh
&& min_pages <= 0)
@@ -585,6 +685,7 @@ void __init page_writeback_init(void)
mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
+ vm_cycle_shift = 1 + ilog2(vm_total_pages);
}
/**
@@ -986,6 +1087,7 @@ int test_clear_page_writeback(struct pag
page_index(page),
PAGECACHE_TAG_WRITEBACK);
__dec_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+ bdi_writeout_inc(mapping->backing_dev_info);
}
write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c
+++ linux-2.6/mm/backing-dev.c
@@ -91,6 +91,9 @@ void bdi_stat_init(struct backing_dev_in
{
int i;
+ spin_lock_init(&bdi->lock);
+ bdi->cycles = 0;
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
atomic_long_set(&bdi->bdi_stats[i], 0);
--
* Re: [PATCH 6/6] mm: per device dirty threshold
2007-04-03 14:40 ` [PATCH 6/6] mm: per device dirty threshold Peter Zijlstra
@ 2007-04-04 9:34 ` Miklos Szeredi
2007-04-04 10:16 ` Peter Zijlstra
0 siblings, 1 reply; 18+ messages in thread
From: Miklos Szeredi @ 2007-04-04 9:34 UTC (permalink / raw)
To: a.p.zijlstra; +Cc: linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
> Scale writeback cache per backing device, proportional to its writeout speed.
>
> akpm sayeth:
> > Which problem are we trying to solve here? afaik our two uppermost
> > problems are:
> >
> > a) Heavy write to queue A causes light writer to queue B to block for a long
> > time in balance_dirty_pages(). Even if the devices have the same speed.
>
> This one; especially when the speeds differ: the 'my usb stick makes my
> computer suck' problem. But even at similar speeds, the separation of
> devices should avoid blocking dev B when dev A is being throttled.
>
> The writeout speed is measured dynamically, so when a device hasn't had
> anything to write out for a while, its writeback cache size goes to 0.
>
> Conversely, when starting up it will initially act almost
> synchronously, but will quickly build up a 'fair' share of the writeback
> cache.
I'm worried about two things:
1) If the per-bdi threshold becomes smaller than the granularity of
the per-bdi stat (due to the per-CPU counters), then things will
break. Shouldn't there be some sanity checking for the calculated
threshold?
2) The loop is sleeping in congestion_wait(WRITE), which seems wrong.
It may well be possible that none of the queues are congested, so
it will sleep the full .1 second. But by that time the queue may
have become idle and is just sitting there doing nothing. Maybe
there should be a per-bdi waitq that is woken up when the per-bdi
stats are updated.
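
A minimal sketch of the waitq idea in 2), all names hypothetical:

	/* hypothetical sketch, not a patch: a new field in backing_dev_info */
	wait_queue_head_t throttle_wq;

	/* in balance_dirty_pages(), instead of congestion_wait(): */
	wait_event_timeout(bdi->throttle_wq,
			bdi_stat(bdi, BDI_DIRTY) +
			bdi_stat(bdi, BDI_UNSTABLE) +
			bdi_stat(bdi, BDI_WRITEBACK) <= bdi_thresh,
			HZ/10);

	/* and in the paths that decrement the per-bdi stats: */
	if (waitqueue_active(&bdi->throttle_wq))
		wake_up(&bdi->throttle_wq);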
Thanks,
Miklos
* Re: [PATCH 6/6] mm: per device dirty threshold
2007-04-04 9:34 ` Miklos Szeredi
@ 2007-04-04 10:16 ` Peter Zijlstra
2007-04-04 10:29 ` Miklos Szeredi
0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-04 10:16 UTC (permalink / raw)
To: Miklos Szeredi
Cc: linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
On Wed, 2007-04-04 at 11:34 +0200, Miklos Szeredi wrote:
> > Scale writeback cache per backing device, proportional to its writeout speed.
> >
> > akpm sayeth:
> > > Which problem are we trying to solve here? afaik our two uppermost
> > > problems are:
> > >
> > > a) Heavy write to queue A causes light writer to queue B to block for a long
> > > time in balance_dirty_pages(). Even if the devices have the same speed.
> >
> > This one; especially when the speeds differ: the 'my usb stick makes my
> > computer suck' problem. But even at similar speeds, the separation of
> > devices should avoid blocking dev B when dev A is being throttled.
> >
> > The writeout speed is measured dynamically, so when a device hasn't had
> > anything to write out for a while, its writeback cache size goes to 0.
> >
> > Conversely, when starting up it will initially act almost
> > synchronously, but will quickly build up a 'fair' share of the writeback
> > cache.
>
> I'm worried about two things:
>
> 1) If the per-bdi threshold becomes smaller than the granularity of
> the per-bdi stat (due to the per-CPU counters), then things will
> break. Shouldn't there be some sanity checking for the calculated
> threshold?
I'm not sure what you're referring to.
void get_writeout_scale(struct backing_dev_info *bdi, int *scale, int *div)
{
int bits = vm_cycle_shift - 1;
unsigned long total = __global_bdi_stat(BDI_WRITEOUT_TOTAL);
unsigned long cycle = 1UL << bits;
unsigned long mask = cycle - 1;
if (bdi_cap_writeback_dirty(bdi)) {
bdi_writeout_norm(bdi);
*scale = __bdi_stat(bdi, BDI_WRITEOUT);
} else
*scale = 0;
*div = cycle + (total & mask);
}
where cycle ~ vm_total_pages
scale can be a tad off due to overstep here:
void __inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item)
{
struct bdi_per_cpu_data *pcd = &bdi->pcd[smp_processor_id()];
s8 *p = pcd->bdi_stat_diff + item;
(*p)++;
if (unlikely(*p > pcd->stat_threshold)) {
int overstep = pcd->stat_threshold / 2;
bdi_stat_add(*p + overstep, bdi, item);
*p = -overstep;
}
}
so it could be that: scale / cycle > 1
by a very small amount; however:
if (bdi) {
long long tmp = dirty;
long reserve;
int scale, div;
get_writeout_scale(bdi, &scale, &div);
tmp *= scale;
do_div(tmp, div);
reserve = dirty -
(global_bdi_stat(BDI_DIRTY) +
global_bdi_stat(BDI_WRITEBACK) +
global_bdi_stat(BDI_UNSTABLE));
if (reserve < 0)
reserve = 0;
reserve += bdi_stat(bdi, BDI_DIRTY) +
bdi_stat(bdi, BDI_WRITEBACK) +
bdi_stat(bdi, BDI_UNSTABLE);
*pbdi_dirty = min((long)tmp, reserve);
}
here we clip to 'reserve' which is the total amount of dirty threshold
not dirty by others.
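
(Illustrative numbers: with dirty = 1000 and scale/div = 1/4, tmp is
250; if all bdis together hold 900 dirty/writeback/unstable pages, of
which this bdi owns 50, then reserve = (1000 - 900) + 50 = 150, so the
per-bdi threshold is clipped to 150 rather than 250.)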
> 2) The loop is sleeping in congestion_wait(WRITE), which seems wrong.
> It may well be possible that none of the queues are congested, so
> it will sleep the full .1 second. But by that time the queue may
> have become idle and is just sitting there doing nothing. Maybe
> there should be a per-bdi waitq, that is woken up, when the per-bdi
> stats are updated.
Good point, .1 seconds is a lot of time.
I'll cook up something like that if nobody beats me to it :-)
* Re: [PATCH 6/6] mm: per device dirty threshold
2007-04-04 10:16 ` Peter Zijlstra
@ 2007-04-04 10:29 ` Miklos Szeredi
2007-04-04 11:01 ` Peter Zijlstra
0 siblings, 1 reply; 18+ messages in thread
From: Miklos Szeredi @ 2007-04-04 10:29 UTC (permalink / raw)
To: a.p.zijlstra
Cc: miklos, linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
> > I'm worried about two things:
> >
> > 1) If the per-bdi threshold becomes smaller than the granularity of
> > the per-bdi stat (due to the per-CPU counters), then things will
> > break. Shouldn't there be some sanity checking for the calculated
> > threshold?
>
> I'm not sure what you're referring to.
>
> void get_writeout_scale(struct backing_dev_info *bdi, int *scale, int *div)
> {
> int bits = vm_cycle_shift - 1;
> unsigned long total = __global_bdi_stat(BDI_WRITEOUT_TOTAL);
> unsigned long cycle = 1UL << bits;
> unsigned long mask = cycle - 1;
>
> if (bdi_cap_writeback_dirty(bdi)) {
> bdi_writeout_norm(bdi);
> *scale = __bdi_stat(bdi, BDI_WRITEOUT);
> } else
> *scale = 0;
>
> *div = cycle + (total & mask);
> }
>
> where cycle ~ vm_total_pages
> scale can be a tad off due to overstep here:
>
> void __inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item)
> {
> struct bdi_per_cpu_data *pcd = &bdi->pcd[smp_processor_id()];
> s8 *p = pcd->bdi_stat_diff + item;
>
> (*p)++;
>
> if (unlikely(*p > pcd->stat_threshold)) {
> int overstep = pcd->stat_threshold / 2;
>
> bdi_stat_add(*p + overstep, bdi, item);
> *p = -overstep;
> }
> }
>
> so it could be that: scale / cycle > 1
> by a very small amount; however:
No, I'm worried about the case when scale is too small. If the
per-bdi threshold becomes smaller than stat_threshold, then things
won't work, because dirty+writeback will never go below the threshold,
possibly resulting in the deadlock we are trying to avoid.
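
To put illustrative numbers on it: with 64 online CPUs, stat_threshold
is 8 * ilog2(64) = 48, so the folded per-bdi counter can lag reality by
up to 64 * 48 = 3072 pages per item; a per-bdi threshold below that may
never be observed as un-exceeded.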
BTW, the second argument of get_dirty_limits() doesn't seem to get
used by the caller, or does it?
> here we clip to 'reserve' which is the total amount of dirty threshold
> not dirty by others.
>
> > 2) The loop is sleeping in congestion_wait(WRITE), which seems wrong.
> > It may well be possible that none of the queues are congested, so
> > it will sleep the full .1 second. But by that time the queue may
> > have become idle and is just sitting there doing nothing. Maybe
> > there should be a per-bdi waitq that is woken up when the per-bdi
> > stats are updated.
>
> Good point, .1 seconds is a lot of time.
>
> I'll cook up something like that if nobody beats me to it :-)
I realized that it's maybe worth storing the last threshold in the
bdi as well, so that balance_dirty_pages() doesn't get woken up too
many times unnecessarily. But I don't know...
Thanks,
Miklos
* Re: [PATCH 6/6] mm: per device dirty threshold
2007-04-04 10:29 ` Miklos Szeredi
@ 2007-04-04 11:01 ` Peter Zijlstra
2007-04-04 11:12 ` Miklos Szeredi
0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-04 11:01 UTC (permalink / raw)
To: Miklos Szeredi
Cc: linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
On Wed, 2007-04-04 at 12:29 +0200, Miklos Szeredi wrote:
> > > I'm worried about two things:
> > >
> > > 1) If the per-bdi threshold becomes smaller than the granularity of
> > > the per-bdi stat (due to the per-CPU counters), then things will
> > > break. Shouldn't there be some sanity checking for the calculated
> > > threshold?
> >
> > I'm not sure what you're referring to.
> >
> > void get_writeout_scale(struct backing_dev_info *bdi, int *scale, int *div)
> > {
> > int bits = vm_cycle_shift - 1;
> > unsigned long total = __global_bdi_stat(BDI_WRITEOUT_TOTAL);
> > unsigned long cycle = 1UL << bits;
> > unsigned long mask = cycle - 1;
> >
> > if (bdi_cap_writeback_dirty(bdi)) {
> > bdi_writeout_norm(bdi);
> > *scale = __bdi_stat(bdi, BDI_WRITEOUT);
> > } else
> > *scale = 0;
> >
> > *div = cycle + (total & mask);
> > }
> >
> > where cycle ~ vm_total_pages
> > scale can be a tad off due to overstep here:
> >
> > void __inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item)
> > {
> > struct bdi_per_cpu_data *pcd = &bdi->pcd[smp_processor_id()];
> > s8 *p = pcd->bdi_stat_diff + item;
> >
> > (*p)++;
> >
> > if (unlikely(*p > pcd->stat_threshold)) {
> > int overstep = pcd->stat_threshold / 2;
> >
> > bdi_stat_add(*p + overstep, bdi, item);
> > *p = -overstep;
> > }
> > }
> >
> > so it could be that: scale / cycle > 1
> > by a very small amount; however:
>
> No, I'm worried about the case when scale is too small. If the
> per-bdi threshold becomes smaller than stat_threshold, then things
> won't work, because dirty+writeback will never go below the threshold,
> possibly resulting in the deadlock we are trying to avoid.
/me goes refresh the deadlock details..
A writes to B; A exceeds the dirty limit but writeout is blocked by B
because the dirty limit is exceeded, right?
This cannot happen when we decouple the BDI dirty thresholds, even when
a threshold is 0.
A writes to B; A exceeds A's limit and writes to B, B has a limit of 0, the
1 dirty page gets written out (we gain ratio) and life goes on.
Right?
> BTW, the second argument of get_dirty_limits() doesn't seem to get
> used by the caller, or does it?
Correct, there are currently no in-tree users left. However, I do use it
in a debug patch that shows bdi_dirty as a fraction of total_dirty. We
could remove it; I have no strong feelings on it. I thought it might
still be useful for reporting or something.
> > > 2) The loop is sleeping in congestion_wait(WRITE), which seems wrong.
> > > It may well be possible that none of the queues are congested, so
> > > it will sleep the full .1 second. But by that time the queue may
> > > have become idle and is just sitting there doing nothing. Maybe
> > > there should be a per-bdi waitq that is woken up when the per-bdi
> > > stats are updated.
> >
> > Good point, .1 seconds is a lot of time.
> >
> > I'll cook up something like that if nobody beats me to it :-)
>
> I realized, that it's maybe worth storing last the threshold in the
> bdi as well, so that balance_dirty_pages() doesn't get woken up too
> many times unnecessarilty. But I don't know...
There is already a ratelimit somewhere, but I've heard it suggested to
remove that....
* Re: [PATCH 6/6] mm: per device dirty threshold
2007-04-04 11:01 ` Peter Zijlstra
@ 2007-04-04 11:12 ` Miklos Szeredi
2007-04-04 12:05 ` Peter Zijlstra
0 siblings, 1 reply; 18+ messages in thread
From: Miklos Szeredi @ 2007-04-04 11:12 UTC (permalink / raw)
To: a.p.zijlstra; +Cc: linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
> > > so it could be that: scale / cycle > 1
> > > by a very small amount; however:
> >
> > No, I'm worried about the case when scale is too small. If the
> > per-bdi threshold becomes smaller than stat_threshold, then things
> > won't work, because dirty+writeback will never go below the threshold,
> > possibly resulting in the deadlock we are trying to avoid.
>
> /me goes refresh the deadlock details..
>
> A writes to B; A exceeds the dirty limit but writeout is blocked by B
> because the dirty limit is exceeded, right?
>
> This cannot happen when we decouple the BDI dirty thresholds, even when
> a threshold is 0.
>
> A write to B; A exceeds A's limit and writes to B, B has limit of 0, the
> 1 dirty page gets written out (we gain ratio) and life goes on.
>
> Right?
If the limit is zero, then we need the per-bdi dirty+writeback to go to
zero, otherwise balance_dirty_pages() loops. But the per-bdi
writeback counter is not necessarily updated after the writeback,
because the per-bdi per-CPU counter may not trip the update of the
per-bdi counter.
Miklos
* Re: [PATCH 6/6] mm: per device dirty threshold
2007-04-04 11:12 ` Miklos Szeredi
@ 2007-04-04 12:05 ` Peter Zijlstra
2007-04-04 12:32 ` Miklos Szeredi
0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-04 12:05 UTC (permalink / raw)
To: Miklos Szeredi
Cc: linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
On Wed, 2007-04-04 at 13:12 +0200, Miklos Szeredi wrote:
> > > > so it could be that: scale / cycle > 1
> > > > by a very small amount; however:
> > >
> > > No, I'm worried about the case when scale is too small. If the
> > > per-bdi threshold becomes smaller than stat_threshold, then things
> > > won't work, because dirty+writeback will never go below the threshold,
> > > possibly resulting in the deadlock we are trying to avoid.
> >
> > /me goes refresh the deadlock details..
> >
> > A writes to B; A exceeds the dirty limit but writeout is blocked by B
> > because the dirty limit is exceeded, right?
> >
> > This cannot happen when we decouple the BDI dirty thresholds, even when
> > a threshold is 0.
> >
> > A write to B; A exceeds A's limit and writes to B, B has limit of 0, the
> > 1 dirty page gets written out (we gain ratio) and life goes on.
> >
> > Right?
>
> If the limit is zero, then we need the per-bdi dirty+writeback to go to
> zero, otherwise balance_dirty_pages() loops. But the per-bdi
> writeback counter is not necessarily updated after the writeback,
> because the per-bdi per-CPU counter may not trip the update of the
> per-bdi counter.
Aaah, Doh, yeah, that makes sense. I must be dense.
Funny that that never triggered, I do run SMP boxen. Hmm, what to do?
Preferably you'd want to be able to 'flush' the per cpu diffs or
something like that in cases where thresh ~< NR_CPUS * stat_diff.
How about something like this:
---
include/linux/backing-dev.h | 5 ++++
mm/backing-dev.c | 51 ++++++++++++++++++++++++++++++++++++++++++++
mm/page-writeback.c | 4 +++
3 files changed, 60 insertions(+)
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -117,6 +117,8 @@ void mod_bdi_stat(struct backing_dev_inf
void inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
void dec_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
+void bdi_flush_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
+void bdi_flush_all(struct backing_dev_info *bdi, enum bdi_stat_item item);
#else /* CONFIG_SMP */
static inline void __mod_bdi_stat(struct backing_dev_info *bdi,
@@ -142,6 +144,9 @@ static inline void __dec_bdi_stat(struct
#define mod_bdi_stat __mod_bdi_stat
#define inc_bdi_stat __inc_bdi_stat
#define dec_bdi_stat __dec_bdi_stat
+
+#define bdi_flush_stat(bdi, item) do { } while (0)
+#define bdi_flush_all(bdi) do { } while (0)
#endif
void bdi_stat_init(struct backing_dev_info *bdi);
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c
+++ linux-2.6/mm/backing-dev.c
@@ -188,4 +188,55 @@ void dec_bdi_stat(struct backing_dev_inf
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_bdi_stat);
+
+void ___bdi_flush_stat(struct backing_dev_info *bdi, enum bdi_stat_item item)
+{
+ struct bdi_per_cpu_data *pcd = &bdi->pcd[smp_processor_id()];
+ s8 *p = pcd->bdi_stat_diff + item;
+
+ bdi_stat_add(*p, bdi, item);
+ *p = 0;
+}
+
+struct bdi_flush_struct {
+ struct backing_dev_info *bdi;
+ enum bdi_stat_item item;
+};
+
+void __bdi_flush_stat(struct bdi_flush_struct *flush)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ___bdi_flush_stat(flush->bdi, flush->item);
+ local_irq_restore(flags);
+}
+
+void __bdi_flush_all(struct backing_dev_info *bdi)
+{
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+ ___bdi_flush_stat(bdi, i);
+ local_irq_restore(flags);
+}
+
+void bdi_flush_stat(struct backing_dev_info *bdi, enum bdi_stat_item item)
+{
+ struct bdi_flush_struct flush = {
+ bdi,
+ item
+ };
+
+ on_each_cpu(__bdi_flush_stat, &flush, 0, 1);
+}
+EXPORT_SYMBOL(bdi_flush_stat);
+
+void bdi_flush_all(struct backing_dev_info *bdi)
+{
+ on_each_cpu(__bdi_flush_all, bdi, 0, 1);
+}
+EXPORT_SYMBOL(bdi_flush_all);
#endif
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -345,6 +345,10 @@ static void balance_dirty_pages(struct a
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
+
+ if (bdi_thresh < NR_CPUS * 8 * ilog2(NR_CPUS))
+ bdi_flush_all(bdi);
+
bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
bdi_stat(bdi, BDI_UNSTABLE);
if (bdi_nr_reclaimable + bdi_stat(bdi, BDI_WRITEBACK) <=
* Re: [PATCH 6/6] mm: per device dirty threshold
2007-04-04 12:05 ` Peter Zijlstra
@ 2007-04-04 12:32 ` Miklos Szeredi
2007-04-04 12:43 ` Peter Zijlstra
0 siblings, 1 reply; 18+ messages in thread
From: Miklos Szeredi @ 2007-04-04 12:32 UTC (permalink / raw)
To: a.p.zijlstra; +Cc: linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
> Preferably you'd want to be able to 'flush' the per cpu diffs or
> something like that in cases where thresh ~< NR_CPUS * stat_diff.
>
> How about something like this:
Yes, though maybe the underscores and EXPORT_SYMBOLs are a bit excessive.
Miklos
* Re: [PATCH 6/6] mm: per device dirty threshold
2007-04-04 12:32 ` Miklos Szeredi
@ 2007-04-04 12:43 ` Peter Zijlstra
2007-04-04 20:03 ` Peter Zijlstra
0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-04 12:43 UTC (permalink / raw)
To: Miklos Szeredi
Cc: linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
On Wed, 2007-04-04 at 14:32 +0200, Miklos Szeredi wrote:
> > Preferably you'd want to be able to 'flush' the per cpu diffs or
> > something like that in cases where thresh ~< NR_CPUS * stat_diff.
> >
> > How about something like this:
>
> Yes, maybe underscores and EXPORT_SYMBOLs are a bit excessive.
Probably. Here is one that actually compiles and handles cpu hotplug,
albeit a bit racy: which lock excludes cpu hotplug these days?
---
include/linux/backing-dev.h | 13 +++++++
mm/backing-dev.c | 79 ++++++++++++++++++++++++++++++++++++++++++++
mm/page-writeback.c | 4 ++
3 files changed, 96 insertions(+)
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -8,6 +8,7 @@
#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H
+#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>
@@ -117,6 +118,13 @@ void mod_bdi_stat(struct backing_dev_inf
void inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
void dec_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
+void bdi_flush_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
+void bdi_flush_all(struct backing_dev_info *bdi);
+
+static inline unsigned long bdi_stat_delta(void)
+{
+ return 8UL * num_online_cpus() * ilog2(num_online_cpus());
+}
#else /* CONFIG_SMP */
static inline void __mod_bdi_stat(struct backing_dev_info *bdi,
@@ -142,6 +150,11 @@ static inline void __dec_bdi_stat(struct
#define mod_bdi_stat __mod_bdi_stat
#define inc_bdi_stat __inc_bdi_stat
#define dec_bdi_stat __dec_bdi_stat
+
+#define bdi_flush_stat(bdi, item) do { } while (0)
+#define bdi_flush_all(bdi) do { } while (0)
+
+#define bdi_stat_delta() 1UL
#endif
void bdi_stat_init(struct backing_dev_info *bdi);
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c
+++ linux-2.6/mm/backing-dev.c
@@ -188,4 +188,83 @@ void dec_bdi_stat(struct backing_dev_inf
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_bdi_stat);
+
+void ___bdi_flush_stat(struct backing_dev_info *bdi, enum bdi_stat_item item,
+ int cpu)
+{
+ struct bdi_per_cpu_data *pcd = &bdi->pcd[cpu];
+ s8 *p = pcd->bdi_stat_diff + item;
+
+ bdi_stat_add(*p, bdi, item);
+ *p = 0;
+}
+
+struct bdi_flush_struct {
+ struct backing_dev_info *bdi;
+ enum bdi_stat_item item;
+};
+
+void __bdi_flush_stat(struct bdi_flush_struct *flush)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ___bdi_flush_stat(flush->bdi, flush->item, smp_processor_id());
+ local_irq_restore(flags);
+}
+
+void __bdi_flush_all(struct backing_dev_info *bdi)
+{
+ unsigned long flags;
+ int i, cpu;
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+ ___bdi_flush_stat(bdi, i, cpu);
+ local_irq_restore(flags);
+}
+
+void bdi_flush_stat(struct backing_dev_info *bdi, enum bdi_stat_item item)
+{
+ struct bdi_flush_struct flush = {
+ bdi,
+ item
+ };
+
+#ifdef CONFIG_HOTPLUG_CPU
+ cpumask_t mask;
+ int cpu;
+
+ cpus_complement(mask, cpu_online_map);
+ for_each_cpu_mask(cpu, mask) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ___bdi_flush_stat(bdi, item, cpu);
+ local_irq_restore(flags);
+ }
+#endif
+ on_each_cpu((void (*)(void *))__bdi_flush_stat, &flush, 0, 1);
+}
+
+void bdi_flush_all(struct backing_dev_info *bdi)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ cpumask_t mask;
+ int cpu;
+
+ cpus_complement(mask, cpu_online_map);
+ for_each_cpu_mask(cpu, mask) {
+ int i;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+ ___bdi_flush_stat(bdi, i, cpu);
+ local_irq_restore(flags);
+ }
+#endif
+ on_each_cpu((void (*)(void *))__bdi_flush_all, bdi, 0, 1);
+}
#endif
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -345,6 +345,10 @@ static void balance_dirty_pages(struct a
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
+
+ if (bdi_thresh < bdi_stat_delta())
+ bdi_flush_all(bdi);
+
bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
bdi_stat(bdi, BDI_UNSTABLE);
if (bdi_nr_reclaimable + bdi_stat(bdi, BDI_WRITEBACK) <=
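As an aside, the function-pointer casts in the on_each_cpu() calls
could be avoided with thin void * wrappers - an untested sketch, with
made-up *_ipi names:

/* taking void * directly avoids casting the function pointer,
 * which is undefined behaviour even where it happens to work */
static void bdi_flush_stat_ipi(void *info)
{
        __bdi_flush_stat(info);
}

static void bdi_flush_all_ipi(void *info)
{
        __bdi_flush_all(info);
}

so the calls become on_each_cpu(bdi_flush_stat_ipi, &flush, 0, 1) and
on_each_cpu(bdi_flush_all_ipi, bdi, 0, 1).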
* Re: [PATCH 6/6] mm: per device dirty threshold
2007-04-04 12:43 ` Peter Zijlstra
@ 2007-04-04 20:03 ` Peter Zijlstra
0 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2007-04-04 20:03 UTC (permalink / raw)
To: Miklos Szeredi
Cc: linux-mm, linux-kernel, akpm, neilb, dgc, tomoki.sekiyama.qu
OK, so that wasn't all that good: sending IPIs around like that on a
large SMP machine will not make me any friends. How about this instead:
---
include/linux/backing-dev.h | 13 ++++++++++++-
mm/backing-dev.c | 28 ++++++++++++++++++++++------
mm/page-writeback.c | 19 +++++++++++++++----
3 files changed, 49 insertions(+), 11 deletions(-)
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -8,6 +8,7 @@
#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H
+#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>
@@ -34,7 +35,6 @@ enum bdi_stat_item {
#ifdef CONFIG_SMP
struct bdi_per_cpu_data {
- s8 stat_threshold;
s8 bdi_stat_diff[NR_BDI_STAT_ITEMS];
} ____cacheline_aligned_in_smp;
#endif
@@ -60,6 +60,7 @@ struct backing_dev_info {
atomic_long_t bdi_stats[NR_BDI_STAT_ITEMS];
#ifdef CONFIG_SMP
+ int stat_threshold;
struct bdi_per_cpu_data pcd[NR_CPUS];
#endif
};
@@ -109,6 +110,8 @@ static inline unsigned long bdi_stat(str
}
#ifdef CONFIG_SMP
+unsigned long bdi_stat_accurate(struct backing_dev_info *bdi, enum bdi_stat_item item);
+
void __mod_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item, int delta);
void __inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
void __dec_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
@@ -117,8 +120,14 @@ void mod_bdi_stat(struct backing_dev_inf
void inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
void dec_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item);
+static inline unsigned long bdi_stat_delta(struct backing_dev_info *bdi)
+{
+ return num_online_cpus() * bdi->stat_threshold;
+}
#else /* CONFIG_SMP */
+#define bdi_stat_accurate bdi_stat
+
static inline void __mod_bdi_stat(struct backing_dev_info *bdi,
enum bdi_stat_item item, int delta)
{
@@ -142,6 +151,8 @@ static inline void __dec_bdi_stat(struct
#define mod_bdi_stat __mod_bdi_stat
#define inc_bdi_stat __inc_bdi_stat
#define dec_bdi_stat __dec_bdi_stat
+
+#define bdi_stat_delta(bdi) 1UL
#endif
void bdi_stat_init(struct backing_dev_info *bdi);
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c
+++ linux-2.6/mm/backing-dev.c
@@ -98,17 +98,33 @@ void bdi_stat_init(struct backing_dev_in
atomic_long_set(&bdi->bdi_stats[i], 0);
#ifdef CONFIG_SMP
+ bdi->stat_threshold = 8 * ilog2(num_online_cpus());
for (i = 0; i < NR_CPUS; i++) {
int j;
for (j = 0; j < NR_BDI_STAT_ITEMS; j++)
bdi->pcd[i].bdi_stat_diff[j] = 0;
- bdi->pcd[i].stat_threshold = 8 * ilog2(num_online_cpus());
}
#endif
}
EXPORT_SYMBOL(bdi_stat_init);
#ifdef CONFIG_SMP
+unsigned long bdi_stat_accurate(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ unsigned long x = __bdi_stat(bdi, item);
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct bdi_per_cpu_data *pcd = &bdi->pcd[cpu];
+ s8 *p = pcd->bdi_stat_diff + item;
+
+ x += *p;
+ }
+
+ return x;
+}
+
void __mod_bdi_stat(struct backing_dev_info *bdi,
enum bdi_stat_item item, int delta)
{
@@ -118,7 +134,7 @@ void __mod_bdi_stat(struct backing_dev_i
x = delta + *p;
- if (unlikely(x > pcd->stat_threshold || x < -pcd->stat_threshold)) {
+ if (unlikely(x > bdi->stat_threshold || x < -bdi->stat_threshold)) {
bdi_stat_add(x, bdi, item);
x = 0;
}
@@ -144,8 +160,8 @@ void __inc_bdi_stat(struct backing_dev_i
(*p)++;
- if (unlikely(*p > pcd->stat_threshold)) {
- int overstep = pcd->stat_threshold / 2;
+ if (unlikely(*p > bdi->stat_threshold)) {
+ int overstep = bdi->stat_threshold / 2;
bdi_stat_add(*p + overstep, bdi, item);
*p = -overstep;
@@ -170,8 +186,8 @@ void __dec_bdi_stat(struct backing_dev_i
(*p)--;
- if (unlikely(*p < -pcd->stat_threshold)) {
- int overstep = pcd->stat_threshold / 2;
+ if (unlikely(*p < -bdi->stat_threshold)) {
+ int overstep = bdi->stat_threshold / 2;
bdi_stat_add(*p - overstep, bdi, item);
*p = overstep;
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -341,14 +341,25 @@ static void balance_dirty_pages(struct a
* been flushed to permanent storage.
*/
if (bdi_nr_reclaimable) {
+ unsigned long bdi_nr_writeback;
writeback_inodes(&wbc);
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
- bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
- bdi_stat(bdi, BDI_UNSTABLE);
- if (bdi_nr_reclaimable + bdi_stat(bdi, BDI_WRITEBACK) <=
- bdi_thresh)
+
+ if (bdi_thresh < bdi_stat_delta(bdi)) {
+ bdi_nr_reclaimable =
+ bdi_stat_accurate(bdi, BDI_DIRTY) +
+ bdi_stat_accurate(bdi, BDI_UNSTABLE);
+ bdi_nr_writeback =
+ bdi_stat_accurate(bdi, BDI_WRITEBACK);
+ } else {
+ bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+ bdi_stat(bdi, BDI_UNSTABLE);
+ bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+ }
+
+ if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
break;
pages_written += write_chunk - wbc.nr_to_write;
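The read side could be folded into a little helper too - untested, and
bdi_stat_sum() is just a made-up name:

/*
 * Pay for the exact sum only when the threshold is within the error
 * bound of the cheap per-cpu counters; otherwise the approximate
 * value is good enough.
 */
static unsigned long bdi_stat_sum(struct backing_dev_info *bdi,
                                  enum bdi_stat_item item,
                                  unsigned long thresh)
{
        if (thresh < bdi_stat_delta(bdi))
                return bdi_stat_accurate(bdi, item);
        return bdi_stat(bdi, item);
}

which would reduce the balance_dirty_pages() hunk to three
straight-line calls.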