From: "Ken Chen" <kenchen@google.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Subject: [patch] fix periodic superblock dirty inode flushing
Date: Wed, 11 Jul 2007 21:21:19 -0700 [thread overview]
Message-ID: <b040c32a0707112121y21d08438u8ca7f138931827b0@mail.gmail.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 4898 bytes --]
The current -mm tree has a bucketful of bug fixes in the periodic writeback path.
However, we still hit a glitch where dirty pages on a given inode aren't
completely flushed to disk, and the system accumulates a large amount
of dirty pages, well past what dirty_expire_interval is designed for.
The problem is that __sync_single_inode() will move the inode to the sb->s_dirty
list even when there are more pending dirty pages on that inode. If there is
another inode with a small amount of dirty pages, we hit a case where the loop
iteration in wb_kupdate() terminates prematurely because wbc.nr_to_write > 0.
This leaves the inode that has a large amount of dirty pages behind, and it has
to wait for another dirty_writeback_interval before we flush it again. It
effectively only writes out MAX_WRITEBACK_PAGES every dirty_writeback_interval.
If the rate of dirtying is sufficiently high, the system will start to accumulate
a large amount of dirty pages.
So fix it by adding another list, sb->s_more_io, on which to park the inode while
we iterate through sb->s_io, so that each dirty inode residing on that sb has
an equal chance of flushing some amount of dirty pages.
Signed-off-by: Ken Chen <kenchen@google.com>
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6d961d1..a0cf041 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -140,25 +140,11 @@ static int write_inode(struct inode *ino
}
/*
- * Redirty an inode, but mark it as the very next-to-be-written inode on its
- * superblock's dirty-inode list.
- * We need to preserve s_dirty's reverse-time-orderedness, so we cheat by
- * setting this inode's dirtied_when to the same value as that of the inode
- * which is presently head-of-list, if present head-of-list is newer than this
- * inode. (head-of-list is the least-recently-dirtied inode: the oldest one).
+ * requeue inode for re-scanning after sb->s_io list is exhausted.
*/
-static void redirty_head(struct inode *inode)
+static void requeue_io(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
-
- if (!list_empty(&sb->s_dirty)) {
- struct inode *head_inode;
-
- head_inode = list_entry(sb->s_dirty.prev, struct inode, i_list);
- if (time_after(inode->dirtied_when, head_inode->dirtied_when))
- inode->dirtied_when = head_inode->dirtied_when;
- }
- list_move_tail(&inode->i_list, &sb->s_dirty);
+ list_move(&inode->i_list, &inode->i_sb->s_more_io);
}
/*
@@ -254,7 +240,7 @@ __sync_single_inode(struct inode *inode,
* uncongested.
*/
inode->i_state |= I_DIRTY_PAGES;
- redirty_head(inode);
+ requeue_io(inode);
} else {
/*
* Otherwise fully redirty the inode so that
@@ -314,7 +300,7 @@ __writeback_single_inode(struct inode *i
* on s_io. We'll have another go at writing back this inode
* when the s_dirty iodes get moved back onto s_io.
*/
- redirty_head(inode);
+ requeue_io(inode);
/*
* Even if we don't actually write the inode itself here,
@@ -409,14 +395,14 @@ sync_sb_inodes(struct super_block *sb, s
wbc->encountered_congestion = 1;
if (!sb_is_blkdev_sb(sb))
break; /* Skip a congested fs */
- redirty_head(inode);
+ requeue_io(inode);
continue; /* Skip a congested blockdev */
}
if (wbc->bdi && bdi != wbc->bdi) {
if (!sb_is_blkdev_sb(sb))
break; /* fs has the wrong queue */
- redirty_head(inode);
+ requeue_io(inode);
continue; /* blockdev has wrong queue */
}
@@ -426,8 +412,10 @@ sync_sb_inodes(struct super_block *sb, s
/* Was this inode dirtied too recently? */
if (wbc->older_than_this && time_after(inode->dirtied_when,
- *wbc->older_than_this))
+ *wbc->older_than_this)) {
+ list_splice_init(&sb->s_io, sb->s_dirty.prev);
break;
+ }
/* Is another pdflush already flushing this queue? */
if (current_is_pdflush() && !writeback_acquire(bdi))
@@ -457,6 +445,10 @@ sync_sb_inodes(struct super_block *sb, s
if (wbc->nr_to_write <= 0)
break;
}
+
+ if (list_empty(&sb->s_io))
+ list_splice_init(&sb->s_more_io, &sb->s_io);
+
return; /* Leave any unwritten inodes on s_io */
}
diff --git a/fs/super.c b/fs/super.c
index 5260d62..8c6fa35 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -67,6 +67,7 @@ static struct super_block *alloc_super(s
}
INIT_LIST_HEAD(&s->s_dirty);
INIT_LIST_HEAD(&s->s_io);
+ INIT_LIST_HEAD(&s->s_more_io);
INIT_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ae77c..e135913 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -934,6 +934,7 @@ #endif
struct list_head s_inodes; /* all inodes */
struct list_head s_dirty; /* dirty inodes */
struct list_head s_io; /* parked for writeback */
+ struct list_head s_more_io; /* parked for more writeback */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_files;
[-- Attachment #2: wb-s_more_io.patch --]
[-- Type: text/x-patch, Size: 3733 bytes --]
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6d961d1..a0cf041 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -140,25 +140,11 @@ static int write_inode(struct inode *ino
}
/*
- * Redirty an inode, but mark it as the very next-to-be-written inode on its
- * superblock's dirty-inode list.
- * We need to preserve s_dirty's reverse-time-orderedness, so we cheat by
- * setting this inode's dirtied_when to the same value as that of the inode
- * which is presently head-of-list, if present head-of-list is newer than this
- * inode. (head-of-list is the least-recently-dirtied inode: the oldest one).
+ * requeue inode for re-scanning after sb->s_io list is exhausted.
*/
-static void redirty_head(struct inode *inode)
+static void requeue_io(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
-
- if (!list_empty(&sb->s_dirty)) {
- struct inode *head_inode;
-
- head_inode = list_entry(sb->s_dirty.prev, struct inode, i_list);
- if (time_after(inode->dirtied_when, head_inode->dirtied_when))
- inode->dirtied_when = head_inode->dirtied_when;
- }
- list_move_tail(&inode->i_list, &sb->s_dirty);
+ list_move(&inode->i_list, &inode->i_sb->s_more_io);
}
/*
@@ -254,7 +240,7 @@ __sync_single_inode(struct inode *inode,
* uncongested.
*/
inode->i_state |= I_DIRTY_PAGES;
- redirty_head(inode);
+ requeue_io(inode);
} else {
/*
* Otherwise fully redirty the inode so that
@@ -314,7 +300,7 @@ __writeback_single_inode(struct inode *i
* on s_io. We'll have another go at writing back this inode
* when the s_dirty iodes get moved back onto s_io.
*/
- redirty_head(inode);
+ requeue_io(inode);
/*
* Even if we don't actually write the inode itself here,
@@ -409,14 +395,14 @@ sync_sb_inodes(struct super_block *sb, s
wbc->encountered_congestion = 1;
if (!sb_is_blkdev_sb(sb))
break; /* Skip a congested fs */
- redirty_head(inode);
+ requeue_io(inode);
continue; /* Skip a congested blockdev */
}
if (wbc->bdi && bdi != wbc->bdi) {
if (!sb_is_blkdev_sb(sb))
break; /* fs has the wrong queue */
- redirty_head(inode);
+ requeue_io(inode);
continue; /* blockdev has wrong queue */
}
@@ -426,8 +412,10 @@ sync_sb_inodes(struct super_block *sb, s
/* Was this inode dirtied too recently? */
if (wbc->older_than_this && time_after(inode->dirtied_when,
- *wbc->older_than_this))
+ *wbc->older_than_this)) {
+ list_splice_init(&sb->s_io, sb->s_dirty.prev);
break;
+ }
/* Is another pdflush already flushing this queue? */
if (current_is_pdflush() && !writeback_acquire(bdi))
@@ -457,6 +445,10 @@ sync_sb_inodes(struct super_block *sb, s
if (wbc->nr_to_write <= 0)
break;
}
+
+ if (list_empty(&sb->s_io))
+ list_splice_init(&sb->s_more_io, &sb->s_io);
+
return; /* Leave any unwritten inodes on s_io */
}
diff --git a/fs/super.c b/fs/super.c
index 5260d62..8c6fa35 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -67,6 +67,7 @@ static struct super_block *alloc_super(s
}
INIT_LIST_HEAD(&s->s_dirty);
INIT_LIST_HEAD(&s->s_io);
+ INIT_LIST_HEAD(&s->s_more_io);
INIT_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ae77c..e135913 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -934,6 +934,7 @@ #endif
struct list_head s_inodes; /* all inodes */
struct list_head s_dirty; /* dirty inodes */
struct list_head s_io; /* parked for writeback */
+ struct list_head s_more_io; /* parked for more writeback */
struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_files;
next reply other threads:[~2007-07-12 4:21 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-07-12 4:21 Ken Chen [this message]
2007-07-12 19:05 ` Andrew Morton
2007-07-13 22:17 ` Ken Chen
2007-07-17 0:01 ` Ken Chen
2007-07-17 0:15 ` Andrew Morton
[not found] ` <20070719025927.GA11874@mail.ustc.edu.cn>
2007-07-19 2:59 ` Fengguang Wu
2007-07-19 3:10 ` Andrew Morton
[not found] ` <20070719080910.GA7459@mail.ustc.edu.cn>
2007-07-19 8:09 ` Fengguang Wu
2007-07-19 8:18 ` Andrew Morton
2007-07-19 22:18 ` David Chinner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=b040c32a0707112121y21d08438u8ca7f138931827b0@mail.gmail.com \
--to=kenchen@google.com \
--cc=akpm@linux-foundation.org \
--cc=linux-mm@kvack.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox