From: Roman Gushchin <guro@fb.com>
To: Jan Kara <jack@suse.cz>, Tejun Heo <tj@kernel.org>
Cc: <linux-fsdevel@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
<linux-mm@kvack.org>, Alexander Viro <viro@zeniv.linux.org.uk>,
Dennis Zhou <dennis@kernel.org>,
Dave Chinner <dchinner@redhat.com>, <cgroups@vger.kernel.org>,
Roman Gushchin <guro@fb.com>
Subject: [PATCH v7 5/6] writeback, cgroup: support switching multiple inodes at once
Date: Thu, 3 Jun 2021 18:31:58 -0700 [thread overview]
Message-ID: <20210604013159.3126180-6-guro@fb.com> (raw)
In-Reply-To: <20210604013159.3126180-1-guro@fb.com>
Currently only a single inode can be switched to another writeback
structure at once. That means to switch an inode a separate
inode_switch_wbs_context structure must be allocated, and a separate
rcu callback and work must be scheduled.
It's fine for the existing ad-hoc switching, which is not happening
that often, but sub-optimal for the massive switching required in order
to release a writeback structure. To prepare for it, let's add support
for switching multiple inodes at once.
Instead of containing a single inode pointer, inode_switch_wbs_context
will contain a NULL-terminated array of inode pointers.
inode_do_switch_wbs() will be called for each inode.
To optimize the locking, bdi->wb_switch_rwsem and old_wb's and new_wb's
list_locks will be acquired and released only once for all inodes.
wb_wakeup() will also be called only once. Instead of calling
wb_put(old_wb) after each successful switch, wb_put_many() is
introduced and used.
Signed-off-by: Roman Gushchin <guro@fb.com>
---
fs/fs-writeback.c | 105 ++++++++++++++++++-------------
include/linux/backing-dev-defs.h | 18 +++++-
2 files changed, 79 insertions(+), 44 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d46cdeeb6797..5f5502238bf0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -335,10 +335,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}
struct inode_switch_wbs_context {
- struct inode *inode;
- struct bdi_writeback *new_wb;
-
struct rcu_work work;
+
+ /*
+ * Multiple inodes can be switched at once. The switching procedure
+ * consists of two parts, separated by a RCU grace period. To make
+ * sure that the second part is executed for each inode gone through
+ * the first part, all inode pointers are placed into a NULL-terminated
+ * array embedded into struct inode_switch_wbs_context. Otherwise
+ * an inode could be left in a non-consistent state.
+ */
+ struct bdi_writeback *new_wb;
+ struct inode *inodes[];
};
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
@@ -351,39 +359,15 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
up_write(&bdi->wb_switch_rwsem);
}
-static void inode_do_switch_wbs(struct inode *inode,
+static bool inode_do_switch_wbs(struct inode *inode,
+ struct bdi_writeback *old_wb,
struct bdi_writeback *new_wb)
{
- struct backing_dev_info *bdi = inode_to_bdi(inode);
struct address_space *mapping = inode->i_mapping;
- struct bdi_writeback *old_wb = inode->i_wb;
XA_STATE(xas, &mapping->i_pages, 0);
struct page *page;
bool switched = false;
- /*
- * If @inode switches cgwb membership while sync_inodes_sb() is
- * being issued, sync_inodes_sb() might miss it. Synchronize.
- */
- down_read(&bdi->wb_switch_rwsem);
-
- /*
- * By the time control reaches here, RCU grace period has passed
- * since I_WB_SWITCH assertion and all wb stat update transactions
- * between unlocked_inode_to_wb_begin/end() are guaranteed to be
- * synchronizing against the i_pages lock.
- *
- * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
- * gives us exclusion against all wb related operations on @inode
- * including IO list manipulations and stat updates.
- */
- if (old_wb < new_wb) {
- spin_lock(&old_wb->list_lock);
- spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
- } else {
- spin_lock(&new_wb->list_lock);
- spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
- }
spin_lock(&inode->i_lock);
xa_lock_irq(&mapping->i_pages);
@@ -458,25 +442,62 @@ static void inode_do_switch_wbs(struct inode *inode,
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&inode->i_lock);
- spin_unlock(&new_wb->list_lock);
- spin_unlock(&old_wb->list_lock);
-
- up_read(&bdi->wb_switch_rwsem);
- if (switched) {
- wb_wakeup(new_wb);
- wb_put(old_wb);
- }
+ return switched;
}
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
struct inode_switch_wbs_context *isw =
container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
+ struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
+ struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
+ struct bdi_writeback *new_wb = isw->new_wb;
+ unsigned long nr_switched = 0;
+ struct inode **inodep;
+
+ /*
+ * If @inode switches cgwb membership while sync_inodes_sb() is
+ * being issued, sync_inodes_sb() might miss it. Synchronize.
+ */
+ down_read(&bdi->wb_switch_rwsem);
+
+ /*
+ * By the time control reaches here, RCU grace period has passed
+ * since I_WB_SWITCH assertion and all wb stat update transactions
+ * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+ * synchronizing against the i_pages lock.
+ *
+ * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
+ * gives us exclusion against all wb related operations on @inode
+ * including IO list manipulations and stat updates.
+ */
+ if (old_wb < new_wb) {
+ spin_lock(&old_wb->list_lock);
+ spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&new_wb->list_lock);
+ spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+ }
+
+ for (inodep = isw->inodes; *inodep; inodep++) {
+ WARN_ON_ONCE((*inodep)->i_wb != old_wb);
+ if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
+ nr_switched++;
+ iput(*inodep);
+ }
+
+ spin_unlock(&new_wb->list_lock);
+ spin_unlock(&old_wb->list_lock);
+
+ up_read(&bdi->wb_switch_rwsem);
+
+ if (nr_switched) {
+ wb_wakeup(new_wb);
+ wb_put_many(old_wb, nr_switched);
+ }
- inode_do_switch_wbs(isw->inode, isw->new_wb);
- wb_put(isw->new_wb);
- iput(isw->inode);
+ wb_put(new_wb);
kfree(isw);
atomic_dec(&isw_nr_in_flight);
}
@@ -503,7 +524,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
return;
- isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+ isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
if (!isw)
return;
@@ -528,7 +549,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
__iget(inode);
spin_unlock(&inode->i_lock);
- isw->inode = inode;
+ isw->inodes[0] = inode;
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e5dc238ebe4f..63f52ad2ce7a 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -240,8 +240,9 @@ static inline void wb_get(struct bdi_writeback *wb)
/**
* wb_put - decrement a wb's refcount
* @wb: bdi_writeback to put
+ * @nr: number of references to put
*/
-static inline void wb_put(struct bdi_writeback *wb)
+static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
if (WARN_ON_ONCE(!wb->bdi)) {
/*
@@ -252,7 +253,16 @@ static inline void wb_put(struct bdi_writeback *wb)
}
if (wb != &wb->bdi->wb)
- percpu_ref_put(&wb->refcnt);
+ percpu_ref_put_many(&wb->refcnt, nr);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+ wb_put_many(wb, 1);
}
/**
@@ -281,6 +291,10 @@ static inline void wb_put(struct bdi_writeback *wb)
{
}
+static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
+{
+}
+
static inline bool wb_dying(struct bdi_writeback *wb)
{
return false;
--
2.31.1
next prev parent reply other threads:[~2021-06-04 1:32 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-06-04 1:31 [PATCH v7 0/6] cgroup, blkcg: prevent dirty inodes to pin dying memory cgroups Roman Gushchin
2021-06-04 1:31 ` [PATCH v7 1/6] writeback, cgroup: do not switch inodes with I_WILL_FREE flag Roman Gushchin
2021-06-07 8:48 ` Jan Kara
2021-06-04 1:31 ` [PATCH v7 2/6] writeback, cgroup: switch to rcu_work API in inode_switch_wbs() Roman Gushchin
2021-06-04 1:31 ` [PATCH v7 3/6] writeback, cgroup: keep list of inodes attached to bdi_writeback Roman Gushchin
2021-06-04 1:31 ` [PATCH v7 4/6] writeback, cgroup: split out the functional part of inode_switch_wbs_work_fn() Roman Gushchin
2021-06-04 1:31 ` Roman Gushchin [this message]
2021-06-07 9:00 ` [PATCH v7 5/6] writeback, cgroup: support switching multiple inodes at once Jan Kara
2021-06-04 1:31 ` [PATCH v7 6/6] writeback, cgroup: release dying cgwbs by switching attached inodes Roman Gushchin
2021-06-04 15:51 ` Tejun Heo
2021-06-05 21:34 ` Dennis Zhou
2021-06-08 0:20 ` Roman Gushchin
2021-06-07 9:24 ` Jan Kara
2021-06-04 15:53 ` [PATCH v7 0/6] cgroup, blkcg: prevent dirty inodes to pin dying memory cgroups Tejun Heo
2021-06-04 22:24 ` Roman Gushchin
2021-06-04 23:31 ` Tejun Heo
2021-06-05 21:37 ` Dennis Zhou
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210604013159.3126180-6-guro@fb.com \
--to=guro@fb.com \
--cc=cgroups@vger.kernel.org \
--cc=dchinner@redhat.com \
--cc=dennis@kernel.org \
--cc=jack@suse.cz \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=tj@kernel.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox