From: Dan Schatzberg <dschatzberg@fb.com>
Cc: Dan Schatzberg <dschatzberg@fb.com>, Jens Axboe <axboe@kernel.dk>,
Tejun Heo <tj@kernel.org>, Li Zefan <lizefan@huawei.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Michal Hocko <mhocko@kernel.org>,
Vladimir Davydov <vdavydov.dev@gmail.com>,
Andrew Morton <akpm@linux-foundation.org>,
Hugh Dickins <hughd@google.com>,
"open list:BLOCK LAYER" <linux-block@vger.kernel.org>,
open list <linux-kernel@vger.kernel.org>,
"open list:CONTROL GROUP (CGROUP)" <cgroups@vger.kernel.org>,
"open list:CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)"
<linux-mm@kvack.org>
Subject: [PATCH 2/2] loop: charge i/o per cgroup
Date: Wed, 5 Feb 2020 14:33:48 -0800 [thread overview]
Message-ID: <20200205223348.880610-3-dschatzberg@fb.com> (raw)
In-Reply-To: <20200205223348.880610-1-dschatzberg@fb.com>
Naively charging loopback i/o to the original issuing blkcg could
result in priority inversion because all loopback writes go through a
single kworker thread.
This patch modifies the per-loop-device implementation to have a
workqueue and a tree of blkcg -> struct loop_worker (which has a queue
of struct loop_cmd). This allows individual blkcgs to make forward
progress when one is blocked due to resource control. There is also a
single rootcg loop_cmd queue.
The locking here is a bit heavy handed - any time the tree or a queue is
accessed, we take the spinlock. However, the previous implementation
serialized everything through a single kworker so I don't think this is
any worse.
This code previously took references on the cgroups (css_get, with a
matching css_put on completion), but I don't believe this is necessary
since the bio_vec pins the cgroup until the end.
Signed-off-by: Dan Schatzberg <dschatzberg@fb.com>
---
drivers/block/loop.c | 179 ++++++++++++++++++++++++++++++++---------
drivers/block/loop.h | 11 +--
kernel/cgroup/cgroup.c | 1 +
3 files changed, 150 insertions(+), 41 deletions(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 739b372a5112..d0484eeecc9b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -70,7 +70,6 @@
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
-#include <linux/kthread.h>
#include <linux/splice.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
@@ -78,6 +77,7 @@
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/blk-cgroup.h>
+#include <linux/sched/mm.h>
#include "loop.h"
@@ -503,8 +503,6 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
{
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
- if (cmd->css)
- css_put(cmd->css);
cmd->ret = ret;
lo_rw_aio_do_completion(cmd);
}
@@ -565,8 +563,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
cmd->iocb.ki_complete = lo_rw_aio_complete;
cmd->iocb.ki_flags = IOCB_DIRECT;
cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
- if (cmd->css)
- kthread_associate_blkcg(cmd->css);
if (rw == WRITE)
ret = call_write_iter(file, &cmd->iocb, &iter);
@@ -574,7 +570,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
ret = call_read_iter(file, &cmd->iocb, &iter);
lo_rw_aio_do_completion(cmd);
- kthread_associate_blkcg(NULL);
if (ret != -EIOCBQUEUED)
cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
@@ -891,27 +886,95 @@ static void loop_config_discard(struct loop_device *lo)
static void loop_unprepare_queue(struct loop_device *lo)
{
- kthread_flush_worker(&lo->worker);
- kthread_stop(lo->worker_task);
-}
-
-static int loop_kthread_worker_fn(void *worker_ptr)
-{
- current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
- return kthread_worker_fn(worker_ptr);
+ flush_workqueue(lo->workqueue);
+ destroy_workqueue(lo->workqueue);
}
static int loop_prepare_queue(struct loop_device *lo)
{
- kthread_init_worker(&lo->worker);
- lo->worker_task = kthread_run(loop_kthread_worker_fn,
- &lo->worker, "loop%d", lo->lo_number);
- if (IS_ERR(lo->worker_task))
+ lo->workqueue = alloc_workqueue("loop%d",
+ WQ_FREEZABLE | WQ_MEM_RECLAIM |
+ WQ_HIGHPRI,
+ lo->lo_number);
+ if (IS_ERR(lo->workqueue))
return -ENOMEM;
- set_user_nice(lo->worker_task, MIN_NICE);
+
return 0;
}
+struct loop_worker {
+ struct rb_node rb_node;
+ struct work_struct work;
+ struct list_head cmd_list;
+ struct loop_device *lo;
+ int css_id;
+};
+
+static void loop_workfn(struct work_struct *work);
+static void loop_rootcg_workfn(struct work_struct *work);
+
+static void loop_queue_on_rootcg_locked(struct loop_device *lo,
+ struct loop_cmd *cmd)
+{
+ list_add_tail(&cmd->list_entry, &lo->rootcg_cmd_list);
+ if (list_is_first(&cmd->list_entry, &lo->rootcg_cmd_list))
+ queue_work(lo->workqueue, &lo->rootcg_work);
+}
+
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
+{
+ struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL;
+ struct loop_worker *cur_worker, *worker = NULL;
+ int css_id = 0;
+
+ if (cmd->blk_css)
+ css_id = cmd->blk_css->id;
+
+ spin_lock_irq(&lo->lo_lock);
+
+ if (!css_id) {
+ loop_queue_on_rootcg_locked(lo, cmd);
+ goto out;
+ }
+ node = &(lo->worker_tree.rb_node);
+
+ while (*node) {
+ parent = *node;
+ cur_worker = container_of(*node, struct loop_worker, rb_node);
+ if (cur_worker->css_id == css_id) {
+ worker = cur_worker;
+ break;
+ } else if (cur_worker->css_id < 0) {
+ node = &((*node)->rb_left);
+ } else {
+ node = &((*node)->rb_right);
+ }
+ }
+ if (worker) {
+ list_add_tail(&cmd->list_entry, &worker->cmd_list);
+ } else {
+ worker = kmalloc(sizeof(struct loop_worker), GFP_NOIO);
+ /*
+ * In the event we cannot allocate a worker, just
+ * queue on the rootcg worker
+ */
+ if (!worker) {
+ loop_queue_on_rootcg_locked(lo, cmd);
+ goto out;
+ }
+ worker->css_id = css_id;
+ INIT_WORK(&worker->work, loop_workfn);
+ INIT_LIST_HEAD(&worker->cmd_list);
+ worker->lo = lo;
+ rb_link_node(&worker->rb_node, parent, node);
+ rb_insert_color(&worker->rb_node, &lo->worker_tree);
+ list_add_tail(&cmd->list_entry, &worker->cmd_list);
+ queue_work(lo->workqueue, &worker->work);
+ }
+out:
+ spin_unlock_irq(&lo->lo_lock);
+}
+
static void loop_update_rotational(struct loop_device *lo)
{
struct file *file = lo->lo_backing_file;
@@ -993,6 +1056,8 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
+ INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
+ INIT_LIST_HEAD(&lo->rootcg_cmd_list);
lo->use_dio = false;
lo->lo_device = bdev;
lo->lo_flags = lo_flags;
@@ -1925,14 +1990,13 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
}
/* always use the first bio's css */
+ cmd->blk_css = NULL;
#ifdef CONFIG_BLK_CGROUP
- if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
- cmd->css = &bio_blkcg(rq->bio)->css;
- css_get(cmd->css);
- } else
+ if (rq->bio && rq->bio->bi_blkg)
+ cmd->blk_css = &bio_blkcg(rq->bio)->css;
#endif
- cmd->css = NULL;
- kthread_queue_work(&lo->worker, &cmd->work);
+
+ loop_queue_work(lo, cmd);
return BLK_STS_OK;
}
@@ -1942,6 +2006,9 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
struct request *rq = blk_mq_rq_from_pdu(cmd);
const bool write = op_is_write(req_op(rq));
struct loop_device *lo = rq->q->queuedata;
+#ifdef CONFIG_MEMCG
+ struct cgroup_subsys_state *mem_css;
+#endif
int ret = 0;
if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
@@ -1949,8 +2016,24 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
goto failed;
}
+ if (cmd->blk_css) {
+#ifdef CONFIG_MEMCG
+ mem_css = cgroup_get_e_css(cmd->blk_css->cgroup,
+ &memory_cgrp_subsys);
+ memalloc_use_memcg(mem_cgroup_from_css(mem_css));
+#endif
+ kthread_associate_blkcg(cmd->blk_css);
+ }
+
ret = do_req_filebacked(lo, rq);
- failed:
+
+ if (cmd->blk_css) {
+ kthread_associate_blkcg(NULL);
+#ifdef CONFIG_MEMCG
+ memalloc_unuse_memcg();
+#endif
+ }
+failed:
/* complete non-aio request */
if (!cmd->use_aio || ret) {
cmd->ret = ret ? -EIO : 0;
@@ -1958,26 +2041,50 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
}
}
-static void loop_queue_work(struct kthread_work *work)
+static void loop_process_work(struct loop_worker *worker,
+ struct list_head *cmd_list, struct loop_device *lo)
{
- struct loop_cmd *cmd =
- container_of(work, struct loop_cmd, work);
+ int orig_flags = current->flags;
+ struct loop_cmd *cmd;
+
+ while (1) {
+ spin_lock_irq(&lo->lo_lock);
+ if (list_empty(cmd_list)) {
+ if (worker)
+ rb_erase(&worker->rb_node, &lo->worker_tree);
+ spin_unlock_irq(&lo->lo_lock);
+ kfree(worker);
+ current->flags = orig_flags;
+ return;
+ }
- loop_handle_cmd(cmd);
+ cmd = container_of(
+ cmd_list->next, struct loop_cmd, list_entry);
+ list_del(cmd_list->next);
+ spin_unlock_irq(&lo->lo_lock);
+ current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
+ loop_handle_cmd(cmd);
+ current->flags = orig_flags;
+ cond_resched();
+ }
}
-static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq,
- unsigned int hctx_idx, unsigned int numa_node)
+static void loop_workfn(struct work_struct *work)
{
- struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct loop_worker *worker =
+ container_of(work, struct loop_worker, work);
+ loop_process_work(worker, &worker->cmd_list, worker->lo);
+}
- kthread_init_work(&cmd->work, loop_queue_work);
- return 0;
+static void loop_rootcg_workfn(struct work_struct *work)
+{
+ struct loop_device *lo =
+ container_of(work, struct loop_device, rootcg_work);
+ loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
}
static const struct blk_mq_ops loop_mq_ops = {
.queue_rq = loop_queue_rq,
- .init_request = loop_init_request,
.complete = lo_complete_rq,
};
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index af75a5ee4094..239d4921b441 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -14,7 +14,6 @@
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
-#include <linux/kthread.h>
#include <uapi/linux/loop.h>
/* Possible states of device */
@@ -54,8 +53,10 @@ struct loop_device {
spinlock_t lo_lock;
int lo_state;
- struct kthread_worker worker;
- struct task_struct *worker_task;
+ struct workqueue_struct *workqueue;
+ struct work_struct rootcg_work;
+ struct list_head rootcg_cmd_list;
+ struct rb_root worker_tree;
bool use_dio;
bool sysfs_inited;
@@ -65,13 +66,13 @@ struct loop_device {
};
struct loop_cmd {
- struct kthread_work work;
+ struct list_head list_entry;
bool use_aio; /* use AIO interface to handle I/O */
atomic_t ref; /* only for aio */
long ret;
struct kiocb iocb;
struct bio_vec *bvec;
- struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state *blk_css;
};
/* Support for loadable transfer modules */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 735af8f15f95..dd8270c6191c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -587,6 +587,7 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
rcu_read_unlock();
return css;
}
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);
static void cgroup_get_live(struct cgroup *cgrp)
{
--
2.17.1
next prev parent reply other threads:[~2020-02-05 22:44 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <20200205223348.880610-1-dschatzberg@fb.com>
2020-02-05 22:33 ` [PATCH 1/2] mm: Charge current memcg when no mm is set Dan Schatzberg
2020-02-06 15:27 ` Johannes Weiner
2020-02-12 23:06 ` Tejun Heo
2020-02-05 22:33 ` Dan Schatzberg [this message]
2020-02-06 15:46 ` [PATCH 2/2] loop: charge i/o per cgroup Johannes Weiner
2020-02-13 0:07 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200205223348.880610-3-dschatzberg@fb.com \
--to=dschatzberg@fb.com \
--cc=akpm@linux-foundation.org \
--cc=axboe@kernel.dk \
--cc=cgroups@vger.kernel.org \
--cc=hannes@cmpxchg.org \
--cc=hughd@google.com \
--cc=linux-block@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lizefan@huawei.com \
--cc=mhocko@kernel.org \
--cc=tj@kernel.org \
--cc=vdavydov.dev@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox