[PATCH 1/2] mm: Charge current memcg when no mm is set
From: Dan Schatzberg @ 2020-02-05 22:33 UTC
Cc: Dan Schatzberg, Jens Axboe, Tejun Heo, Li Zefan, Johannes Weiner,
    Michal Hocko, Vladimir Davydov, Andrew Morton, Hugh Dickins,
    open list:BLOCK LAYER, open list,
    open list:CONTROL GROUP (CGROUP),
    open list:CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
This modifies the shmem and memcg charge logic so that if there is no
mm set (as in the case of a tmpfs-backed loop device), we charge the
current task's memcg, preferring the active memcg if one is set.
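For illustration, here is a minimal sketch (not part of this patch) of
how a worker thread doing I/O on behalf of some cgroup can make use of
the new behavior; dispatch_io() is a made-up stand-in, while
memalloc_use_memcg() and the NULL-mm charge path are the real
interfaces involved:

  #include <linux/memcontrol.h>
  #include <linux/sched/mm.h>

  /*
   * Hypothetical caller sketch: with this patch, a charge that
   * reaches mem_cgroup_try_charge() with mm == NULL (e.g. via the
   * shmem charge path for a tmpfs-backed loop device) is made
   * against current->active_memcg instead of the root cgroup.
   */
  static void service_io(struct mem_cgroup *memcg)
  {
          memalloc_use_memcg(memcg);      /* set current->active_memcg */
          dispatch_io();                  /* NULL-mm charges land on @memcg */
          memalloc_unuse_memcg();         /* clear active_memcg */
  }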
Signed-off-by: Dan Schatzberg <dschatzberg@fb.com>
---
mm/memcontrol.c | 11 ++++++++---
mm/shmem.c | 2 +-
2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c5b5f74cfd4d..5a6ab3183525 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6354,7 +6354,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
* @compound: charge the page as compound or small page
*
* Try to charge @page to the memcg that @mm belongs to, reclaiming
- * pages according to @gfp_mask if necessary.
+ * pages according to @gfp_mask if necessary. If @mm is NULL, try to
+ * charge to the current task's memcg (the active memcg, if set).
*
* Returns 0 on success, with *@memcgp pointing to the charged memcg.
* Otherwise, an error code is returned.
@@ -6398,8 +6399,12 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
}
}
- if (!memcg)
- memcg = get_mem_cgroup_from_mm(mm);
+ if (!memcg) {
+ if (!mm)
+ memcg = get_mem_cgroup_from_current();
+ else
+ memcg = get_mem_cgroup_from_mm(mm);
+ }
ret = try_charge(memcg, gfp_mask, nr_pages);
diff --git a/mm/shmem.c b/mm/shmem.c
index 165fa6332993..014e576a617b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1766,7 +1766,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
}
sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = vma ? vma->vm_mm : current->mm;
+ charge_mm = vma ? vma->vm_mm : NULL;
page = find_lock_entry(mapping, index);
if (xa_is_value(page)) {
--
2.17.1
[PATCH 2/2] loop: charge i/o per cgroup
From: Dan Schatzberg @ 2020-02-05 22:33 UTC
Cc: Dan Schatzberg, Jens Axboe, Tejun Heo, Li Zefan, Johannes Weiner,
    Michal Hocko, Vladimir Davydov, Andrew Morton, Hugh Dickins,
    open list:BLOCK LAYER, open list,
    open list:CONTROL GROUP (CGROUP),
    open list:CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
Naively charging loopback i/o to the original issuing blkcg could
result in priority inversion: all loopback writes go through a single
kworker thread, so i/o issued by a heavily throttled cgroup can block
the i/o of every other cgroup behind it.
This patch modifies the per-loop-device implementation to use a
workqueue plus an rbtree mapping each blkcg to a struct loop_worker
(which holds a queue of struct loop_cmd). This allows the remaining
blkcgs to make forward progress while one is blocked due to resource
control. There is also a single rootcg loop_cmd queue for commands
with no associated blkcg; the dispatch is sketched below.
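In outline, the queueing decision works like the condensed sketch
below; lookup_or_create_worker() is a stand-in for the rbtree
find-or-insert in loop_queue_work() in the diff that follows, and the
lo_lock locking is omitted:

  /* Condensed sketch of the dispatch in loop_queue_work() below. */
  static void queue_cmd_sketch(struct loop_device *lo, struct loop_cmd *cmd)
  {
          struct loop_worker *worker = NULL;
          int css_id = cmd->blk_css ? cmd->blk_css->id : 0;

          if (css_id)
                  worker = lookup_or_create_worker(lo, css_id);

          if (worker) {
                  /* per-blkcg worker, so cgroups do not block each other */
                  list_add_tail(&cmd->list_entry, &worker->cmd_list);
                  queue_work(lo->workqueue, &worker->work);
          } else {
                  /* no blkcg, or worker allocation failed: rootcg queue */
                  list_add_tail(&cmd->list_entry, &lo->rootcg_cmd_list);
                  queue_work(lo->workqueue, &lo->rootcg_work);
          }
  }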
The locking here is a bit heavy-handed: any time the tree or a queue is
accessed, we take the spinlock. However, the previous implementation
serialized everything through a single kworker, so I don't think this
is any worse.
This code previously took references on the cgroup css
(css_get/css_put), but I don't believe this is necessary: the bios
themselves pin the css until the request completes.
Signed-off-by: Dan Schatzberg <dschatzberg@fb.com>
---
drivers/block/loop.c | 180 ++++++++++++++++++++++++++++++++---------
drivers/block/loop.h | 11 +--
kernel/cgroup/cgroup.c | 1 +
3 files changed, 151 insertions(+), 41 deletions(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 739b372a5112..d0484eeecc9b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -70,7 +70,6 @@
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
-#include <linux/kthread.h>
#include <linux/splice.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
@@ -78,6 +77,7 @@
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/blk-cgroup.h>
+#include <linux/sched/mm.h>
#include "loop.h"
@@ -503,8 +503,6 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
{
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
- if (cmd->css)
- css_put(cmd->css);
cmd->ret = ret;
lo_rw_aio_do_completion(cmd);
}
@@ -565,8 +563,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
cmd->iocb.ki_complete = lo_rw_aio_complete;
cmd->iocb.ki_flags = IOCB_DIRECT;
cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
- if (cmd->css)
- kthread_associate_blkcg(cmd->css);
if (rw == WRITE)
ret = call_write_iter(file, &cmd->iocb, &iter);
@@ -574,7 +570,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
ret = call_read_iter(file, &cmd->iocb, &iter);
lo_rw_aio_do_completion(cmd);
- kthread_associate_blkcg(NULL);
if (ret != -EIOCBQUEUED)
cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
@@ -891,27 +886,95 @@ static void loop_config_discard(struct loop_device *lo)
static void loop_unprepare_queue(struct loop_device *lo)
{
- kthread_flush_worker(&lo->worker);
- kthread_stop(lo->worker_task);
-}
-
-static int loop_kthread_worker_fn(void *worker_ptr)
-{
- current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
- return kthread_worker_fn(worker_ptr);
+ flush_workqueue(lo->workqueue);
+ destroy_workqueue(lo->workqueue);
}
static int loop_prepare_queue(struct loop_device *lo)
{
- kthread_init_worker(&lo->worker);
- lo->worker_task = kthread_run(loop_kthread_worker_fn,
- &lo->worker, "loop%d", lo->lo_number);
- if (IS_ERR(lo->worker_task))
+ lo->workqueue = alloc_workqueue("loop%d",
+ WQ_FREEZABLE | WQ_MEM_RECLAIM |
+ WQ_HIGHPRI,
+ 0, lo->lo_number);
+ if (!lo->workqueue)
return -ENOMEM;
- set_user_nice(lo->worker_task, MIN_NICE);
+
return 0;
}
+struct loop_worker {
+ struct rb_node rb_node;
+ struct work_struct work;
+ struct list_head cmd_list;
+ struct loop_device *lo;
+ int css_id;
+};
+
+static void loop_workfn(struct work_struct *work);
+static void loop_rootcg_workfn(struct work_struct *work);
+
+static void loop_queue_on_rootcg_locked(struct loop_device *lo,
+ struct loop_cmd *cmd)
+{
+ list_add_tail(&cmd->list_entry, &lo->rootcg_cmd_list);
+ if (list_is_first(&cmd->list_entry, &lo->rootcg_cmd_list))
+ queue_work(lo->workqueue, &lo->rootcg_work);
+}
+
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
+{
+ struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL;
+ struct loop_worker *cur_worker, *worker = NULL;
+ int css_id = 0;
+
+ if (cmd->blk_css)
+ css_id = cmd->blk_css->id;
+
+ spin_lock_irq(&lo->lo_lock);
+
+ if (!css_id) {
+ loop_queue_on_rootcg_locked(lo, cmd);
+ goto out;
+ }
+ node = &(lo->worker_tree.rb_node);
+
+ while (*node) {
+ parent = *node;
+ cur_worker = container_of(*node, struct loop_worker, rb_node);
+ if (cur_worker->css_id == css_id) {
+ worker = cur_worker;
+ break;
+ } else if (cur_worker->css_id < css_id) {
+ node = &((*node)->rb_left);
+ } else {
+ node = &((*node)->rb_right);
+ }
+ }
+ if (worker) {
+ list_add_tail(&cmd->list_entry, &worker->cmd_list);
+ } else {
+ worker = kmalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN);
+ /*
+ * lo_lock is held, so the allocation must not sleep. In
+ * the event we cannot allocate a worker, just queue on
+ * the rootcg worker.
+ */
+ if (!worker) {
+ loop_queue_on_rootcg_locked(lo, cmd);
+ goto out;
+ }
+ worker->css_id = css_id;
+ INIT_WORK(&worker->work, loop_workfn);
+ INIT_LIST_HEAD(&worker->cmd_list);
+ worker->lo = lo;
+ rb_link_node(&worker->rb_node, parent, node);
+ rb_insert_color(&worker->rb_node, &lo->worker_tree);
+ list_add_tail(&cmd->list_entry, &worker->cmd_list);
+ queue_work(lo->workqueue, &worker->work);
+ }
+out:
+ spin_unlock_irq(&lo->lo_lock);
+}
+
static void loop_update_rotational(struct loop_device *lo)
{
struct file *file = lo->lo_backing_file;
@@ -993,6 +1056,8 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
+ INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
+ INIT_LIST_HEAD(&lo->rootcg_cmd_list);
lo->use_dio = false;
lo->lo_device = bdev;
lo->lo_flags = lo_flags;
@@ -1925,14 +1990,13 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
}
/* always use the first bio's css */
+ cmd->blk_css = NULL;
#ifdef CONFIG_BLK_CGROUP
- if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
- cmd->css = &bio_blkcg(rq->bio)->css;
- css_get(cmd->css);
- } else
+ if (rq->bio && rq->bio->bi_blkg)
+ cmd->blk_css = &bio_blkcg(rq->bio)->css;
#endif
- cmd->css = NULL;
- kthread_queue_work(&lo->worker, &cmd->work);
+
+ loop_queue_work(lo, cmd);
return BLK_STS_OK;
}
@@ -1942,6 +2006,9 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
struct request *rq = blk_mq_rq_from_pdu(cmd);
const bool write = op_is_write(req_op(rq));
struct loop_device *lo = rq->q->queuedata;
+#ifdef CONFIG_MEMCG
+ struct cgroup_subsys_state *mem_css;
+#endif
int ret = 0;
if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
@@ -1949,8 +2016,25 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
goto failed;
}
+ if (cmd->blk_css) {
+#ifdef CONFIG_MEMCG
+ mem_css = cgroup_get_e_css(cmd->blk_css->cgroup,
+ &memory_cgrp_subsys);
+ memalloc_use_memcg(mem_cgroup_from_css(mem_css));
+#endif
+ kthread_associate_blkcg(cmd->blk_css);
+ }
+
ret = do_req_filebacked(lo, rq);
- failed:
+
+ if (cmd->blk_css) {
+ kthread_associate_blkcg(NULL);
+#ifdef CONFIG_MEMCG
+ memalloc_unuse_memcg();
+ css_put(mem_css); /* ref taken by cgroup_get_e_css() above */
+#endif
+ }
+failed:
/* complete non-aio request */
if (!cmd->use_aio || ret) {
cmd->ret = ret ? -EIO : 0;
@@ -1958,26 +2042,50 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
}
}
-static void loop_queue_work(struct kthread_work *work)
+static void loop_process_work(struct loop_worker *worker,
+ struct list_head *cmd_list, struct loop_device *lo)
{
- struct loop_cmd *cmd =
- container_of(work, struct loop_cmd, work);
+ int orig_flags = current->flags;
+ struct loop_cmd *cmd;
+
+ while (1) {
+ spin_lock_irq(&lo->lo_lock);
+ if (list_empty(cmd_list)) {
+ if (worker)
+ rb_erase(&worker->rb_node, &lo->worker_tree);
+ spin_unlock_irq(&lo->lo_lock);
+ kfree(worker);
+ current->flags = orig_flags;
+ return;
+ }
- loop_handle_cmd(cmd);
+ cmd = container_of(
+ cmd_list->next, struct loop_cmd, list_entry);
+ list_del(cmd_list->next);
+ spin_unlock_irq(&lo->lo_lock);
+ current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
+ loop_handle_cmd(cmd);
+ current->flags = orig_flags;
+ cond_resched();
+ }
}
-static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq,
- unsigned int hctx_idx, unsigned int numa_node)
+static void loop_workfn(struct work_struct *work)
{
- struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct loop_worker *worker =
+ container_of(work, struct loop_worker, work);
+ loop_process_work(worker, &worker->cmd_list, worker->lo);
+}
- kthread_init_work(&cmd->work, loop_queue_work);
- return 0;
+static void loop_rootcg_workfn(struct work_struct *work)
+{
+ struct loop_device *lo =
+ container_of(work, struct loop_device, rootcg_work);
+ loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
}
static const struct blk_mq_ops loop_mq_ops = {
.queue_rq = loop_queue_rq,
- .init_request = loop_init_request,
.complete = lo_complete_rq,
};
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index af75a5ee4094..239d4921b441 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -14,7 +14,6 @@
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
-#include <linux/kthread.h>
#include <uapi/linux/loop.h>
/* Possible states of device */
@@ -54,8 +53,10 @@ struct loop_device {
spinlock_t lo_lock;
int lo_state;
- struct kthread_worker worker;
- struct task_struct *worker_task;
+ struct workqueue_struct *workqueue;
+ struct work_struct rootcg_work;
+ struct list_head rootcg_cmd_list;
+ struct rb_root worker_tree;
bool use_dio;
bool sysfs_inited;
@@ -65,13 +66,13 @@ struct loop_device {
};
struct loop_cmd {
- struct kthread_work work;
+ struct list_head list_entry;
bool use_aio; /* use AIO interface to handle I/O */
atomic_t ref; /* only for aio */
long ret;
struct kiocb iocb;
struct bio_vec *bvec;
- struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state *blk_css;
};
/* Support for loadable transfer modules */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 735af8f15f95..dd8270c6191c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -587,6 +587,7 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
rcu_read_unlock();
return css;
}
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);
static void cgroup_get_live(struct cgroup *cgrp)
{
--
2.17.1