From: Greg Thelen <gthelen@google.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
containers@lists.osdl.org, linux-fsdevel@vger.kernel.org,
Andrea Righi <arighi@develer.com>,
Balbir Singh <balbir@linux.vnet.ibm.com>,
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>,
Minchan Kim <minchan.kim@gmail.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Ciju Rajan K <ciju@linux.vnet.ibm.com>,
David Rientjes <rientjes@google.com>,
Wu Fengguang <fengguang.wu@intel.com>,
Vivek Goyal <vgoyal@redhat.com>,
Dave Chinner <david@fromorbit.com>,
Greg Thelen <gthelen@google.com>
Subject: [RFC][PATCH v7 11/14] memcg: create support routines for writeback
Date: Fri, 13 May 2011 01:47:50 -0700
Message-ID: <1305276473-14780-12-git-send-email-gthelen@google.com>
In-Reply-To: <1305276473-14780-1-git-send-email-gthelen@google.com>
Introduce memcg routines to assist in per-memcg writeback:
- mem_cgroups_over_bground_dirty_thresh() determines if any cgroups need
writeback because they are over their background dirty memory threshold.
- should_writeback_mem_cgroup_inode() determines if an inode is
contributing pages to an over-limit memcg.
- mem_cgroup_writeback_done() is used periodically during writeback to
update memcg writeback data.
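
To make the intended call pattern concrete, here is a small illustrative
sketch (not part of this patch) of how a background writeback path could
consult these routines. The helper name, the simplified walk of the bdi's
dirty inode list, and the wbc initialization are hypothetical; only the three
routines listed above are added by this patch, and wbc->shared_inodes comes
from patch 08/14 of this series.

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>

/* Illustrative only: locking and the real writeback loop are omitted. */
static void example_memcg_background_writeback(struct bdi_writeback *wb)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.for_background	= 1,
		/* skip inodes shared by multiple memcg on the first pass */
		.shared_inodes	= 0,
	};
	struct inode *inode;

	/* Nothing to do unless some memcg is over its background limit. */
	if (!mem_cgroups_over_bground_dirty_thresh())
		return;

	/* Simplified scan of the bdi's dirty inode list. */
	list_for_each_entry(inode, &wb->b_dirty, i_wb_list) {
		/* Only consider inodes that back an over-limit memcg. */
		if (!should_writeback_mem_cgroup_inode(inode, &wbc))
			continue;
		/* ... write back some of this inode's dirty pages ... */
	}

	/* Re-check which memcg are still over their background limit. */
	mem_cgroup_writeback_done();
}
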
Signed-off-by: Greg Thelen <gthelen@google.com>
---
include/linux/memcontrol.h | 22 +++++++
include/trace/events/memcontrol.h | 49 ++++++++++++++++
mm/memcontrol.c | 116 +++++++++++++++++++++++++++++++++++++
3 files changed, 187 insertions(+), 0 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f06c2de..3d72e09 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -26,6 +26,7 @@ struct mem_cgroup;
struct page_cgroup;
struct page;
struct mm_struct;
+struct writeback_control;
/*
* Per mem_cgroup page counts tracked by kernel. As pages enter and leave these
@@ -162,6 +163,11 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
mem_cgroup_update_page_stat(page, idx, -1);
}
+bool should_writeback_mem_cgroup_inode(struct inode *inode,
+ struct writeback_control *wbc);
+bool mem_cgroups_over_bground_dirty_thresh(void);
+void mem_cgroup_writeback_done(void);
+
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
@@ -361,6 +367,22 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
{
}
+static inline bool
+should_writeback_mem_cgroup_inode(struct inode *inode,
+ struct writeback_control *wbc)
+{
+ return true;
+}
+
+static inline bool mem_cgroups_over_bground_dirty_thresh(void)
+{
+ return true;
+}
+
+static inline void mem_cgroup_writeback_done(void)
+{
+}
+
static inline
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
diff --git a/include/trace/events/memcontrol.h b/include/trace/events/memcontrol.h
index abf1306..326a66b 100644
--- a/include/trace/events/memcontrol.h
+++ b/include/trace/events/memcontrol.h
@@ -60,6 +60,55 @@ TRACE_EVENT(mem_cgroup_dirty_info,
__entry->nr_unstable_nfs)
)
+TRACE_EVENT(should_writeback_mem_cgroup_inode,
+ TP_PROTO(struct inode *inode,
+ struct writeback_control *wbc,
+ bool over_limit),
+
+ TP_ARGS(inode, wbc, over_limit),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned short, css_id)
+ __field(bool, shared_inodes)
+ __field(bool, over_limit)
+ ),
+
+ TP_fast_assign(
+ __entry->ino = inode->i_ino;
+ __entry->css_id =
+ inode->i_mapping ? inode->i_mapping->i_memcg : 0;
+ __entry->shared_inodes = wbc->shared_inodes;
+ __entry->over_limit = over_limit;
+ ),
+
+ TP_printk("ino=%ld css_id=%d shared_inodes=%d over_limit=%d",
+ __entry->ino,
+ __entry->css_id,
+ __entry->shared_inodes,
+ __entry->over_limit)
+)
+
+TRACE_EVENT(mem_cgroups_over_bground_dirty_thresh,
+ TP_PROTO(bool over_limit,
+ unsigned short first_id),
+
+ TP_ARGS(over_limit, first_id),
+
+ TP_STRUCT__entry(
+ __field(bool, over_limit)
+ __field(unsigned short, first_id)
+ ),
+
+ TP_fast_assign(
+ __entry->over_limit = over_limit;
+ __entry->first_id = first_id;
+ ),
+
+ TP_printk("over_limit=%d first_css_id=%d", __entry->over_limit,
+ __entry->first_id)
+)
+
#endif /* _TRACE_MEMCONTROL_H */
/* This part must be outside protection */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75ef32c..230f0fb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -389,10 +389,18 @@ enum charge_type {
#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
+/*
+ * A bitmap representing all possible memcg, indexed by css_id. Each bit
+ * indicates if the respective memcg is over its background dirty memory
+ * limit.
+ */
+static DECLARE_BITMAP(over_bground_dirty_thresh, CSS_ID_MAX + 1);
+
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(void);
+static struct mem_cgroup *mem_cgroup_lookup(unsigned short id);
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -1503,6 +1511,114 @@ static void mem_cgroup_dirty_info(unsigned long sys_available_mem,
trace_mem_cgroup_dirty_info(css_id(&mem->css), info);
}
+/* Are any memcg over their background dirty memory limit? */
+bool mem_cgroups_over_bground_dirty_thresh(void)
+{
+ bool over_thresh;
+
+ over_thresh = !bitmap_empty(over_bground_dirty_thresh, CSS_ID_MAX + 1);
+
+ trace_mem_cgroups_over_bground_dirty_thresh(
+ over_thresh,
+ over_thresh ? find_next_bit(over_bground_dirty_thresh,
+ CSS_ID_MAX + 1, 0) : 0);
+
+ return over_thresh;
+}
+
+/*
+ * Should this inode be written back? wbc indicates whether this is foreground
+ * or background writeback and which set of inodes is worth considering.
+ */
+bool should_writeback_mem_cgroup_inode(struct inode *inode,
+ struct writeback_control *wbc)
+{
+ unsigned short id;
+ bool over;
+
+ id = inode->i_mapping->i_memcg;
+ VM_BUG_ON(id >= CSS_ID_MAX + 1);
+
+ if (wbc->shared_inodes && id == I_MEMCG_SHARED)
+ over = true;
+ else
+ over = test_bit(id, over_bground_dirty_thresh);
+
+ trace_should_writeback_mem_cgroup_inode(inode, wbc, over);
+ return over;
+}
+
+/*
+ * Mark @mem and all of its child cgroups as eligible for writeback because
+ * @mem is over its background dirty memory threshold.
+ */
+static void mem_cgroup_mark_over_bg_thresh(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *iter;
+
+ /* mark this memcg and all of its child cgroups as writeback candidates */
+ for_each_mem_cgroup_tree(iter, mem)
+ set_bit(css_id(&iter->css), over_bground_dirty_thresh);
+}
+
+static void mem_cgroup_queue_bg_writeback(struct mem_cgroup *mem,
+ struct backing_dev_info *bdi)
+{
+ mem_cgroup_mark_over_bg_thresh(mem);
+ bdi_start_background_writeback(bdi);
+}
+
+/*
+ * This routine is called when per-memcg writeback completes. It rescans the
+ * memcg that were previously over their background dirty memory limit to
+ * determine whether they are still over it.
+ */
+void mem_cgroup_writeback_done(void)
+{
+ struct mem_cgroup *mem;
+ struct mem_cgroup *ref_mem;
+ struct dirty_info info;
+ unsigned long sys_available_mem;
+ int id;
+
+ sys_available_mem = 0;
+
+ /* for each previously over-bg-limit memcg... */
+ for (id = 0; (id = find_next_bit(over_bground_dirty_thresh,
+ CSS_ID_MAX + 1, id)) < CSS_ID_MAX + 1;
+ id++) {
+
+ /* reference the memcg */
+ rcu_read_lock();
+ mem = mem_cgroup_lookup(id);
+ if (mem && !css_tryget(&mem->css))
+ mem = NULL;
+ rcu_read_unlock();
+ if (!mem)
+ continue;
+ ref_mem = mem;
+
+ if (!sys_available_mem)
+ sys_available_mem = determine_dirtyable_memory();
+
+ /*
+ * Walk the ancestry of this memcg, clearing the over-limit bit
+ * for any memcg that is now under its background dirty memory
+ * threshold.
+ */
+ for (; mem_cgroup_has_dirty_limit(mem);
+ mem = parent_mem_cgroup(mem)) {
+ mem_cgroup_dirty_info(sys_available_mem, mem, &info);
+ if (dirty_info_reclaimable(&info) >= info.dirty_thresh)
+ break;
+
+ clear_bit(css_id(&mem->css), over_bground_dirty_thresh);
+ }
+
+ css_put(&ref_mem->css);
+ }
+}
+
static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
int cpu;
--
1.7.3.1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2011-05-13 8:52 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-05-13 8:47 [RFC][PATCH v7 00/14] memcg: per cgroup dirty page accounting Greg Thelen
2011-05-13 8:47 ` [RFC][PATCH v7 01/14] memcg: document cgroup dirty memory interfaces Greg Thelen
2011-05-13 8:47 ` [RFC][PATCH v7 02/14] memcg: add page_cgroup flags for dirty page tracking Greg Thelen
2011-05-13 8:47 ` [RFC][PATCH v7 03/14] memcg: add mem_cgroup_mark_inode_dirty() Greg Thelen
2011-05-13 9:31 ` KAMEZAWA Hiroyuki
2011-05-13 8:47 ` [RFC][PATCH v7 04/14] memcg: add dirty page accounting infrastructure Greg Thelen
2011-05-13 9:33 ` KAMEZAWA Hiroyuki
2011-05-13 8:47 ` [RFC][PATCH v7 05/14] memcg: add kernel calls for memcg dirty page stats Greg Thelen
2011-05-13 8:47 ` [RFC][PATCH v7 06/14] memcg: add dirty limits to mem_cgroup Greg Thelen
2011-05-13 8:47 ` [RFC][PATCH v7 07/14] memcg: add cgroupfs interface to memcg dirty limits Greg Thelen
2011-05-13 8:47 ` [RFC][PATCH v7 08/14] writeback: add memcg fields to writeback_control Greg Thelen
2011-05-13 9:41 ` KAMEZAWA Hiroyuki
2011-05-15 19:53 ` Greg Thelen
2011-05-13 8:47 ` [RFC][PATCH v7 09/14] cgroup: move CSS_ID_MAX to cgroup.h Greg Thelen
2011-05-13 9:51 ` KAMEZAWA Hiroyuki
2011-05-15 19:53 ` Greg Thelen
2011-05-13 8:47 ` [RFC][PATCH v7 10/14] memcg: dirty page accounting support routines Greg Thelen
2011-05-13 9:56 ` KAMEZAWA Hiroyuki
2011-05-15 19:56 ` Greg Thelen
2011-05-16 5:58 ` KAMEZAWA Hiroyuki
2011-05-13 8:47 ` Greg Thelen [this message]
2011-05-13 10:04 ` [RFC][PATCH v7 11/14] memcg: create support routines for writeback KAMEZAWA Hiroyuki
2011-05-15 19:56 ` Greg Thelen
2011-05-13 8:47 ` [RFC][PATCH v7 12/14] memcg: create support routines for page-writeback Greg Thelen
2011-05-13 8:47 ` [RFC][PATCH v7 13/14] writeback: make background writeback cgroup aware Greg Thelen
2011-05-13 10:14 ` KAMEZAWA Hiroyuki
2011-05-13 8:47 ` [RFC][PATCH v7 14/14] memcg: check memcg dirty limits in page writeback Greg Thelen
2011-05-13 9:25 ` [RFC][PATCH v7 00/14] memcg: per cgroup dirty page accounting KAMEZAWA Hiroyuki
2011-05-14 0:54 ` Greg Thelen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1305276473-14780-12-git-send-email-gthelen@google.com \
--to=gthelen@google.com \
--cc=akpm@linux-foundation.org \
--cc=arighi@develer.com \
--cc=balbir@linux.vnet.ibm.com \
--cc=ciju@linux.vnet.ibm.com \
--cc=containers@lists.osdl.org \
--cc=david@fromorbit.com \
--cc=fengguang.wu@intel.com \
--cc=hannes@cmpxchg.org \
--cc=kamezawa.hiroyu@jp.fujitsu.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=minchan.kim@gmail.com \
--cc=nishimura@mxp.nes.nec.co.jp \
--cc=rientjes@google.com \
--cc=vgoyal@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox