From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
"nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>,
"xemul@openvz.org" <xemul@openvz.org>,
"menage@google.com" <menage@google.com>
Subject: [RFC][PATCH 10/11] memcg: swap cgroup
Date: Thu, 23 Oct 2008 18:13:49 +0900 [thread overview]
Message-ID: <20081023181349.63096aeb.kamezawa.hiroyu@jp.fujitsu.com> (raw)
In-Reply-To: <20081023175800.73afc957.kamezawa.hiroyu@jp.fujitsu.com>
For accounting swap, we need a record per swap entry, at least.
This patch adds the following functions.
- swap_cgroup_swapon() .... called from swapon
- swap_cgroup_swapoff() ... called at the end of swapoff.
- swap_cgroup_record() .... record information of swap entry.
- lookup_swap_cgroup() .... lookup information of swap entry.
This patch just implements "how to record information"; it adds no actual
method for limiting the usage of swap. These routines use a flat table
to record and look up. A "wise" lookup system like a radix-tree requires memory
allocation for new records, but swap-out is usually called under memory
shortage (or when memcg hits its limit). So, I used static allocation.
Note1: In this patch, we use a pointer to record information, which means
8 bytes per swap entry. I think we can reduce this when we
create an "id of cgroup" in the range of 0-65535 or 0-255.
Note2: array of swap_cgroup is allocated from HIGHMEM. maybe good for x86-32.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
include/linux/page_cgroup.h | 34 +++++++
mm/page_cgroup.c | 199 ++++++++++++++++++++++++++++++++++++++++++++
mm/swapfile.c | 8 +
3 files changed, 241 insertions(+)
Index: mmotm-2.6.27+/mm/page_cgroup.c
===================================================================
--- mmotm-2.6.27+.orig/mm/page_cgroup.c
+++ mmotm-2.6.27+/mm/page_cgroup.c
@@ -9,6 +9,8 @@
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
+#include <linux/swapops.h>
+#include <linux/highmem.h>
static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -255,3 +257,200 @@ void __init pgdat_page_cgroup_init(struc
}
#endif
+
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+
+/* Held by swap_cgroup_swapon()/swap_cgroup_swapoff() while map and length change. */
+DEFINE_MUTEX(swap_cgroup_mutex);
+/*
+ * Bookkeeping for the swap_cgroup records of one swap type.
+ */
+struct swap_cgroup_ctrl {
+ /* taken with irqsave around every read or write of a record */
+ spinlock_t lock;
+ /* vmalloc'ed array of the pages that hold the records */
+ struct page **map;
+ /* number of entries in map */
+ unsigned long length;
+};
+
+/* One control structure per swap type, indexed by swp_type() of the entry. */
+struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+
+/*
+ * One record per swap entry; it holds the mem_cgroup pointer stored by
+ * swap_cgroup_record().
+ * This 8bytes seems big..maybe we can reduce this when we can use "id" for
+ * cgroup rather than pointer.
+ */
+struct swap_cgroup {
+ struct mem_cgroup *val;
+};
+/* number of records that fit into one page of the map */
+#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
+/*
+ * offset & SC_POS_MASK is the position of a record inside its page.
+ * NOTE(review): this assumes SC_PER_PAGE is a power of two - confirm.
+ */
+#define SC_POS_MASK (SC_PER_PAGE - 1)
+
+/*
+ * Fill every slot of the map of swap type @type with a zeroed page
+ * (highmem pages are allowed).
+ *
+ * Returns 0 on success or when swap accounting is disabled.  If one
+ * allocation fails, the pages obtained so far are freed again and -ENOMEM
+ * is returned; the map itself is left to the caller.
+ */
+static int swap_cgroup_prepare(int type)
+{
+	struct swap_cgroup_ctrl *ctrl;
+	unsigned long nr, done;
+
+	if (!do_swap_account)
+		return 0;
+
+	ctrl = &swap_cgroup_ctrl[type];
+	for (nr = 0; nr < ctrl->length; nr++) {
+		struct page *page;
+
+		page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+		if (!page)
+			goto unwind;
+		ctrl->map[nr] = page;
+	}
+	return 0;
+
+unwind:
+	/* only slots [0, nr) were populated before the failure */
+	for (done = 0; done < nr; done++)
+		__free_page(ctrl->map[done]);
+
+	return -ENOMEM;
+}
+
+/**
+ * swap_cgroup_record - record mem_cgroup for this swp_entry.
+ * @ent: swap entry to be recorded into
+ * @mem: mem_cgroup to be recorded
+ *
+ * Returns the value that was recorded before.  NULL is returned when swap
+ * accounting is disabled or when nothing had been recorded for @ent.
+ */
+struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
+{
+	unsigned long flags;
+	int type = swp_type(ent);
+	unsigned long offset = swp_offset(ent);
+	/* which page of the map, and which record inside that page */
+	unsigned long idx = offset / SC_PER_PAGE;
+	unsigned long pos = offset & SC_POS_MASK;
+	struct swap_cgroup_ctrl *ctrl;
+	struct page *mappage;
+	struct swap_cgroup *base;
+	struct mem_cgroup *old;
+
+	if (!do_swap_account)
+		return NULL;
+
+	ctrl = &swap_cgroup_ctrl[type];
+	mappage = ctrl->map[idx];
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+	/* the map pages may be highmem, so map the page temporarily */
+	base = kmap_atomic(mappage, KM_USER0);
+	old = base[pos].val;
+	base[pos].val = mem;
+	/* kunmap_atomic() takes the mapped address, not the struct page */
+	kunmap_atomic(base, KM_USER0);
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+	return old;
+}
+
+/**
+ * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
+ * @ent: swap entry to be looked up.
+ *
+ * Returns the mem_cgroup pointer recorded for @ent.  NULL is returned when
+ * swap accounting is disabled or when nothing is recorded for @ent.
+ */
+struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
+{
+	int type = swp_type(ent);
+	unsigned long flags;
+	unsigned long offset = swp_offset(ent);
+	/* which page of the map, and which record inside that page */
+	unsigned long idx = offset / SC_PER_PAGE;
+	unsigned long pos = offset & SC_POS_MASK;
+	struct swap_cgroup_ctrl *ctrl;
+	struct page *mappage;
+	struct swap_cgroup *base;
+	struct mem_cgroup *ret;
+
+	if (!do_swap_account)
+		return NULL;
+
+	ctrl = &swap_cgroup_ctrl[type];
+	mappage = ctrl->map[idx];
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+	/* the map pages may be highmem, so map the page temporarily */
+	base = kmap_atomic(mappage, KM_USER0);
+	ret = base[pos].val;
+	/* kunmap_atomic() takes the mapped address, not the struct page */
+	kunmap_atomic(base, KM_USER0);
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+	return ret;
+}
+
+/**
+ * swap_cgroup_swapon - allocate the record table of a swap type
+ * @type: swap type that is being enabled
+ * @max_pages: number of swap entries the table has to cover
+ *
+ * Allocates the page pointer array with vmalloc() and populates it through
+ * swap_cgroup_prepare().  Returns 0 on success or when swap accounting is
+ * disabled, -ENOMEM when the memory cannot be allocated.
+ */
+int swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+	void *array;
+	unsigned long array_size;
+	unsigned long length;
+	struct swap_cgroup_ctrl *ctrl;
+
+	if (!do_swap_account)
+		return 0;
+
+	/* one pointer per page of records */
+	length = ((max_pages/SC_PER_PAGE) + 1);
+	array_size = length * sizeof(void *);
+
+	array = vmalloc(array_size);
+	if (!array)
+		goto nomem;
+
+	memset(array, 0, array_size);
+	ctrl = &swap_cgroup_ctrl[type];
+	mutex_lock(&swap_cgroup_mutex);
+	ctrl->length = length;
+	ctrl->map = array;
+	if (swap_cgroup_prepare(type)) {
+		/* memory shortage */
+		ctrl->map = NULL;
+		ctrl->length = 0;
+		vfree(array);
+		mutex_unlock(&swap_cgroup_mutex);
+		goto nomem;
+	}
+	mutex_unlock(&swap_cgroup_mutex);
+
+	/* both values are unsigned long, hence %lu */
+	printk(KERN_INFO
+		"swap_cgroup: uses %lu bytes vmalloc and %lu bytes buffers\n",
+		array_size, length * PAGE_SIZE);
+	printk(KERN_INFO
+		"swap_cgroup can be disabled by noswapaccount boot option.\n");
+
+	return 0;
+nomem:
+	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
+	printk(KERN_INFO
+		"swap_cgroup can be disabled by noswapaccount boot option\n");
+	return -ENOMEM;
+}
+
+/**
+ * swap_cgroup_swapoff - free the record table of a swap type
+ * @type: swap type that is being disabled
+ *
+ * Frees every page of the map and the map itself, then resets the control
+ * structure.  Does nothing when swap accounting is disabled.
+ */
+void swap_cgroup_swapoff(int type)
+{
+	/* same type as ctrl->length, so the comparison below is not mixed-sign */
+	unsigned long i;
+	struct swap_cgroup_ctrl *ctrl;
+
+	if (!do_swap_account)
+		return;
+
+	mutex_lock(&swap_cgroup_mutex);
+	ctrl = &swap_cgroup_ctrl[type];
+	for (i = 0; i < ctrl->length; i++) {
+		struct page *page = ctrl->map[i];
+		if (page)
+			__free_page(page);
+	}
+	vfree(ctrl->map);
+	ctrl->map = NULL;
+	ctrl->length = 0;
+	mutex_unlock(&swap_cgroup_mutex);
+}
+
+/* Initialise the spinlock of every slot of swap_cgroup_ctrl[]. */
+static int __init swap_cgroup_init(void)
+{
+	struct swap_cgroup_ctrl *ctrl;
+	struct swap_cgroup_ctrl *end = swap_cgroup_ctrl + MAX_SWAPFILES;
+
+	for (ctrl = swap_cgroup_ctrl; ctrl != end; ctrl++)
+		spin_lock_init(&ctrl->lock);
+
+	return 0;
+}
+late_initcall(swap_cgroup_init);
+#endif
Index: mmotm-2.6.27+/include/linux/page_cgroup.h
===================================================================
--- mmotm-2.6.27+.orig/include/linux/page_cgroup.h
+++ mmotm-2.6.27+/include/linux/page_cgroup.h
@@ -110,4 +110,38 @@ static inline void page_cgroup_init(void
}
#endif
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#include <linux/swap.h>
+/* Per swap entry mem_cgroup records; defined in mm/page_cgroup.c. */
+extern struct mem_cgroup *
+swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem);
+extern struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent);
+/* called from swapon and swapoff to set up and tear down the records */
+extern int swap_cgroup_swapon(int type, unsigned long max_pages);
+extern void swap_cgroup_swapoff(int type);
+#else
+#include <linux/swap.h>
+
+/* Stub for !CONFIG_CGROUP_MEM_RES_CTLR_SWAP: records nothing, returns NULL. */
+static inline struct mem_cgroup *
+swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
+{
+	return NULL;
+}
+
+/* Stub for !CONFIG_CGROUP_MEM_RES_CTLR_SWAP: there is never a record. */
+static inline struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
+{
+	return NULL;
+}
+
+/* Stub for !CONFIG_CGROUP_MEM_RES_CTLR_SWAP: nothing to allocate, always 0. */
+static inline int
+swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+	return 0;
+}
+
+/* Stub for !CONFIG_CGROUP_MEM_RES_CTLR_SWAP: nothing to free. */
+static inline void swap_cgroup_swapoff(int type)
+{
+}
+
+#endif
#endif
Index: mmotm-2.6.27+/mm/swapfile.c
===================================================================
--- mmotm-2.6.27+.orig/mm/swapfile.c
+++ mmotm-2.6.27+/mm/swapfile.c
@@ -32,6 +32,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
+#include <linux/page_cgroup.h>
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -1345,6 +1346,9 @@ asmlinkage long sys_swapoff(const char _
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+ /* Destroy swap account information */
+ swap_cgroup_swapoff(type);
+
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
@@ -1669,6 +1673,10 @@ asmlinkage long sys_swapon(const char __
nr_good_pages = swap_header->info.last_page -
swap_header->info.nr_badpages -
1 /* header page */;
+
+ if (!error)
+ error = swap_cgroup_swapon(type, maxpages);
+
if (error)
goto bad_swap;
}
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2008-10-23 9:14 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-10-23 8:58 [RFC][PATCH 0/11] memcg updates / clean up, lazy lru ,mem+swap controller KAMEZAWA Hiroyuki
2008-10-23 8:59 ` [RFC][PATCH 1/11] memcg: fix kconfig menu comment KAMEZAWA Hiroyuki
2008-10-24 4:24 ` Randy Dunlap
2008-10-24 4:28 ` KAMEZAWA Hiroyuki
2008-10-23 9:00 ` [RFC][PATCH 2/11] cgroup: make cgroup kconfig as submenu KAMEZAWA Hiroyuki
2008-10-23 21:20 ` Paul Menage
2008-10-24 1:16 ` KAMEZAWA Hiroyuki
2008-10-23 9:02 ` [RFC][PATCH 3/11] memcg: charge commit cancel protocol KAMEZAWA Hiroyuki
2008-10-23 9:03 ` [RFC][PATCH 4/11] memcg: better page migration handling KAMEZAWA Hiroyuki
2008-10-23 9:05 ` [RFC][PATCH 5/11] memcg: account move and change force_empty KAMEZAWA Hiroyuki
2008-10-24 4:28 ` Randy Dunlap
2008-10-24 4:37 ` KAMEZAWA Hiroyuki
2008-10-23 9:06 ` [RFC][PATCH 6/11] memcg: lary LRU removal KAMEZAWA Hiroyuki
2008-10-23 9:08 ` [RFC][PATCH 7/11] memcg: lazy lru add KAMEZAWA Hiroyuki
2008-10-23 9:10 ` [RFC][PATCH 8/11] memcg: shmem account helper KAMEZAWA Hiroyuki
2008-10-23 9:12 ` [RFC][PATCH 9/11] memcg : mem+swap controlelr kconfig KAMEZAWA Hiroyuki
2008-10-24 4:32 ` Randy Dunlap
2008-10-24 4:37 ` KAMEZAWA Hiroyuki
2008-10-27 6:39 ` Daisuke Nishimura
2008-10-27 7:17 ` Li Zefan
2008-10-27 7:24 ` Daisuke Nishimura
2008-10-28 0:08 ` KAMEZAWA Hiroyuki
2008-10-23 9:13 ` KAMEZAWA Hiroyuki [this message]
2008-10-27 7:02 ` [RFC][PATCH 10/11] memcg: swap cgroup Daisuke Nishimura
2008-10-28 0:09 ` KAMEZAWA Hiroyuki
2008-10-23 9:16 ` [RFC][PATCH 11/11] memcg: mem+swap controler core KAMEZAWA Hiroyuki
2008-10-27 11:37 ` Daisuke Nishimura
2008-10-28 0:16 ` KAMEZAWA Hiroyuki
2008-10-28 2:06 ` Daisuke Nishimura
2008-10-28 2:30 ` KAMEZAWA Hiroyuki
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20081023181349.63096aeb.kamezawa.hiroyu@jp.fujitsu.com \
--to=kamezawa.hiroyu@jp.fujitsu.com \
--cc=balbir@linux.vnet.ibm.com \
--cc=linux-mm@kvack.org \
--cc=menage@google.com \
--cc=nishimura@mxp.nes.nec.co.jp \
--cc=xemul@openvz.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox