> We do have per-memcg targeted reclaim, including per-memcg background reclaim
> and soft_limit reclaim. Both of them need improvement, but regardless we still
> need this patch, since keeping memcg pages on the global LRU breaks isolation.
>
> Besides, here is the to-do list I have for memcg page reclaim, sorted by
> priority:
> a) per-memcg background reclaim, to reclaim pages proactively.
> b) skip global LRU reclaim when soft_limit reclaim does enough work; this
> applies to both global background reclaim and global ttfp (try_to_free_pages)
> reclaim.
> c) make soft_limit reclaim more efficient.
> d) isolate memcg pages from the global LRU list, since sharing it breaks
> memory isolation.
>
> I have done some basic testing on this patch; more tests are definitely needed:
>
> Functional:
> Two memcgs under root: cgroup-A reads a 20G file with a 2G limit, while
> cgroup-B runs random workloads with a 500M limit. Check the counters for the
> per-memcg LRUs and the global LRU; they should add up (see the cross-check
> after the listings below).
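>
> A minimal sketch of that setup (assuming a v1 memory controller mounted at
> /dev/cgroup, matching the paths below; "20g-file" is a placeholder for any
> 20G test file):
>
> $ mkdir /dev/cgroup/A /dev/cgroup/B
> $ echo 2g > /dev/cgroup/A/memory.limit_in_bytes
> $ echo 500m > /dev/cgroup/B/memory.limit_in_bytes
> # $$ inside sh -c is that shell's pid, so the reader runs charged to A
> $ sh -c 'echo $$ > /dev/cgroup/A/tasks; dd if=20g-file of=/dev/null bs=1M' &
> # likewise start the random workload from a shell moved into B/tasks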
>
> 1) total file pages
> $ cat /proc/meminfo | grep Cache
> Cached: 6032128 kB
>
> 2) file lru on global lru
> $ cat /proc/vmstat | grep file
> nr_inactive_file 0
> nr_active_file 963131
>
> 3) file lru on root cgroup
> $ cat /dev/cgroup/memory.stat | grep file
> inactive_file 0
> active_file 0
>
> 4) file lru on cgroup-A
> $ cat /dev/cgroup/A/memory.stat | grep file
> inactive_file 2145759232
> active_file 0
>
> 5) file lru on cgroup-B
> $ cat /dev/cgroup/B/memory.stat | grep file
> inactive_file 401408
> active_file 143360
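>
> A quick cross-check that these add up (assuming 4kB pages; memory.stat
> reports bytes while /proc/vmstat reports pages, and the root counters
> above are zero so summing A and B suffices):
>
> $ file_pages=$(awk '/^nr_(in)?active_file/ {n += $2} END {print n}' /proc/vmstat)
> $ memcg_bytes=$(awk '/^(in)?active_file/ {n += $2} END {print n}' \
>       /dev/cgroup/A/memory.stat /dev/cgroup/B/memory.stat)
> $ echo $((file_pages * 4 + memcg_bytes / 1024)) kB  # compare against Cached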
>
> Performance:
> Ran the page fault test (pft) with 16 threads, faulting in 15G of anon pages
> in a 16G cgroup. No regression was observed in "flt/cpu/s":
>
> +-------------------------------------------------------------------------+
>     N           Min           Max        Median           Avg        Stddev
> x  10     16682.962     17344.027     16913.524     16928.812      166.5362
> +   9     16455.468     16961.779     16867.569      16802.83     157.43279
> No difference proven at 95.0% confidence
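>
> For reference, the table above looks like ministat(1) output; assuming the
> per-run "flt/cpu/s" values are saved one per line (base.txt and patched.txt
> are placeholder names), it can be regenerated with:
>
> $ ministat -c 95 base.txt patched.txt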
>
> Signed-off-by: Ying Han <yinghan@google.com>
> ---
> include/linux/memcontrol.h | 17 ++++++-----
> include/linux/mm_inline.h | 24 ++++++++++------
> include/linux/page_cgroup.h | 1 -
> mm/memcontrol.c | 60 +++++++++++++++++++++---------------------
> mm/page_cgroup.c | 1 -
> mm/swap.c | 12 +++++++-
> mm/vmscan.c | 22 +++++++++++----
> 7 files changed, 80 insertions(+), 57 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 5a5ce70..587a41e 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -60,11 +60,11 @@ extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr);
>
> extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
> gfp_t gfp_mask);
> -extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
> -extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
> +extern bool mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
> +extern bool mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
> extern void mem_cgroup_rotate_reclaimable_page(struct page *page);
> extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
> -extern void mem_cgroup_del_lru(struct page *page);
> +extern bool mem_cgroup_del_lru(struct page *page);
> extern void mem_cgroup_move_lists(struct page *page,
> enum lru_list from, enum lru_list to);
>
> @@ -207,13 +207,14 @@ static inline int mem_cgroup_shmem_charge_fallback(struct page *page,
> return 0;
> }
>
> -static inline void mem_cgroup_add_lru_list(struct page *page, int lru)
> +static inline bool mem_cgroup_add_lru_list(struct page *page, int lru)
> {
> + return false;
> }
>
> -static inline void mem_cgroup_del_lru_list(struct page *page, int lru)
> +static inline bool mem_cgroup_del_lru_list(struct page *page, int lru)
> {
> - return ;
> + return false;
> }
>
> static inline void mem_cgroup_rotate_reclaimable_page(struct page *page)
> @@ -226,9 +227,9 @@ static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru)
> return ;
> }
>
> -static inline void mem_cgroup_del_lru(struct page *page)
> +static inline bool mem_cgroup_del_lru(struct page *page)
> {
> - return ;
> + return false;
> }
>
> static inline void
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index 8f7d247..f55b311 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -25,9 +25,11 @@ static inline void
> __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l,
> struct list_head *head)
> {
> - list_add(&page->lru, head);
> - __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page));
> - mem_cgroup_add_lru_list(page, l);
> + if (!mem_cgroup_add_lru_list(page, l)) {
> + list_add(&page->lru, head);
> + __mod_zone_page_state(zone, NR_LRU_BASE + l,
> + hpage_nr_pages(page));
> + }
> }
>
> static inline void
> @@ -39,9 +41,11 @@ add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
> static inline void
> del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
> {
> - list_del(&page->lru);
> - __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
> - mem_cgroup_del_lru_list(page, l);
> + if (!mem_cgroup_del_lru_list(page, l)) {
> + list_del(&page->lru);
> + __mod_zone_page_state(zone, NR_LRU_BASE + l,
> + -hpage_nr_pages(page));
> + }
> }
>
> /**
> @@ -64,7 +68,6 @@ del_page_from_lru(struct zone *zone, struct page *page)
> {
> enum lru_list l;
>
> - list_del(&page->lru);
> if (PageUnevictable(page)) {
> __ClearPageUnevictable(page);
> l = LRU_UNEVICTABLE;
> @@ -75,8 +78,11 @@ del_page_from_lru(struct zone *zone, struct page *page)
> l += LRU_ACTIVE;
> }
> }
> - __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
> - mem_cgroup_del_lru_list(page, l);
> + if (!mem_cgroup_del_lru_list(page, l)) {
> + __mod_zone_page_state(zone, NR_LRU_BASE + l,
> + -hpage_nr_pages(page));
> + list_del(&page->lru);
> + }
> }
>
> /**
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index f5de21d..7b2567b 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -31,7 +31,6 @@ enum {
> struct page_cgroup {
> unsigned long flags;
> struct mem_cgroup *mem_cgroup;
> - struct list_head lru; /* per cgroup LRU list */
> };
>
> void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 4407dd0..9079e2e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -827,17 +827,17 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
> * When moving account, the page is not on LRU. It's isolated.
> */
>
> -void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
> +bool mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
> {
> struct page_cgroup *pc;
> struct mem_cgroup_per_zone *mz;
>
> if (mem_cgroup_disabled())
> - return;
> + return false;
> pc = lookup_page_cgroup(page);
> /* can happen while we handle swapcache. */
> if (!TestClearPageCgroupAcctLRU(pc))
> - return;
> + return false;
> VM_BUG_ON(!pc->mem_cgroup);
> /*
> * We don't check PCG_USED bit. It's cleared when the "page" is finally
> @@ -845,16 +845,16 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
> */
> mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
> /* huge page split is done under lru_lock. so, we have no races. */
> - MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
> if (mem_cgroup_is_root(pc->mem_cgroup))
> - return;
> - VM_BUG_ON(list_empty(&pc->lru));
> - list_del_init(&pc->lru);
> + return false;
> + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
> + list_del_init(&page->lru);
> + return true;
> }
>
> -void mem_cgroup_del_lru(struct page *page)
> +bool mem_cgroup_del_lru(struct page *page)
> {
> - mem_cgroup_del_lru_list(page, page_lru(page));
> + return mem_cgroup_del_lru_list(page, page_lru(page));
> }
>
> /*
> @@ -880,7 +880,7 @@ void mem_cgroup_rotate_reclaimable_page(struct page *page)
> if (mem_cgroup_is_root(pc->mem_cgroup))
> return;
> mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
> - list_move_tail(&pc->lru, &mz->lists[lru]);
> + list_move_tail(&page->lru, &mz->lists[lru]);
> }
>
> void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
> @@ -900,29 +900,30 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
> if (mem_cgroup_is_root(pc->mem_cgroup))
> return;
> mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
> - list_move(&pc->lru, &mz->lists[lru]);
> + list_move(&page->lru, &mz->lists[lru]);
> }
>
> -void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
> +bool mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
> {
> struct page_cgroup *pc;
> struct mem_cgroup_per_zone *mz;
>
> if (mem_cgroup_disabled())
> - return;
> + return false;
> pc = lookup_page_cgroup(page);
> VM_BUG_ON(PageCgroupAcctLRU(pc));
> if (!PageCgroupUsed(pc))
> - return;
> + return false;
> /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
> smp_rmb();
> mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
> /* huge page split is done under lru_lock. so, we have no races. */
> - MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
> SetPageCgroupAcctLRU(pc);
> if (mem_cgroup_is_root(pc->mem_cgroup))
> - return;
> - list_add(&pc->lru, &mz->lists[lru]);
> + return false;
> + MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
> + list_add(&page->lru, &mz->lists[lru]);
> + return true;
> }
>
> /*
> @@ -1111,11 +1112,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> int active, int file)
> {
> unsigned long nr_taken = 0;
> - struct page *page;
> + struct page *page, *tmp;
> unsigned long scan;
> LIST_HEAD(pc_list);
> struct list_head *src;
> - struct page_cgroup *pc, *tmp;
> + struct page_cgroup *pc;
> int nid = zone_to_nid(z);
> int zid = zone_idx(z);
> struct mem_cgroup_per_zone *mz;
> @@ -1127,24 +1128,24 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> src = &mz->lists[lru];
>
> scan = 0;
> - list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
> + list_for_each_entry_safe_reverse(page, tmp, src, lru) {
> + pc = lookup_page_cgroup(page);
> if (scan >= nr_to_scan)
> break;
>
> if (unlikely(!PageCgroupUsed(pc)))
> continue;
> -
> - page = lookup_cgroup_page(pc);
> -
> if (unlikely(!PageLRU(page)))
> continue;
>
> + BUG_ON(!PageCgroupAcctLRU(pc));
> +
> scan++;
> ret = __isolate_lru_page(page, mode, file);
> switch (ret) {
> case 0:
> - list_move(&page->lru, dst);
> mem_cgroup_del_lru(page);
> + list_add(&page->lru, dst);
> nr_taken += hpage_nr_pages(page);
> break;
> case -EBUSY:
> @@ -3386,6 +3387,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
> struct page_cgroup *pc, *busy;
> unsigned long flags, loop;
> struct list_head *list;
> + struct page *page;
> int ret = 0;
>
> zone = &NODE_DATA(node)->node_zones[zid];
> @@ -3397,25 +3399,23 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
> loop += 256;
> busy = NULL;
> while (loop--) {
> - struct page *page;
> -
> ret = 0;
> spin_lock_irqsave(&zone->lru_lock, flags);
> if (list_empty(list)) {
> spin_unlock_irqrestore(&zone->lru_lock, flags);
> break;
> }
> - pc = list_entry(list->prev, struct page_cgroup, lru);
> + page = list_entry(list->prev, struct page, lru);
> + pc = lookup_page_cgroup(page);
> if (busy == pc) {
> - list_move(&pc->lru, list);
> + /* XXX what should we do here? */
> + list_move(&page->lru, list);
> busy = NULL;
> spin_unlock_irqrestore(&zone->lru_lock, flags);
> continue;
> }
> spin_unlock_irqrestore(&zone->lru_lock, flags);
>
> - page = lookup_cgroup_page(pc);
> -
> ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
> if (ret == -ENOMEM)
> break;
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 885b2ac..b812bf3 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -16,7 +16,6 @@ static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
> pc->flags = 0;
> set_page_cgroup_array_id(pc, id);
> pc->mem_cgroup = NULL;
> - INIT_LIST_HEAD(&pc->lru);
> }
> static unsigned long total_usage;
>
> diff --git a/mm/swap.c b/mm/swap.c
> index 0a33714..9cb95c5 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -31,6 +31,7 @@
> #include <linux/backing-dev.h>
> #include <linux/memcontrol.h>
> #include <linux/gfp.h>
> +#include <linux/page_cgroup.h>
>
> #include "internal.h"
>
> @@ -200,10 +201,17 @@ static void pagevec_move_tail(struct pagevec *pvec)
> spin_lock(&zone->lru_lock);
> }
> if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
> + struct page_cgroup *pc;
> enum lru_list lru = page_lru_base_type(page);
> - list_move_tail(&page->lru, &zone->lru[lru].list);
> +
> mem_cgroup_rotate_reclaimable_page(page);
> - pgmoved++;
> + pc = lookup_page_cgroup(page);
> + smp_rmb();
> + if (!PageCgroupAcctLRU(pc)) {
> + list_move_tail(&page->lru,
> + &zone->lru[lru].list);
> + pgmoved++;
> + }
> }
> }
> if (zone)
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 060e4c1..5e54611 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -41,6 +41,7 @@
> #include <linux/memcontrol.h>
> #include <linux/delayacct.h>
> #include <linux/sysctl.h>
> +#include <linux/page_cgroup.h>
>
> #include <asm/tlbflush.h>
> #include <asm/div64.h>
> @@ -1042,15 +1043,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
>
> switch (__isolate_lru_page(page, mode, file)) {
> case 0:
> - list_move(&page->lru, dst);
> + /* verify that it's not on any cgroup */
> mem_cgroup_del_lru(page);
> + list_move(&page->lru, dst);
> nr_taken += hpage_nr_pages(page);
> break;
>
> case -EBUSY:
> /* else it is being freed elsewhere */
> - list_move(&page->lru, src);
> mem_cgroup_rotate_lru_list(page, page_lru(page));
> + list_move(&page->lru, src);
> continue;
>
> default:
> @@ -1100,8 +1102,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> break;
>
> if (__isolate_lru_page(cursor_page, mode, file) == 0) {
> - list_move(&cursor_page->lru, dst);
> + /* verify that it's not on any cgroup */
> mem_cgroup_del_lru(cursor_page);
> + list_move(&cursor_page->lru, dst);
> nr_taken += hpage_nr_pages(page);
> nr_lumpy_taken++;
> if (PageDirty(cursor_page))
> @@ -1473,6 +1476,7 @@ static void move_active_pages_to_lru(struct zone *zone,
> unsigned long pgmoved = 0;
> struct pagevec pvec;
> struct page *page;
> + struct page_cgroup *pc;
>
> pagevec_init(&pvec, 1);
>
> @@ -1482,9 +1486,15 @@ static void move_active_pages_to_lru(struct zone *zone,
> VM_BUG_ON(PageLRU(page));
> SetPageLRU(page);
>
> - list_move(&page->lru, &zone->lru[lru].list);
> - mem_cgroup_add_lru_list(page, lru);
> - pgmoved += hpage_nr_pages(page);
> + pc = lookup_page_cgroup(page);
> + smp_rmb();
> + if (!PageCgroupAcctLRU(pc)) {
> + list_move(&page->lru, &zone->lru[lru].list);
> + pgmoved += hpage_nr_pages(page);
> + } else {
> + list_del_init(&page->lru);
> + mem_cgroup_add_lru_list(page, lru);
> + }
>
> if (!pagevec_add(&pvec, page) || list_empty(list)) {
> spin_unlock_irq(&zone->lru_lock);
> --
> 1.7.3.1
>