From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx196.postini.com [74.125.245.196]) by kanga.kvack.org (Postfix) with SMTP id B76606B0044 for ; Thu, 23 Aug 2012 19:07:40 -0400 (EDT) Received: by lbon3 with SMTP id n3so927349lbo.14 for ; Thu, 23 Aug 2012 16:07:38 -0700 (PDT) MIME-Version: 1.0 In-Reply-To: <20120816113805.5ae65af0@cuia.bos.redhat.com> References: <20120816113450.52f4e633@cuia.bos.redhat.com> <20120816113805.5ae65af0@cuia.bos.redhat.com> Date: Thu, 23 Aug 2012 16:07:38 -0700 Message-ID: Subject: Re: [RFC][PATCH -mm -v2 4/4] mm,vmscan: evict inactive file pages first From: Ying Han Content-Type: multipart/alternative; boundary=f46d0435c1d2f953aa04c7f6ece9 Sender: owner-linux-mm@kvack.org List-ID: To: Rik van Riel Cc: linux-mm@kvack.org, aquini@redhat.com, hannes@cmpxchg.org, mhocko@suse.cz, Mel Gorman --f46d0435c1d2f953aa04c7f6ece9 Content-Type: text/plain; charset=ISO-8859-1 On Thu, Aug 16, 2012 at 8:38 AM, Rik van Riel wrote: > When a lot of streaming file IO is happening, it makes sense to > evict just the inactive file pages and leave the other LRU lists > alone. > > Likewise, when driving a cgroup hierarchy into its hard limit, > or over its soft limit, it makes sense to pick a child cgroup > that has lots of inactive file pages, and evict those first. > > Being over its soft limit is considered a stronger preference > than just having a lot of inactive file pages, so a well behaved > cgroup is allowed to keep its file cache when there is a "badly > behaving" one in the same hierarchy. > > Signed-off-by: Rik van Riel > --- > mm/vmscan.c | 37 +++++++++++++++++++++++++++++++++---- > 1 files changed, 33 insertions(+), 4 deletions(-) > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 769fdcd..2884b4f 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -1576,6 +1576,19 @@ static int inactive_list_is_low(struct lruvec > *lruvec, enum lru_list lru) > return inactive_anon_is_low(lruvec); > } > > +/* If this lruvec has lots of inactive file pages, reclaim those only. */ > +static bool reclaim_file_only(struct lruvec *lruvec, struct scan_control > *sc, > + unsigned long anon, unsigned long file) > +{ > + if (inactive_file_is_low(lruvec)) > + return false; > + > + if (file > (anon + file) >> sc->priority) > + return true; > + > + return false; > +} > + > static unsigned long shrink_list(enum lru_list lru, unsigned long > nr_to_scan, > struct lruvec *lruvec, struct > scan_control *sc) > { > @@ -1658,6 +1671,14 @@ static void get_scan_count(struct lruvec *lruvec, > struct scan_control *sc, > } > } > > + /* Lots of inactive file pages? Reclaim those only. */ > + if (reclaim_file_only(lruvec, sc, anon, file)) { > + fraction[0] = 0; > + fraction[1] = 1; > + denominator = 1; > + goto out; > + } > + > /* > * With swappiness at 100, anonymous and file have the same > priority. > * This scanning priority is essentially the inverse of IO cost. > @@ -1922,8 +1943,8 @@ static void age_recent_pressure(struct lruvec > *lruvec, struct zone *zone) > * should always be larger than recent_rotated, and the size should > * always be larger than recent_pressure. > */ > -static u64 reclaim_score(struct mem_cgroup *memcg, > - struct lruvec *lruvec) > +static u64 reclaim_score(struct mem_cgroup *memcg, struct lruvec *lruvec, > + struct scan_control *sc) > { > struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; > u64 anon, file; > @@ -1949,6 +1970,14 @@ static u64 reclaim_score(struct mem_cgroup *memcg, > anon *= 10000; > } > > + /* > + * Prefer reclaiming from an lruvec with lots of inactive file > + * pages. Once those have been reclaimed, the score will drop so > + * far we will pick another lruvec to reclaim from. > + */ > + if (reclaim_file_only(lruvec, sc, anon, file)) > + file *= 100; > + > return max(anon, file); > } > > @@ -1977,7 +2006,7 @@ static void shrink_zone(struct zone *zone, struct > scan_control *sc) > > age_recent_pressure(lruvec, zone); > > - score = reclaim_score(memcg, lruvec); > + score = reclaim_score(memcg, lruvec, sc); > > /* Pick the lruvec with the highest score. */ > if (score > max_score) { > @@ -2002,7 +2031,7 @@ static void shrink_zone(struct zone *zone, struct > scan_control *sc) > */ > do { > shrink_lruvec(victim_lruvec, sc); > - score = reclaim_score(memcg, victim_lruvec); > + score = reclaim_score(memcg, victim_lruvec, sc); > I wonder if you meant s/memcg/victim_memcg here. --Ying > } while (sc->nr_to_reclaim > 0 && score > max_score / 2); > > mem_cgroup_put(victim_memcg); > > --f46d0435c1d2f953aa04c7f6ece9 Content-Type: text/html; charset=ISO-8859-1 Content-Transfer-Encoding: quoted-printable

On Thu, Aug 16, 2012 at 8:38 AM, Rik van= Riel <riel@redhat.com> wrote:
When a lot of streaming file IO is happening, it makes sense to
evict just the inactive file pages and leave the other LRU lists
alone.

Likewise, when driving a cgroup hierarchy into its hard limit,
or over its soft limit, it makes sense to pick a child cgroup
that has lots of inactive file pages, and evict those first.

Being over its soft limit is considered a stronger preference
than just having a lot of inactive file pages, so a well behaved
cgroup is allowed to keep its file cache when there is a "badly
behaving" one in the same hierarchy.

Signed-off-by: Rik van Riel <riel@red= hat.com>
---
=A0mm/vmscan.c | =A0 37 +++++++++++++++++++++++++++++++++----
=A01 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 769fdcd..2884b4f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1576,6 +1576,19 @@ static int inactive_list_is_low(struct lruvec *lruve= c, enum lru_list lru)
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return inactive_anon_is_low(lruvec);
=A0}

+/* If this lruvec has lots of inactive file pages, reclaim those only. */<= br> +static bool reclaim_file_only(struct lruvec *lruvec, struct scan_control *= sc,
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 unsigned long ano= n, unsigned long file)
+{
+ =A0 =A0 =A0 if (inactive_file_is_low(lruvec))
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 return false;
+
+ =A0 =A0 =A0 if (file > (anon + file) >> sc->priority)
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 return true;
+
+ =A0 =A0 =A0 return false;
+}
+
=A0static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_= scan,
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0struct l= ruvec *lruvec, struct scan_control *sc)
=A0{
@@ -1658,6 +1671,14 @@ static void get_scan_count(struct lruvec *lruvec, st= ruct scan_control *sc,
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
=A0 =A0 =A0 =A0 }

+ =A0 =A0 =A0 /* Lots of inactive file pages? Reclaim those only. */
+ =A0 =A0 =A0 if (reclaim_file_only(lruvec, sc, anon, file)) {
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 fraction[0] =3D 0;
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 fraction[1] =3D 1;
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 denominator =3D 1;
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 goto out;
+ =A0 =A0 =A0 }
+
=A0 =A0 =A0 =A0 /*
=A0 =A0 =A0 =A0 =A0* With swappiness at 100, anonymous and file have the sa= me priority.
=A0 =A0 =A0 =A0 =A0* This scanning priority is essentially the inverse of I= O cost.
@@ -1922,8 +1943,8 @@ static void age_recent_pressure(struct lruvec *lruvec= , struct zone *zone)
=A0 * should always be larger than recent_rotated, and the size should
=A0 * always be larger than recent_pressure.
=A0 */
-static u64 reclaim_score(struct mem_cgroup *memcg,
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0struct lruvec *lruvec)
+static u64 reclaim_score(struct mem_cgroup *memcg, struct lruvec *lruvec,<= br> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0struct scan_control *sc) =A0{
=A0 =A0 =A0 =A0 struct zone_reclaim_stat *reclaim_stat =3D &lruvec->= reclaim_stat;
=A0 =A0 =A0 =A0 u64 anon, file;
@@ -1949,6 +1970,14 @@ static u64 reclaim_score(struct mem_cgroup *memcg, =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 anon *=3D 10000;
=A0 =A0 =A0 =A0 }

+ =A0 =A0 =A0 /*
+ =A0 =A0 =A0 =A0* Prefer reclaiming from an lruvec with lots of inactive f= ile
+ =A0 =A0 =A0 =A0* pages. Once those have been reclaimed, the score will dr= op so
+ =A0 =A0 =A0 =A0* far we will pick another lruvec to reclaim from.
+ =A0 =A0 =A0 =A0*/
+ =A0 =A0 =A0 if (reclaim_file_only(lruvec, sc, anon, file))
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 file *=3D 100;
+
=A0 =A0 =A0 =A0 return max(anon, file);
=A0}

@@ -1977,7 +2006,7 @@ static void shrink_zone(struct zone *zone, struct sca= n_control *sc)

=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 age_recent_pressure(lruvec, zone);

- =A0 =A0 =A0 =A0 =A0 =A0 =A0 score =3D reclaim_score(memcg, lruvec);
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 score =3D reclaim_score(memcg, lruvec, sc);
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 /* Pick the lruvec with the highest score. = */
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (score > max_score) {
@@ -2002,7 +2031,7 @@ static void shrink_zone(struct zone *zone, struct sca= n_control *sc)
=A0 =A0 =A0 =A0 =A0*/
=A0 =A0 =A0 =A0 do {
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 shrink_lruvec(victim_lruvec, sc);
- =A0 =A0 =A0 =A0 =A0 =A0 =A0 score =3D reclaim_score(memcg, victim_lruvec)= ;
+ =A0 =A0 =A0 =A0 =A0 =A0 =A0 score =3D reclaim_score(memcg, victim_lruvec,= sc);

I wonder if you meant s/memcg/vic= tim_memcg here.

--Ying
=A0
=A0 =A0 =A0 =A0 } while (sc->nr_to_reclaim > 0 && score > = max_score / 2);

=A0 =A0 =A0 =A0 mem_cgroup_put(victim_memcg);


--f46d0435c1d2f953aa04c7f6ece9-- -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org