Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Zhaoyang Huang <huangzhaoyang@gmail.com>
To: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	 Vlastimil Babka <vbabka@suse.cz>,
	Joonsoo Kim <iamjoonsoo.kim@lge.com>,
	 David Rientjes <rientjes@google.com>,
	Zhaoyang Huang <zhaoyang.huang@unisoc.com>,
	 Roman Gushchin <guro@fb.com>, Jeff Layton <jlayton@redhat.com>,
	 "open list:MEMORY MANAGEMENT" <linux-mm@kvack.org>,
	LKML <linux-kernel@vger.kernel.org>,
	 Pavel Tatashin <pasha.tatashin@soleen.com>,
	Johannes Weiner <hannes@cmpxchg.org>
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp
Date: Tue, 23 Apr 2019 19:43:18 +0800	[thread overview]
Message-ID: <CAGWkznEqhHcAb0ZnO9-ssk1qQHYFKx4ML0vd4Knj_f2n_PpR0g@mail.gmail.com> (raw)
In-Reply-To: <20190417133724.GC7751@bombadil.infradead.org>

rebase the commit to latest mainline and update the code.
@Matthew, with regarding to your comment, I would like to say the
algorithm doesn't change at all. I do NOT judge the page's activity
via an absolute time value, but still the refault distance. What I
want to fix is the scenario which drop lots of file pages on this lru
that leading to a big refault_distance(inactive_age) and inactivate
the page. I haven't found regression of the commit yet. Could you
please suggest me more test cases? Thank you!
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fba7741..ca4ced6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -242,6 +242,7 @@ struct lruvec {
  atomic_long_t inactive_age;
  /* Refaults at the time of last reclaim cycle */
  unsigned long refaults;
+ atomic_long_t refaults_ratio;
 #ifdef CONFIG_MEMCG
  struct pglist_data *pgdat;
 #endif
diff --git a/mm/workingset.c b/mm/workingset.c
index 0bedf67..95683c1 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -171,6 +171,15 @@
  1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)

+#ifdef CONFIG_64BIT
+#define EVICTION_SECS_POS_SHIFT 19
+#define EVICTION_SECS_SHRINK_SHIFT 4
+#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
+#else
+#define EVICTION_SECS_POS_SHIFT 0
+#define EVICTION_SECS_SHRINK_SHIFT 0
+#define NO_SECS_IN_WORKINGSET
+#endif
 /*
  * Eviction timestamps need to be able to cover the full range of
  * actionable refaults. However, bits are tight in the xarray
@@ -180,12 +189,48 @@
  * evictions into coarser buckets by shaving off lower timestamp bits.
  */
 static unsigned int bucket_order __read_mostly;
-
+#ifdef NO_SECS_IN_WORKINGSET
+static void pack_secs(unsigned long *peviction) { }
+static unsigned int unpack_secs(unsigned long entry) {return 0; }
+#else
+static void pack_secs(unsigned long *peviction)
+{
+ unsigned int secs;
+ unsigned long eviction;
+ int order;
+ int secs_shrink_size;
+ struct timespec64 ts;
+
+ ktime_get_boottime_ts64(&ts);
+ secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
+ order = get_count_order(secs);
+ secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
+ ? 0 : (order - EVICTION_SECS_POS_SHIFT);
+
+ eviction = *peviction;
+ eviction = (eviction << EVICTION_SECS_POS_SHIFT)
+ | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
+ eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) |
(secs_shrink_size & 0xf);
+ *peviction = eviction;
+}
+static unsigned int unpack_secs(unsigned long entry)
+{
+ unsigned int secs;
+ int secs_shrink_size;
+
+ secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
+ entry >>= EVICTION_SECS_SHRINK_SHIFT;
+ secs = entry & EVICTION_SECS_POS_MASK;
+ secs = secs << secs_shrink_size;
+ return secs;
+}
+#endif
 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
  bool workingset)
 {
  eviction >>= bucket_order;
  eviction &= EVICTION_MASK;
+ pack_secs(&eviction);
  eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
  eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
  eviction = (eviction << 1) | workingset;
@@ -194,11 +239,12 @@ static void *pack_shadow(int memcgid, pg_data_t
*pgdat, unsigned long eviction,
 }

 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
-   unsigned long *evictionp, bool *workingsetp)
+ unsigned long *evictionp, bool *workingsetp, unsigned int *prev_secs)
 {
  unsigned long entry = xa_to_value(shadow);
  int memcgid, nid;
  bool workingset;
+ unsigned int secs;

  workingset = entry & 1;
  entry >>= 1;
@@ -206,11 +252,14 @@ static void unpack_shadow(void *shadow, int
*memcgidp, pg_data_t **pgdat,
  entry >>= NODES_SHIFT;
  memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
  entry >>= MEM_CGROUP_ID_SHIFT;
+ secs = unpack_secs(entry);
+ entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT);

  *memcgidp = memcgid;
  *pgdat = NODE_DATA(nid);
  *evictionp = entry << bucket_order;
  *workingsetp = workingset;
+ *prev_secs = secs;
 }

 /**
@@ -257,8 +306,22 @@ void workingset_refault(struct page *page, void *shadow)
  unsigned long refault;
  bool workingset;
  int memcgid;
+#ifndef NO_SECS_IN_WORKINGSET
+ unsigned long avg_refault_time;
+ unsigned long refaults_ratio;
+ unsigned long refault_time;
+ int tradition;
+ unsigned int prev_secs;
+ unsigned int secs;
+#endif
+ struct timespec64 ts;
+ /*
+ convert jiffies to second
+ */
+ ktime_get_boottime_ts64(&ts);
+ secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;

- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset, &prev_secs);

  rcu_read_lock();
  /*
@@ -303,23 +366,58 @@ void workingset_refault(struct page *page, void *shadow)
  refault_distance = (refault - eviction) & EVICTION_MASK;

  inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
-
+#ifndef NO_SECS_IN_WORKINGSET
+ refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs;
+ atomic_long_set(&lruvec->refaults_ratio, refaults_ratio);
+ refault_time = secs - prev_secs;
+ avg_refault_time = active_file / refaults_ratio;
+ tradition = !!(refault_distance < active_file);
  /*
- * Compare the distance to the existing workingset size. We
- * don't act on pages that couldn't stay resident even if all
- * the memory was available to the page cache.
+ * What we are trying to solve here is
+ * 1. extremely fast refault as refault_time == 0.
+ * 2. quick file drop scenario, which has a big refault_distance but
+ *    small refault_time comparing with the past refault ratio, which
+ *    will be deemed as inactive in previous implementation.
  */
- if (refault_distance > active_file)
+ if (refault_time && (((refault_time < avg_refault_time)
+ && (avg_refault_time < 2 * refault_time))
+ || (refault_time >= avg_refault_time))) {
+ trace_printk("WKST_INACT[%d]:rft_dis %ld, act %ld\
+ rft_ratio %ld rft_time %ld avg_rft_time %ld\
+ refault %ld eviction %ld secs %d pre_secs %d page %p\n",
+ tradition, refault_distance, active_file,
+ refaults_ratio, refault_time, avg_refault_time,
+ refault, eviction, secs, prev_secs, page);
  goto out;
+ }
+ else {
+#else
+ if (refault_distance < active_file) {
+#endif

- SetPageActive(page);
- atomic_long_inc(&lruvec->inactive_age);
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't act on pages that couldn't stay resident even if all
+ * the memory was available to the page cache.
+ */

- /* Page was active prior to eviction */
- if (workingset) {
- SetPageWorkingset(page);
- inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ SetPageActive(page);
+ atomic_long_inc(&lruvec->inactive_age);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+ /* Page was active prior to eviction */
+ if (workingset) {
+ SetPageWorkingset(page);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ }
+#ifndef NO_SECS_IN_WORKINGSET
+ trace_printk("WKST_ACT[%d]:rft_dis %ld, act %ld\
+ rft_ratio %ld rft_time %ld avg_rft_time %ld\
+ refault %ld eviction %ld secs %d pre_secs %d page %p\n",
+ tradition, refault_distance, active_file,
+ refaults_ratio, refault_time, avg_refault_time,
+ refault, eviction, secs, prev_secs, page);
+#endif
  }
 out:
  rcu_read_unlock();
@@ -539,7 +637,9 @@ static int __init workingset_init(void)
  unsigned int max_order;
  int ret;

- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT
+ + EVICTION_SECS_POS_SHIFT
+ + EVICTION_SECS_SHRINK_SHIFT));
  /*
  * Calculate the eviction bucket size to cover the longest
  * actionable refault distance, which is currently half of
@@ -547,7 +647,9 @@ static int __init workingset_init(void)
  * some more pages at runtime, so keep working with up to
  * double the initial memory by using totalram_pages as-is.
  */
- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT
+ - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT;
+
  max_order = fls_long(totalram_pages() - 1);
  if (max_order > timestamp_bits)
  bucket_order = max_order - timestamp_bits;

On Wed, Apr 17, 2019 at 9:37 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Apr 17, 2019 at 08:26:22PM +0800, Zhaoyang Huang wrote:
> [quoting Johannes here]
> > As Matthew says, you are fairly randomly making refault activations
> > more aggressive (especially with that timestamp unpacking bug), and
> > while that expectedly boosts workload transition / startup, it comes
> > at the cost of disrupting stable states because you can flood a very
> > active in-ram workingset with completely cold cache pages simply
> > because they refault uniformly wrt each other.
> > [HZY]: I analysis the log got from trace_printk, what we activate have
> > proven record of long refault distance but very short refault time.
>
> You haven't addressed my point, which is that you were only testing
> workloads for which your changed algorithm would improve the results.
> What you haven't done is shown how other workloads would be negatively
> affected.
>
> Once you do that, we can make a decision about whether to improve your
> workload by X% and penalise that other workload by Y%.

next prev parent reply	other threads:[~2019-04-23 11:43 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-17  7:47 Zhaoyang Huang
2019-04-17  7:59 ` Zhaoyang Huang
2019-04-17 10:55   ` Zhaoyang Huang
2019-04-17 11:06     ` Michal Hocko
2019-04-17 11:36       ` Zhaoyang Huang
2019-04-17 11:46         ` Michal Hocko
2019-04-17 12:26           ` Zhaoyang Huang
2019-04-17 12:58             ` Michal Hocko
2019-04-17 13:37             ` Matthew Wilcox
2019-04-23 11:43               ` Zhaoyang Huang [this message]
2019-04-17  8:45 ` Michal Hocko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAGWkznEqhHcAb0ZnO9-ssk1qQHYFKx4ML0vd4Knj_f2n_PpR0g@mail.gmail.com \
    --to=huangzhaoyang@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=guro@fb.com \
    --cc=hannes@cmpxchg.org \
    --cc=iamjoonsoo.kim@lge.com \
    --cc=jlayton@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=pasha.tatashin@soleen.com \
    --cc=rientjes@google.com \
    --cc=vbabka@suse.cz \
    --cc=willy@infradead.org \
    --cc=zhaoyang.huang@unisoc.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox