* [PATCH 0/3] mm, lru_gen: batch update pages when aging
@ 2023-12-22 10:22 Kairui Song
2023-12-22 10:22 ` [PATCH 1/3] mm, lru_gen: batch update counters on aging Kairui Song
` (2 more replies)
0 siblings, 3 replies; 14+ messages in thread
From: Kairui Song @ 2023-12-22 10:22 UTC (permalink / raw)
To: linux-mm; +Cc: Andrew Morton, Yu Zhao, linux-kernel, Kairui Song
From: Kairui Song <kasong@tencent.com>
Currently, when MGLRU ages, it moves pages one by one and updates the mm
counters page by page. This is correct, but the overhead can be reduced
by batching these operations.
Batch moving also has a good effect on LRU ordering. Currently, when
MGLRU ages, it walks the LRU backward, and the protected pages are moved
to the tail of the newer gen one by one, which reverses their order in
the LRU. Moving them in batches helps keep their order, though only
within a small scope due to the scan limit of MAX_LRU_BATCH pages.
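For illustration only (this is not the kernel code; the toy list type and
helpers below are made up for the example), here is a minimal user-space
sketch of why moving pages one by one to the tail of the destination
reverses their relative order, while splicing the whole scanned run at
once keeps it:

/* Toy doubly-linked list standing in for the LRU lists (illustration only). */
#include <stdio.h>

struct node { int id; struct node *prev, *next; };

/* Insert @n right before @pos; inserting before the list head means "at the tail". */
static void insert_before(struct node *pos, struct node *n)
{
	n->prev = pos->prev;
	n->next = pos;
	pos->prev->next = n;
	pos->prev = n;
}

static void unlink_node(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

static void print_list(const char *tag, struct node *head)
{
	printf("%s:", tag);
	for (struct node *n = head->next; n != head; n = n->next)
		printf(" %d", n->id);
	printf("\n");
}

int main(void)
{
	struct node src = { 0, &src, &src };
	struct node dst1 = { 0, &dst1, &dst1 };
	struct node dst2 = { 0, &dst2, &dst2 };
	struct node pages[4] = { {1}, {2}, {3}, {4} };
	struct node *first, *last;

	for (int i = 0; i < 4; i++)
		insert_before(&src, &pages[i]);		/* src: 1 2 3 4 */

	/* Current behaviour: walk src backward, move each page to dst1's tail. */
	while (src.prev != &src) {
		struct node *n = src.prev;

		unlink_node(n);
		insert_before(&dst1, n);
	}
	print_list("one by one ", &dst1);		/* 4 3 2 1: order reversed */

	/* Batched behaviour: splice the whole scanned run to dst2's tail once. */
	for (int i = 0; i < 4; i++) {
		unlink_node(&pages[i]);
		insert_before(&src, &pages[i]);		/* refill src: 1 2 3 4 */
	}
	first = src.next;
	last = src.prev;
	first->prev = dst2.prev;
	dst2.prev->next = first;
	last->next = &dst2;
	dst2.prev = last;
	src.next = src.prev = &src;			/* src is now empty */
	print_list("bulk splice", &dst2);		/* 1 2 3 4: order kept */

	return 0;
}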
I noticed a larger performance gain when a lot of pages are getting
protected, but that is hard to reproduce reliably, so instead I tested
with a simpler and more generic benchmark, memtier. Aging is not the
main overhead in that workload, but the result still looks good:
Average result of 18 test runs:
Before: 44017.78 Ops/sec
After patch 1-3: 44890.50 Ops/sec (+1.8%)
Kairui Song (3):
mm, lru_gen: batch update counters on aging
mm, lru_gen: move pages in bulk when aging
mm, lru_gen: try to prefetch next page when scanning LRU
mm/vmscan.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 124 insertions(+), 16 deletions(-)
--
2.43.0
* [PATCH 1/3] mm, lru_gen: batch update counters on aging
2023-12-22 10:22 [PATCH 0/3] mm, lru_gen: batch update pages when aging Kairui Song
@ 2023-12-22 10:22 ` Kairui Song
2023-12-25 7:28 ` Yu Zhao
2023-12-26 23:43 ` Chris Li
2023-12-22 10:22 ` [PATCH 2/3] mm, lru_gen: move pages in bulk when aging Kairui Song
2023-12-22 10:22 ` [PATCH 3/3] mm, lru_gen: try to prefetch next page when scanning LRU Kairui Song
2 siblings, 2 replies; 14+ messages in thread
From: Kairui Song @ 2023-12-22 10:22 UTC (permalink / raw)
To: linux-mm; +Cc: Andrew Morton, Yu Zhao, linux-kernel, Kairui Song
From: Kairui Song <kasong@tencent.com>
When lru_gen is aging, it updates the mm counters page by page,
which causes higher overhead if aging happens frequently or there
are a lot of pages in one generation getting moved.
Optimize this by doing the counter updates in batch.
Although most __mod_*_state helpers have their own caches, the
overhead is still observable.
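For illustration only (not part of the patch; the helper names below are
made up, and struct gen_update_batch mirrors the structure this patch
adds), the batching pattern boils down to accumulating per-generation
deltas on the stack while scanning, and publishing them once per LRU list
instead of touching the shared counters for every page:

#include <stdio.h>

#define MAX_NR_GENS 4

/* Stand-in for the shared per-lruvec counters. */
static long nr_pages[MAX_NR_GENS];

/* Per-scan accumulation; lives on the caller's stack. */
struct gen_update_batch {
	int delta[MAX_NR_GENS];
};

/* Called per page: only touches the local batch, not the shared counters. */
static void batch_account(struct gen_update_batch *batch,
			  int old_gen, int new_gen, int nr)
{
	batch->delta[old_gen] -= nr;
	batch->delta[new_gen] += nr;
}

/* Called once after scanning one LRU list: a single pass over shared state. */
static void batch_flush(struct gen_update_batch *batch)
{
	for (int gen = 0; gen < MAX_NR_GENS; gen++)
		if (batch->delta[gen])
			nr_pages[gen] += batch->delta[gen];
}

int main(void)
{
	struct gen_update_batch batch = { };

	nr_pages[0] = 1000;
	for (int i = 0; i < 3; i++)	/* three pages age from gen 0 to gen 1 */
		batch_account(&batch, 0, 1, 1);
	batch_flush(&batch);

	printf("gen0=%ld gen1=%ld\n", nr_pages[0], nr_pages[1]);	/* 997 3 */
	return 0;
}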
Tested in a 4G memcg on a EPYC 7K62 with:
memcached -u nobody -m 16384 -s /tmp/memcached.socket \
-a 0766 -t 16 -B binary &
memtier_benchmark -S /tmp/memcached.socket \
-P memcache_binary -n allkeys \
--key-minimum=1 --key-maximum=16000000 -d 1024 \
--ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
Average result of 18 test runs:
Before: 44017.78 Ops/sec
After: 44687.08 Ops/sec (+1.5%)
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 55 insertions(+), 9 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b4ca3563bcf4..e3b4797b9729 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3095,9 +3095,47 @@ static int folio_update_gen(struct folio *folio, int gen)
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
+/*
+ * Update LRU gen in batch for each lru_gen LRU list. The batch is limited to
+ * each gen / type / zone level LRU. Batch is applied after finished or aborted
+ * scanning one LRU list.
+ */
+struct gen_update_batch {
+ int delta[MAX_NR_GENS];
+};
+
+static void lru_gen_update_batch(struct lruvec *lruvec, bool type, int zone,
+ struct gen_update_batch *batch)
+{
+ int gen;
+ int promoted = 0;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ enum lru_list lru = type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
+
+ for (gen = 0; gen < MAX_NR_GENS; gen++) {
+ int delta = batch->delta[gen];
+
+ if (!delta)
+ continue;
+
+ WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
+ lrugen->nr_pages[gen][type][zone] + delta);
+
+ if (lru_gen_is_active(lruvec, gen))
+ promoted += delta;
+ }
+
+ if (promoted) {
+ __update_lru_size(lruvec, lru, zone, -promoted);
+ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, promoted);
+ }
+}
+
/* protect pages accessed multiple times through file descriptors */
-static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio,
+ bool reclaiming, struct gen_update_batch *batch)
{
+ int delta = folio_nr_pages(folio);
int type = folio_is_file_lru(folio);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
@@ -3120,7 +3158,8 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
new_flags |= BIT(PG_reclaim);
} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
- lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ batch->delta[old_gen] -= delta;
+ batch->delta[new_gen] += delta;
return new_gen;
}
@@ -3663,6 +3702,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
int zone;
int remaining = MAX_LRU_BATCH;
+ struct gen_update_batch batch = { };
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
@@ -3681,12 +3721,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
- new_gen = folio_inc_gen(lruvec, folio, false);
+ new_gen = folio_inc_gen(lruvec, folio, false, &batch);
list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
- if (!--remaining)
+ if (!--remaining) {
+ lru_gen_update_batch(lruvec, type, zone, &batch);
return false;
+ }
}
+ lru_gen_update_batch(lruvec, type, zone, &batch);
}
done:
reset_ctrl_pos(lruvec, type, true);
@@ -4197,7 +4240,7 @@ static int lru_gen_memcg_seg(struct lruvec *lruvec)
******************************************************************************/
static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
- int tier_idx)
+ int tier_idx, struct gen_update_batch *batch)
{
bool success;
int gen = folio_lru_gen(folio);
@@ -4239,7 +4282,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- gen = folio_inc_gen(lruvec, folio, false);
+ gen = folio_inc_gen(lruvec, folio, false, batch);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
@@ -4249,7 +4292,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
/* ineligible */
if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
- gen = folio_inc_gen(lruvec, folio, false);
+ gen = folio_inc_gen(lruvec, folio, false, batch);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -4257,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
/* waiting for writeback */
if (folio_test_locked(folio) || folio_test_writeback(folio) ||
(type == LRU_GEN_FILE && folio_test_dirty(folio))) {
- gen = folio_inc_gen(lruvec, folio, true);
+ gen = folio_inc_gen(lruvec, folio, true, batch);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -4323,6 +4366,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
for (i = MAX_NR_ZONES; i > 0; i--) {
LIST_HEAD(moved);
int skipped_zone = 0;
+ struct gen_update_batch batch = { };
int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
struct list_head *head = &lrugen->folios[gen][type][zone];
@@ -4337,7 +4381,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
scanned += delta;
- if (sort_folio(lruvec, folio, sc, tier))
+ if (sort_folio(lruvec, folio, sc, tier, &batch))
sorted += delta;
else if (isolate_folio(lruvec, folio, sc)) {
list_add(&folio->lru, list);
@@ -4357,6 +4401,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
skipped += skipped_zone;
}
+ lru_gen_update_batch(lruvec, type, zone, &batch);
+
if (!remaining || isolated >= MIN_LRU_BATCH)
break;
}
--
2.43.0
* [PATCH 2/3] mm, lru_gen: move pages in bulk when aging
2023-12-22 10:22 [PATCH 0/3] mm, lru_gen: batch update pages when aging Kairui Song
2023-12-22 10:22 ` [PATCH 1/3] mm, lru_gen: batch update counters on aging Kairui Song
@ 2023-12-22 10:22 ` Kairui Song
2023-12-23 7:36 ` kernel test robot
2023-12-25 6:58 ` Yu Zhao
2023-12-22 10:22 ` [PATCH 3/3] mm, lru_gen: try to prefetch next page when scanning LRU Kairui Song
2 siblings, 2 replies; 14+ messages in thread
From: Kairui Song @ 2023-12-22 10:22 UTC (permalink / raw)
To: linux-mm; +Cc: Andrew Morton, Yu Zhao, linux-kernel, Kairui Song
From: Kairui Song <kasong@tencent.com>
Another source of aging overhead is page moving. In most cases, pages
are moved to the same gen after folio_inc_gen is called, especially
the protected pages, so it's better to move them in bulk.
This also has a good effect on LRU ordering. Currently, when MGLRU
ages, it walks the LRU backward, and the protected pages are moved to
the tail of the newer gen one by one, which reverses their order in
the LRU. Moving them in batches helps keep their order, though only
within a small scope due to the scan limit of MAX_LRU_BATCH pages.
After this commit, we can see a performance gain:
Tested in a 4G memcg on a EPYC 7K62 with:
memcached -u nobody -m 16384 -s /tmp/memcached.socket \
-a 0766 -t 16 -B binary &
memtier_benchmark -S /tmp/memcached.socket \
-P memcache_binary -n allkeys \
--key-minimum=1 --key-maximum=16000000 -d 1024 \
--ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
Average result of 18 test runs:
Before: 44017.78 Ops/sec
After patch 1-2: 44810.01 Ops/sec (+1.8%)
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/vmscan.c | 84 ++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 71 insertions(+), 13 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e3b4797b9729..af1266129c1b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3102,9 +3102,46 @@ static int folio_update_gen(struct folio *folio, int gen)
*/
struct gen_update_batch {
int delta[MAX_NR_GENS];
+ struct folio *head, *tail;
};
-static void lru_gen_update_batch(struct lruvec *lruvec, bool type, int zone,
+static void inline lru_gen_inc_bulk_finish(struct lru_gen_folio *lrugen,
+ int bulk_gen, bool type, int zone,
+ struct gen_update_batch *batch)
+{
+ if (!batch->head)
+ return;
+
+ list_bulk_move_tail(&lrugen->folios[bulk_gen][type][zone],
+ &batch->head->lru,
+ &batch->tail->lru);
+
+ batch->head = NULL;
+}
+
+/*
+ * When aging, protected pages will go to the tail of the same higher
+ * gen, so they can be moved in batches. Besides reduced overhead, this
+ * also avoids changing their LRU order in a small scope.
+ */
+static void inline lru_gen_try_inc_bulk(struct lru_gen_folio *lrugen, struct folio *folio,
+ int bulk_gen, int gen, bool type, int zone,
+ struct gen_update_batch *batch)
+{
+ /*
+ * If the folio is not moving to the bulk_gen, it has raced with promotion,
+ * so it needs to go to the head of another LRU.
+ */
+ if (bulk_gen != gen)
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
+
+ if (!batch->head)
+ batch->tail = folio;
+
+ batch->head = folio;
+}
+
+static void lru_gen_update_batch(struct lruvec *lruvec, int bulk_gen, bool type, int zone,
struct gen_update_batch *batch)
{
int gen;
@@ -3112,6 +3149,8 @@ static void lru_gen_update_batch(struct lruvec *lruvec, bool type, int zone,
struct lru_gen_folio *lrugen = &lruvec->lrugen;
enum lru_list lru = type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
+ lru_gen_inc_bulk_finish(lrugen, bulk_gen, type, zone, batch);
+
for (gen = 0; gen < MAX_NR_GENS; gen++) {
int delta = batch->delta[gen];
@@ -3705,6 +3744,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
struct gen_update_batch batch = { };
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+ int bulk_gen = (old_gen + 1) % MAX_NR_GENS;
if (type == LRU_GEN_ANON && !can_swap)
goto done;
@@ -3712,24 +3752,33 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
/* prevent cold/hot inversion if force_scan is true */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
struct list_head *head = &lrugen->folios[old_gen][type][zone];
+ struct folio *prev = NULL;
- while (!list_empty(head)) {
- struct folio *folio = lru_to_folio(head);
+ if (!list_empty(head))
+ prev = lru_to_folio(head);
+ while (prev) {
+ struct folio *folio = prev;
VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+ if (unlikely(list_is_first(&folio->lru, head)))
+ prev = NULL;
+ else
+ prev = lru_to_folio(&folio->lru);
+
new_gen = folio_inc_gen(lruvec, folio, false, &batch);
- list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
+ lru_gen_try_inc_bulk(lrugen, folio, bulk_gen, new_gen, type, zone, &batch);
if (!--remaining) {
- lru_gen_update_batch(lruvec, type, zone, &batch);
+ lru_gen_update_batch(lruvec, bulk_gen, type, zone, &batch);
return false;
}
}
- lru_gen_update_batch(lruvec, type, zone, &batch);
+
+ lru_gen_update_batch(lruvec, bulk_gen, type, zone, &batch);
}
done:
reset_ctrl_pos(lruvec, type, true);
@@ -4240,7 +4289,7 @@ static int lru_gen_memcg_seg(struct lruvec *lruvec)
******************************************************************************/
static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
- int tier_idx, struct gen_update_batch *batch)
+ int tier_idx, int bulk_gen, struct gen_update_batch *batch)
{
bool success;
int gen = folio_lru_gen(folio);
@@ -4283,7 +4332,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
gen = folio_inc_gen(lruvec, folio, false, batch);
- list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ lru_gen_try_inc_bulk(lrugen, folio, bulk_gen, gen, type, zone, batch);
WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
lrugen->protected[hist][type][tier - 1] + delta);
@@ -4293,7 +4342,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
/* ineligible */
if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
gen = folio_inc_gen(lruvec, folio, false, batch);
- list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ lru_gen_try_inc_bulk(lrugen, folio, bulk_gen, gen, type, zone, batch);
return true;
}
@@ -4367,11 +4416,16 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
LIST_HEAD(moved);
int skipped_zone = 0;
struct gen_update_batch batch = { };
+ int bulk_gen = (gen + 1) % MAX_NR_GENS;
int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
struct list_head *head = &lrugen->folios[gen][type][zone];
+ struct folio *prev = NULL;
- while (!list_empty(head)) {
- struct folio *folio = lru_to_folio(head);
+ if (!list_empty(head))
+ prev = lru_to_folio(head);
+
+ while (prev) {
+ struct folio *folio = prev;
int delta = folio_nr_pages(folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
@@ -4380,8 +4434,12 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
scanned += delta;
+ if (unlikely(list_is_first(&folio->lru, head)))
+ prev = NULL;
+ else
+ prev = lru_to_folio(&folio->lru);
- if (sort_folio(lruvec, folio, sc, tier, &batch))
+ if (sort_folio(lruvec, folio, sc, tier, bulk_gen, &batch))
sorted += delta;
else if (isolate_folio(lruvec, folio, sc)) {
list_add(&folio->lru, list);
@@ -4401,7 +4459,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
skipped += skipped_zone;
}
- lru_gen_update_batch(lruvec, type, zone, &batch);
+ lru_gen_update_batch(lruvec, bulk_gen, type, zone, &batch);
if (!remaining || isolated >= MIN_LRU_BATCH)
break;
--
2.43.0
* [PATCH 3/3] mm, lru_gen: try to prefetch next page when scanning LRU
2023-12-22 10:22 [PATCH 0/3] mm, lru_gen: batch update pages when aging Kairui Song
2023-12-22 10:22 ` [PATCH 1/3] mm, lru_gen: batch update counters on aging Kairui Song
2023-12-22 10:22 ` [PATCH 2/3] mm, lru_gen: move pages in bulk when aging Kairui Song
@ 2023-12-22 10:22 ` Kairui Song
2023-12-25 6:41 ` Yu Zhao
2 siblings, 1 reply; 14+ messages in thread
From: Kairui Song @ 2023-12-22 10:22 UTC (permalink / raw)
To: linux-mm; +Cc: Andrew Morton, Yu Zhao, linux-kernel, Kairui Song
From: Kairui Song <kasong@tencent.com>
Prefetch for the inactive/active LRU has long existed; apply the same
optimization to MGLRU.
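For illustration only (a user-space sketch, not the patch itself; the
compiler builtin stands in for prefetchw()), the pattern is to issue a
write prefetch for the next list entry while the current one is being
handled, so its cache line is hopefully resident by the time the loop
reaches it. Whether this actually helps is very microarchitecture
dependent:

#include <stdio.h>

struct item {
	unsigned long flags;
	struct item *next;
};

static void process(struct item *it)
{
	it->flags |= 1UL;	/* stands in for the per-folio flag updates */
}

static void walk(struct item *head)
{
	struct item *it = head;

	while (it) {
		struct item *next = it->next;

		if (next)	/* prefetch for write, like prefetchw(&folio->flags) */
			__builtin_prefetch(&next->flags, 1);

		process(it);
		it = next;
	}
}

int main(void)
{
	struct item c = { 0, NULL };
	struct item b = { 0, &c };
	struct item a = { 0, &b };

	walk(&a);
	printf("%lu %lu %lu\n", a.flags, b.flags, c.flags);	/* 1 1 1 */
	return 0;
}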
Tested in a 4G memcg on a EPYC 7K62 with:
memcached -u nobody -m 16384 -s /tmp/memcached.socket \
-a 0766 -t 16 -B binary &
memtier_benchmark -S /tmp/memcached.socket \
-P memcache_binary -n allkeys \
--key-minimum=1 --key-maximum=16000000 -d 1024 \
--ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
Average result of 18 test runs:
Before: 44017.78 Ops/sec
After patch 1-3: 44890.50 Ops/sec (+1.8%)
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/vmscan.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index af1266129c1b..1e9d69e18443 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3764,10 +3764,12 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
- if (unlikely(list_is_first(&folio->lru, head)))
+ if (unlikely(list_is_first(&folio->lru, head))) {
prev = NULL;
- else
+ } else {
prev = lru_to_folio(&folio->lru);
+ prefetchw(&prev->flags);
+ }
new_gen = folio_inc_gen(lruvec, folio, false, &batch);
lru_gen_try_inc_bulk(lrugen, folio, bulk_gen, new_gen, type, zone, &batch);
@@ -4434,10 +4436,12 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
scanned += delta;
- if (unlikely(list_is_first(&folio->lru, head)))
+ if (unlikely(list_is_first(&folio->lru, head))) {
prev = NULL;
- else
+ } else {
prev = lru_to_folio(&folio->lru);
+ prefetchw(&prev->flags);
+ }
if (sort_folio(lruvec, folio, sc, tier, bulk_gen, &batch))
sorted += delta;
--
2.43.0
* Re: [PATCH 2/3] mm, lru_gen: move pages in bulk when aging
2023-12-22 10:22 ` [PATCH 2/3] mm, lru_gen: move pages in bulk when aging Kairui Song
@ 2023-12-23 7:36 ` kernel test robot
2023-12-25 6:58 ` Yu Zhao
1 sibling, 0 replies; 14+ messages in thread
From: kernel test robot @ 2023-12-23 7:36 UTC (permalink / raw)
To: Kairui Song, linux-mm
Cc: oe-kbuild-all, Andrew Morton, Linux Memory Management List,
Yu Zhao, linux-kernel, Kairui Song
Hi Kairui,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Kairui-Song/mm-lru_gen-batch-update-counters-on-againg/20231222-184601
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20231222102255.56993-3-ryncsn%40gmail.com
patch subject: [PATCH 2/3] mm, lru_gen: move pages in bulk when aging
config: arc-randconfig-002-20231223 (https://download.01.org/0day-ci/archive/20231223/202312231555.KTX84YjF-lkp@intel.com/config)
compiler: arc-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231223/202312231555.KTX84YjF-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202312231555.KTX84YjF-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> mm/vmscan.c:3108:1: warning: 'inline' is not at beginning of declaration [-Wold-style-declaration]
3108 | static void inline lru_gen_inc_bulk_finish(struct lru_gen_folio *lrugen,
| ^~~~~~
mm/vmscan.c:3127:1: warning: 'inline' is not at beginning of declaration [-Wold-style-declaration]
3127 | static void inline lru_gen_try_inc_bulk(struct lru_gen_folio *lrugen, struct folio *folio,
| ^~~~~~
vim +/inline +3108 mm/vmscan.c
3107
> 3108 static void inline lru_gen_inc_bulk_finish(struct lru_gen_folio *lrugen,
3109 int bulk_gen, bool type, int zone,
3110 struct gen_update_batch *batch)
3111 {
3112 if (!batch->head)
3113 return;
3114
3115 list_bulk_move_tail(&lrugen->folios[bulk_gen][type][zone],
3116 &batch->head->lru,
3117 &batch->tail->lru);
3118
3119 batch->head = NULL;
3120 }
3121
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH 3/3] mm, lru_gen: try to prefetch next page when scanning LRU
2023-12-22 10:22 ` [PATCH 3/3] mm, lru_gen: try to prefetch next page when scanning LRU Kairui Song
@ 2023-12-25 6:41 ` Yu Zhao
2023-12-25 6:54 ` Yu Zhao
2023-12-25 15:42 ` Matthew Wilcox
0 siblings, 2 replies; 14+ messages in thread
From: Yu Zhao @ 2023-12-25 6:41 UTC (permalink / raw)
To: Kairui Song, Matthew Wilcox; +Cc: linux-mm, Andrew Morton, linux-kernel
On Fri, Dec 22, 2023 at 3:24 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Prefetch for the inactive/active LRU has long existed; apply the same
> optimization to MGLRU.
I seriously doubt that prefetch helps in this case.
Willy, any thoughts on this? Thanks.
> Tested in a 4G memcg on a EPYC 7K62 with:
>
> memcached -u nobody -m 16384 -s /tmp/memcached.socket \
> -a 0766 -t 16 -B binary &
>
> memtier_benchmark -S /tmp/memcached.socket \
> -P memcache_binary -n allkeys \
> --key-minimum=1 --key-maximum=16000000 -d 1024 \
> --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
>
> Average result of 18 test runs:
>
> Before: 44017.78 Ops/sec
> After patch 1-3: 44890.50 Ops/sec (+1.8%)
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
> mm/vmscan.c | 12 ++++++++----
> 1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index af1266129c1b..1e9d69e18443 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3764,10 +3764,12 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
> VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
> VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
>
> - if (unlikely(list_is_first(&folio->lru, head)))
> + if (unlikely(list_is_first(&folio->lru, head))) {
> prev = NULL;
> - else
> + } else {
> prev = lru_to_folio(&folio->lru);
> + prefetchw(&prev->flags);
> + }
>
> new_gen = folio_inc_gen(lruvec, folio, false, &batch);
> lru_gen_try_inc_bulk(lrugen, folio, bulk_gen, new_gen, type, zone, &batch);
> @@ -4434,10 +4436,12 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
> VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
>
> scanned += delta;
> - if (unlikely(list_is_first(&folio->lru, head)))
> + if (unlikely(list_is_first(&folio->lru, head))) {
> prev = NULL;
> - else
> + } else {
> prev = lru_to_folio(&folio->lru);
> + prefetchw(&prev->flags);
> + }
>
> if (sort_folio(lruvec, folio, sc, tier, bulk_gen, &batch))
> sorted += delta;
> --
> 2.43.0
>
* Re: [PATCH 3/3] mm, lru_gen: try to prefetch next page when scanning LRU
2023-12-25 6:41 ` Yu Zhao
@ 2023-12-25 6:54 ` Yu Zhao
2023-12-25 15:42 ` Matthew Wilcox
1 sibling, 0 replies; 14+ messages in thread
From: Yu Zhao @ 2023-12-25 6:54 UTC (permalink / raw)
To: Kairui Song, Matthew Wilcox; +Cc: linux-mm, Andrew Morton, linux-kernel
On Sun, Dec 24, 2023 at 11:41 PM Yu Zhao <yuzhao@google.com> wrote:
>
> On Fri, Dec 22, 2023 at 3:24 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > Prefetch for the inactive/active LRU has long existed; apply the same
> > optimization to MGLRU.
>
> I seriously doubt that prefetch helps in this case.
>
> Willy, any thoughts on this? Thanks.
>
> > Tested in a 4G memcg on a EPYC 7K62 with:
> >
> > memcached -u nobody -m 16384 -s /tmp/memcached.socket \
> > -a 0766 -t 16 -B binary &
> >
> > memtier_benchmark -S /tmp/memcached.socket \
> > -P memcache_binary -n allkeys \
> > --key-minimum=1 --key-maximum=16000000 -d 1024 \
> > --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
> >
> > Average result of 18 test runs:
> >
> > Before: 44017.78 Ops/sec
> > After patch 1-3: 44890.50 Ops/sec (+1.8%)
This patch itself only brought a 0.17% "improvement", which I'm
99.999% sure is just noise.
* Re: [PATCH 2/3] mm, lru_gen: move pages in bulk when aging
2023-12-22 10:22 ` [PATCH 2/3] mm, lru_gen: move pages in bulk when aging Kairui Song
2023-12-23 7:36 ` kernel test robot
@ 2023-12-25 6:58 ` Yu Zhao
2023-12-25 7:01 ` Kairui Song
1 sibling, 1 reply; 14+ messages in thread
From: Yu Zhao @ 2023-12-25 6:58 UTC (permalink / raw)
To: Kairui Song; +Cc: linux-mm, Andrew Morton, linux-kernel
On Fri, Dec 22, 2023 at 3:24 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Another source of aging overhead is page moving. In most cases, pages
> are moved to the same gen after folio_inc_gen is called, especially
> the protected pages, so it's better to move them in bulk.
>
> This also has a good effect on LRU ordering. Currently, when MGLRU
> ages, it walks the LRU backward, and the protected pages are moved to
> the tail of the newer gen one by one, which reverses their order in
> the LRU. Moving them in batches helps keep their order, though only
> within a small scope due to the scan limit of MAX_LRU_BATCH pages.
>
> After this commit, we can see a performance gain:
>
> Tested in a 4G memcg on a EPYC 7K62 with:
>
> memcached -u nobody -m 16384 -s /tmp/memcached.socket \
> -a 0766 -t 16 -B binary &
>
> memtier_benchmark -S /tmp/memcached.socket \
> -P memcache_binary -n allkeys \
> --key-minimum=1 --key-maximum=16000000 -d 1024 \
> --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
>
> Average result of 18 test runs:
>
> Before: 44017.78 Ops/sec
> After patch 1-2: 44810.01 Ops/sec (+1.8%)
Was it tested with CONFIG_DEBUG_LIST=y?
Also, the (44810.01-44687.08)/44687.08=0.0027 improvement sounded like
noise to me.
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
> mm/vmscan.c | 84 ++++++++++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 71 insertions(+), 13 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index e3b4797b9729..af1266129c1b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3102,9 +3102,46 @@ static int folio_update_gen(struct folio *folio, int gen)
> */
> struct gen_update_batch {
> int delta[MAX_NR_GENS];
> + struct folio *head, *tail;
> };
>
> -static void lru_gen_update_batch(struct lruvec *lruvec, bool type, int zone,
> +static void inline lru_gen_inc_bulk_finish(struct lru_gen_folio *lrugen,
> + int bulk_gen, bool type, int zone,
> + struct gen_update_batch *batch)
> +{
> + if (!batch->head)
> + return;
> +
> + list_bulk_move_tail(&lrugen->folios[bulk_gen][type][zone],
> + &batch->head->lru,
> + &batch->tail->lru);
> +
> + batch->head = NULL;
> +}
> +
> +/*
> + * When aging, protected pages will go to the tail of the same higher
> + * gen, so they can be moved in batches. Besides reduced overhead, this
> + * also avoids changing their LRU order in a small scope.
> + */
> +static void inline lru_gen_try_inc_bulk(struct lru_gen_folio *lrugen, struct folio *folio,
> + int bulk_gen, int gen, bool type, int zone,
> + struct gen_update_batch *batch)
> +{
> + /*
> + * If the folio is not moving to the bulk_gen, it has raced with promotion,
> + * so it needs to go to the head of another LRU.
> + */
> + if (bulk_gen != gen)
> + list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
> +
> + if (!batch->head)
> + batch->tail = folio;
> +
> + batch->head = folio;
> +}
> +
> +static void lru_gen_update_batch(struct lruvec *lruvec, int bulk_gen, bool type, int zone,
> struct gen_update_batch *batch)
> {
> int gen;
> @@ -3112,6 +3149,8 @@ static void lru_gen_update_batch(struct lruvec *lruvec, bool type, int zone,
> struct lru_gen_folio *lrugen = &lruvec->lrugen;
> enum lru_list lru = type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
>
> + lru_gen_inc_bulk_finish(lrugen, bulk_gen, type, zone, batch);
> +
> for (gen = 0; gen < MAX_NR_GENS; gen++) {
> int delta = batch->delta[gen];
>
> @@ -3705,6 +3744,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
> struct gen_update_batch batch = { };
> struct lru_gen_folio *lrugen = &lruvec->lrugen;
> int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
> + int bulk_gen = (old_gen + 1) % MAX_NR_GENS;
>
> if (type == LRU_GEN_ANON && !can_swap)
> goto done;
> @@ -3712,24 +3752,33 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
> /* prevent cold/hot inversion if force_scan is true */
> for (zone = 0; zone < MAX_NR_ZONES; zone++) {
> struct list_head *head = &lrugen->folios[old_gen][type][zone];
> + struct folio *prev = NULL;
>
> - while (!list_empty(head)) {
> - struct folio *folio = lru_to_folio(head);
> + if (!list_empty(head))
> + prev = lru_to_folio(head);
>
> + while (prev) {
> + struct folio *folio = prev;
> VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
> VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
> VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
> VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
>
> + if (unlikely(list_is_first(&folio->lru, head)))
> + prev = NULL;
> + else
> + prev = lru_to_folio(&folio->lru);
> +
> new_gen = folio_inc_gen(lruvec, folio, false, &batch);
> - list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
> + lru_gen_try_inc_bulk(lrugen, folio, bulk_gen, new_gen, type, zone, &batch);
>
> if (!--remaining) {
> - lru_gen_update_batch(lruvec, type, zone, &batch);
> + lru_gen_update_batch(lruvec, bulk_gen, type, zone, &batch);
> return false;
> }
> }
> - lru_gen_update_batch(lruvec, type, zone, &batch);
> +
> + lru_gen_update_batch(lruvec, bulk_gen, type, zone, &batch);
> }
> done:
> reset_ctrl_pos(lruvec, type, true);
> @@ -4240,7 +4289,7 @@ static int lru_gen_memcg_seg(struct lruvec *lruvec)
> ******************************************************************************/
>
> static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
> - int tier_idx, struct gen_update_batch *batch)
> + int tier_idx, int bulk_gen, struct gen_update_batch *batch)
> {
> bool success;
> int gen = folio_lru_gen(folio);
> @@ -4283,7 +4332,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
> int hist = lru_hist_from_seq(lrugen->min_seq[type]);
>
> gen = folio_inc_gen(lruvec, folio, false, batch);
> - list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
> + lru_gen_try_inc_bulk(lrugen, folio, bulk_gen, gen, type, zone, batch);
>
> WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
> lrugen->protected[hist][type][tier - 1] + delta);
> @@ -4293,7 +4342,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
> /* ineligible */
> if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
> gen = folio_inc_gen(lruvec, folio, false, batch);
> - list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
> + lru_gen_try_inc_bulk(lrugen, folio, bulk_gen, gen, type, zone, batch);
> return true;
> }
>
> @@ -4367,11 +4416,16 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
> LIST_HEAD(moved);
> int skipped_zone = 0;
> struct gen_update_batch batch = { };
> + int bulk_gen = (gen + 1) % MAX_NR_GENS;
> int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
> struct list_head *head = &lrugen->folios[gen][type][zone];
> + struct folio *prev = NULL;
>
> - while (!list_empty(head)) {
> - struct folio *folio = lru_to_folio(head);
> + if (!list_empty(head))
> + prev = lru_to_folio(head);
> +
> + while (prev) {
> + struct folio *folio = prev;
> int delta = folio_nr_pages(folio);
>
> VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
> @@ -4380,8 +4434,12 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
> VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
>
> scanned += delta;
> + if (unlikely(list_is_first(&folio->lru, head)))
> + prev = NULL;
> + else
> + prev = lru_to_folio(&folio->lru);
>
> - if (sort_folio(lruvec, folio, sc, tier, &batch))
> + if (sort_folio(lruvec, folio, sc, tier, bulk_gen, &batch))
> sorted += delta;
> else if (isolate_folio(lruvec, folio, sc)) {
> list_add(&folio->lru, list);
> @@ -4401,7 +4459,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
> skipped += skipped_zone;
> }
>
> - lru_gen_update_batch(lruvec, type, zone, &batch);
> + lru_gen_update_batch(lruvec, bulk_gen, type, zone, &batch);
>
> if (!remaining || isolated >= MIN_LRU_BATCH)
> break;
> --
> 2.43.0
>
* Re: [PATCH 2/3] mm, lru_gen: move pages in bulk when aging
2023-12-25 6:58 ` Yu Zhao
@ 2023-12-25 7:01 ` Kairui Song
0 siblings, 0 replies; 14+ messages in thread
From: Kairui Song @ 2023-12-25 7:01 UTC (permalink / raw)
To: Yu Zhao; +Cc: linux-mm, Andrew Morton, linux-kernel
On Mon, Dec 25, 2023 at 14:58, Yu Zhao <yuzhao@google.com> wrote:
>
> On Fri, Dec 22, 2023 at 3:24 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > Another source of aging overhead is page moving. In most cases, pages
> > are moved to the same gen after folio_inc_gen is called, especially
> > the protected pages, so it's better to move them in bulk.
> >
> > This also has a good effect on LRU ordering. Currently, when MGLRU
> > ages, it walks the LRU backward, and the protected pages are moved to
> > the tail of the newer gen one by one, which reverses their order in
> > the LRU. Moving them in batches helps keep their order, though only
> > within a small scope due to the scan limit of MAX_LRU_BATCH pages.
> >
> > After this commit, we can see a performance gain:
> >
> > Tested in a 4G memcg on a EPYC 7K62 with:
> >
> > memcached -u nobody -m 16384 -s /tmp/memcached.socket \
> > -a 0766 -t 16 -B binary &
> >
> > memtier_benchmark -S /tmp/memcached.socket \
> > -P memcache_binary -n allkeys \
> > --key-minimum=1 --key-maximum=16000000 -d 1024 \
> > --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
> >
> > Average result of 18 test runs:
> >
> > Before: 44017.78 Ops/sec
> > After patch 1-2: 44810.01 Ops/sec (+1.8%)
>
> Was it tested with CONFIG_DEBUG_LIST=y?
>
Hi, CONFIG_DEBUG_LIST is disabled here.
> Also, the (44810.01-44687.08)/44687.08=0.0027 improvement sounded like
> noise to me.
>
* Re: [PATCH 1/3] mm, lru_gen: batch update counters on aging
2023-12-22 10:22 ` [PATCH 1/3] mm, lru_gen: batch update counters on aging Kairui Song
@ 2023-12-25 7:28 ` Yu Zhao
2023-12-26 23:43 ` Chris Li
1 sibling, 0 replies; 14+ messages in thread
From: Yu Zhao @ 2023-12-25 7:28 UTC (permalink / raw)
To: Kairui Song; +Cc: linux-mm, Andrew Morton, linux-kernel
On Fri, Dec 22, 2023 at 3:24 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> When lru_gen is aging, it updates the mm counters page by page,
> which causes higher overhead if aging happens frequently or there
> are a lot of pages in one generation getting moved.
> Optimize this by doing the counter updates in batch.
>
> Although most __mod_*_state helpers have their own caches, the
> overhead is still observable.
>
> Tested in a 4G memcg on a EPYC 7K62 with:
>
> memcached -u nobody -m 16384 -s /tmp/memcached.socket \
> -a 0766 -t 16 -B binary &
>
> memtier_benchmark -S /tmp/memcached.socket \
> -P memcache_binary -n allkeys \
> --key-minimum=1 --key-maximum=16000000 -d 1024 \
> --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
>
> Average result of 18 test runs:
>
> Before: 44017.78 Ops/sec
> After: 44687.08 Ops/sec (+1.5%)
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
> mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 55 insertions(+), 9 deletions(-)
Usually most reclaim activity happens in kswapd, e.g., from the
MongoDB benchmark (--duration=900):
pgscan_kswapd 11294317
pgscan_direct 128
And kswapd always has current->reclaim_state->mm_walk. So the
following should bring the vast majority of the improvement (assuming
it's not noise) with far less code change:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9dd8977de5a2..c06e00635d2b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3095,6 +3095,8 @@ static int folio_update_gen(struct folio *folio, int gen)
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
int type = folio_is_file_lru(folio);
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
@@ -3116,7 +3118,10 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
new_flags |= BIT(PG_reclaim);
} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
- lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ if (walk)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ else
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
return new_gen;
}
@@ -3739,6 +3744,8 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
int prev, next;
int type, zone;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
restart:
spin_lock_irq(&lruvec->lru_lock);
@@ -3758,6 +3765,9 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
goto restart;
}
+ if (walk && walk->batched)
+ reset_batch_size(lruvec, walk);
+
/*
* Update the active/inactive LRU sizes for compatibility. Both sides of
* the current max_seq need to be covered, since max_seq+1 can overlap
* Re: [PATCH 3/3] mm, lru_gen: try to prefetch next page when scanning LRU
2023-12-25 6:41 ` Yu Zhao
2023-12-25 6:54 ` Yu Zhao
@ 2023-12-25 15:42 ` Matthew Wilcox
2023-12-26 22:12 ` Suren Baghdasaryan
1 sibling, 1 reply; 14+ messages in thread
From: Matthew Wilcox @ 2023-12-25 15:42 UTC (permalink / raw)
To: Yu Zhao
Cc: Kairui Song, linux-mm, Andrew Morton, linux-kernel, Suren Baghdasaryan
On Sun, Dec 24, 2023 at 11:41:31PM -0700, Yu Zhao wrote:
> On Fri, Dec 22, 2023 at 3:24 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > Prefetch for the inactive/active LRU has long existed; apply the same
> > optimization to MGLRU.
>
> I seriously doubt that prefetch helps in this case.
>
> Willy, any thoughts on this? Thanks.
It _might_ ... highly depends on microarchitecture. My experience is
that it offers more benefit on AMD than on Intel, but that experience
is several generations out of date and it may just not be applicable to
modern AMD.
It's probably more effective on ARM Cortex A cores than on ARM Cortex X
cores ... maybe we can get someone from Android (Suren?) to do some
testing?
* Re: [PATCH 3/3] mm, lru_gen: try to prefetch next page when scanning LRU
2023-12-25 15:42 ` Matthew Wilcox
@ 2023-12-26 22:12 ` Suren Baghdasaryan
0 siblings, 0 replies; 14+ messages in thread
From: Suren Baghdasaryan @ 2023-12-26 22:12 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Yu Zhao, Kairui Song, linux-mm, Andrew Morton, linux-kernel
On Mon, Dec 25, 2023 at 7:42 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Sun, Dec 24, 2023 at 11:41:31PM -0700, Yu Zhao wrote:
> > On Fri, Dec 22, 2023 at 3:24 AM Kairui Song <ryncsn@gmail.com> wrote:
> > >
> > > From: Kairui Song <kasong@tencent.com>
> > >
> > > Prefetch for the inactive/active LRU has long existed; apply the same
> > > optimization to MGLRU.
> >
> > I seriously doubt that prefetch helps in this case.
> >
> > Willy, any thoughts on this? Thanks.
>
> It _might_ ... highly depends on microarchitecture. My experience is
> that it offers more benefit on AMD than on Intel, but that experience
> is several generations out of date and it may just not be applicable to
> modern AMD.
>
> It's probably more effective on ARM Cortex A cores than on ARM Cortex X
> cores ... maybe we can get someone from Android (Suren?) to do some
> testing?
Android is quite noisy and I'm afraid a small improvement like this
would not be distinguishable from noise unless it's much more
pronounced. I'll take a stab but don't hold your breath.
* Re: [PATCH 1/3] mm, lru_gen: batch update counters on aging
2023-12-22 10:22 ` [PATCH 1/3] mm, lru_gen: batch update counters on aging Kairui Song
2023-12-25 7:28 ` Yu Zhao
@ 2023-12-26 23:43 ` Chris Li
2023-12-27 10:22 ` Kairui Song
1 sibling, 1 reply; 14+ messages in thread
From: Chris Li @ 2023-12-26 23:43 UTC (permalink / raw)
To: Kairui Song; +Cc: linux-mm, Andrew Morton, Yu Zhao, linux-kernel
Hi Kairui,
Some early feedback on your patch. I am still working my way through
your patches.
Might have more questions.
On Fri, Dec 22, 2023 at 2:24 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> When lru_gen is aging, it updates the mm counters page by page,
> which causes higher overhead if aging happens frequently or there
> are a lot of pages in one generation getting moved.
> Optimize this by doing the counter updates in batch.
>
> Although most __mod_*_state helpers have their own caches, the
> overhead is still observable.
>
> Tested in a 4G memcg on a EPYC 7K62 with:
>
> memcached -u nobody -m 16384 -s /tmp/memcached.socket \
> -a 0766 -t 16 -B binary &
>
> memtier_benchmark -S /tmp/memcached.socket \
> -P memcache_binary -n allkeys \
> --key-minimum=1 --key-maximum=16000000 -d 1024 \
> --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
>
> Average result of 18 test runs:
>
> Before: 44017.78 Ops/sec
> After: 44687.08 Ops/sec (+1.5%)
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
> mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 55 insertions(+), 9 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b4ca3563bcf4..e3b4797b9729 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3095,9 +3095,47 @@ static int folio_update_gen(struct folio *folio, int gen)
> return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
> }
>
> +/*
> + * Update LRU gen in batch for each lru_gen LRU list. The batch is limited to
> + * each gen / type / zone level LRU. Batch is applied after finished or aborted
> + * scanning one LRU list.
> + */
> +struct gen_update_batch {
> + int delta[MAX_NR_GENS];
> +};
> +
> +static void lru_gen_update_batch(struct lruvec *lruvec, bool type, int zone,
"type" need to be int, it is either LRU_GEN_FILE or LRU_GEN_ANON.
Ideally the type is an enum that defines LRU_GEN_FILE or LRU_GEN_ANON.
bool is not the right C type of "type" here. The rest of the code uses
"int" for type as well.
I saw you use "bool type" in other patches as well. All need to change
to "int type".
Chris
> + struct gen_update_batch *batch)
> +{
> + int gen;
> + int promoted = 0;
> + struct lru_gen_folio *lrugen = &lruvec->lrugen;
> + enum lru_list lru = type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
> +
> + for (gen = 0; gen < MAX_NR_GENS; gen++) {
> + int delta = batch->delta[gen];
> +
> + if (!delta)
> + continue;
> +
> + WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
> + lrugen->nr_pages[gen][type][zone] + delta);
> +
> + if (lru_gen_is_active(lruvec, gen))
> + promoted += delta;
> + }
> +
> + if (promoted) {
> + __update_lru_size(lruvec, lru, zone, -promoted);
> + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, promoted);
> + }
> +}
> +
> /* protect pages accessed multiple times through file descriptors */
> -static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
> +static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio,
> + bool reclaiming, struct gen_update_batch *batch)
> {
> + int delta = folio_nr_pages(folio);
> int type = folio_is_file_lru(folio);
> struct lru_gen_folio *lrugen = &lruvec->lrugen;
> int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
> @@ -3120,7 +3158,8 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
> new_flags |= BIT(PG_reclaim);
> } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
>
> - lru_gen_update_size(lruvec, folio, old_gen, new_gen);
> + batch->delta[old_gen] -= delta;
> + batch->delta[new_gen] += delta;
>
> return new_gen;
> }
> @@ -3663,6 +3702,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
> {
> int zone;
> int remaining = MAX_LRU_BATCH;
> + struct gen_update_batch batch = { };
> struct lru_gen_folio *lrugen = &lruvec->lrugen;
> int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
>
> @@ -3681,12 +3721,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
> VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
> VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
>
> - new_gen = folio_inc_gen(lruvec, folio, false);
> + new_gen = folio_inc_gen(lruvec, folio, false, &batch);
> list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
>
> - if (!--remaining)
> + if (!--remaining) {
> + lru_gen_update_batch(lruvec, type, zone, &batch);
> return false;
> + }
> }
> + lru_gen_update_batch(lruvec, type, zone, &batch);
> }
> done:
> reset_ctrl_pos(lruvec, type, true);
> @@ -4197,7 +4240,7 @@ static int lru_gen_memcg_seg(struct lruvec *lruvec)
> ******************************************************************************/
>
> static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
> - int tier_idx)
> + int tier_idx, struct gen_update_batch *batch)
> {
> bool success;
> int gen = folio_lru_gen(folio);
> @@ -4239,7 +4282,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
> if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
> int hist = lru_hist_from_seq(lrugen->min_seq[type]);
>
> - gen = folio_inc_gen(lruvec, folio, false);
> + gen = folio_inc_gen(lruvec, folio, false, batch);
> list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
>
> WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
> @@ -4249,7 +4292,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
>
> /* ineligible */
> if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
> - gen = folio_inc_gen(lruvec, folio, false);
> + gen = folio_inc_gen(lruvec, folio, false, batch);
> list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
> return true;
> }
> @@ -4257,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
> /* waiting for writeback */
> if (folio_test_locked(folio) || folio_test_writeback(folio) ||
> (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
> - gen = folio_inc_gen(lruvec, folio, true);
> + gen = folio_inc_gen(lruvec, folio, true, batch);
> list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
> return true;
> }
> @@ -4323,6 +4366,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
> for (i = MAX_NR_ZONES; i > 0; i--) {
> LIST_HEAD(moved);
> int skipped_zone = 0;
> + struct gen_update_batch batch = { };
> int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
> struct list_head *head = &lrugen->folios[gen][type][zone];
>
> @@ -4337,7 +4381,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
>
> scanned += delta;
>
> - if (sort_folio(lruvec, folio, sc, tier))
> + if (sort_folio(lruvec, folio, sc, tier, &batch))
> sorted += delta;
> else if (isolate_folio(lruvec, folio, sc)) {
> list_add(&folio->lru, list);
> @@ -4357,6 +4401,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
> skipped += skipped_zone;
> }
>
> + lru_gen_update_batch(lruvec, type, zone, &batch);
> +
> if (!remaining || isolated >= MIN_LRU_BATCH)
> break;
> }
> --
> 2.43.0
>
>
* Re: [PATCH 1/3] mm, lru_gen: batch update counters on aging
2023-12-26 23:43 ` Chris Li
@ 2023-12-27 10:22 ` Kairui Song
0 siblings, 0 replies; 14+ messages in thread
From: Kairui Song @ 2023-12-27 10:22 UTC (permalink / raw)
To: Chris Li; +Cc: linux-mm, Andrew Morton, Yu Zhao, linux-kernel
On Wed, Dec 27, 2023 at 07:43, Chris Li <chrisl@kernel.org> wrote:
>
> Hi Kairui,
>
> Some early feedback on your patch. I am still working my way through
> your patches.
> Might have more questions.
Hi Chris,
Thanks for the review.
> On Fri, Dec 22, 2023 at 2:24 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> >
> > When lru_gen is aging, it updates the mm counters page by page,
> > which causes higher overhead if aging happens frequently or there
> > are a lot of pages in one generation getting moved.
> > Optimize this by doing the counter updates in batch.
> >
> > Although most __mod_*_state helpers have their own caches, the
> > overhead is still observable.
> >
> > Tested in a 4G memcg on a EPYC 7K62 with:
> >
> > memcached -u nobody -m 16384 -s /tmp/memcached.socket \
> > -a 0766 -t 16 -B binary &
> >
> > memtier_benchmark -S /tmp/memcached.socket \
> > -P memcache_binary -n allkeys \
> > --key-minimum=1 --key-maximum=16000000 -d 1024 \
> > --ratio=1:0 --key-pattern=P:P -c 2 -t 16 --pipeline 8 -x 6
> >
> > Average result of 18 test runs:
> >
> > Before: 44017.78 Ops/sec
> > After: 44687.08 Ops/sec (+1.5%)
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> > mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++++++++++--------
> > 1 file changed, 55 insertions(+), 9 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index b4ca3563bcf4..e3b4797b9729 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -3095,9 +3095,47 @@ static int folio_update_gen(struct folio *folio, int gen)
> > return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
> > }
> >
> > +/*
> > + * Update LRU gen in batch for each lru_gen LRU list. The batch is limited to
> > + * each gen / type / zone level LRU. Batch is applied after finished or aborted
> > + * scanning one LRU list.
> > + */
> > +struct gen_update_batch {
> > + int delta[MAX_NR_GENS];
> > +};
> > +
> > +static void lru_gen_update_batch(struct lruvec *lruvec, bool type, int zone,
> "type" need to be int, it is either LRU_GEN_FILE or LRU_GEN_ANON.
Yes, I'll update it with some more test results later.