linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Kairui Song <ryncsn@gmail.com>
To: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	David Hildenbrand <david@kernel.org>,
	 Shakeel Butt <shakeel.butt@linux.dev>,
	Yosry Ahmed <yosry.ahmed@linux.dev>, Zi Yan <ziy@nvidia.com>,
	 "Liam R. Howlett" <Liam.Howlett@oracle.com>,
	Usama Arif <usama.arif@linux.dev>,
	 Kiryl Shutsemau <kas@kernel.org>,
	Dave Chinner <david@fromorbit.com>,
	 Roman Gushchin <roman.gushchin@linux.dev>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru
Date: Fri, 27 Mar 2026 15:51:07 +0800	[thread overview]
Message-ID: <CAMgjq7BA8BHMEwK-QVpH+gDgu9fHEKxs4p1A3CeY9p7G98xhPg@mail.gmail.com> (raw)
In-Reply-To: <20260318200352.1039011-8-hannes@cmpxchg.org>

On Thu, Mar 19, 2026 at 4:05 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> The deferred split queue handles cgroups in a suboptimal fashion. The
> queue is per-NUMA node or per-cgroup, not the intersection. That means
> on a cgrouped system, a node-restricted allocation entering reclaim
> can end up splitting large pages on other nodes:
>
>         alloc/unmap
>           deferred_split_folio()
>             list_add_tail(memcg->split_queue)
>             set_shrinker_bit(memcg, node, deferred_shrinker_id)
>
>         for_each_zone_zonelist_nodemask(restricted_nodes)
>           mem_cgroup_iter()
>             shrink_slab(node, memcg)
>               shrink_slab_memcg(node, memcg)
>                 if test_shrinker_bit(memcg, node, deferred_shrinker_id)
>                   deferred_split_scan()
>                     walks memcg->split_queue
>
> The shrinker bit adds an imperfect guard rail. As soon as the cgroup
> has a single large page on the node of interest, all large pages owned
> by that memcg, including those on other nodes, will be split.
>
> list_lru properly sets up per-node, per-cgroup lists. As a bonus, it
> streamlines a lot of the list operations and reclaim walks. It's used
> widely by other major shrinkers already. Convert the deferred split
> queue as well.
>
> The list_lru per-memcg heads are instantiated on demand when the first
> object of interest is allocated for a cgroup, by calling
> folio_memcg_list_lru_alloc(). Add calls to where splittable pages are
> created: anon faults, swapin faults, khugepaged collapse.
>
> These calls create all possible node heads for the cgroup at once, so
> the migration code (between nodes) doesn't need any special care.
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  include/linux/huge_mm.h    |   6 +-
>  include/linux/memcontrol.h |   4 -
>  include/linux/mmzone.h     |  12 --
>  mm/huge_memory.c           | 342 ++++++++++++-------------------------
>  mm/internal.h              |   2 +-
>  mm/khugepaged.c            |   7 +
>  mm/memcontrol.c            |  12 +-
>  mm/memory.c                |  52 +++---
>  mm/mm_init.c               |  15 --
>  9 files changed, 151 insertions(+), 301 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index bd7f0e1d8094..8d801ed378db 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -414,10 +414,9 @@ static inline int split_huge_page(struct page *page)
>  {
>         return split_huge_page_to_list_to_order(page, NULL, 0);
>  }
> +
> +extern struct list_lru deferred_split_lru;
>  void deferred_split_folio(struct folio *folio, bool partially_mapped);
> -#ifdef CONFIG_MEMCG
> -void reparent_deferred_split_queue(struct mem_cgroup *memcg);
> -#endif
>
>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>                 unsigned long address, bool freeze);
> @@ -650,7 +649,6 @@ static inline int try_folio_split_to_order(struct folio *folio,
>  }
>
>  static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> -static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
>  #define split_huge_pmd(__vma, __pmd, __address)        \
>         do { } while (0)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 086158969529..0782c72a1997 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -277,10 +277,6 @@ struct mem_cgroup {
>         struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
>  #endif
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -       struct deferred_split deferred_split_queue;
> -#endif
> -
>  #ifdef CONFIG_LRU_GEN_WALKS_MMU
>         /* per-memcg mm_struct list */
>         struct lru_gen_mm_list mm_list;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 7bd0134c241c..232b7a71fd69 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1429,14 +1429,6 @@ struct zonelist {
>   */
>  extern struct page *mem_map;
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -struct deferred_split {
> -       spinlock_t split_queue_lock;
> -       struct list_head split_queue;
> -       unsigned long split_queue_len;
> -};
> -#endif
> -
>  #ifdef CONFIG_MEMORY_FAILURE
>  /*
>   * Per NUMA node memory failure handling statistics.
> @@ -1562,10 +1554,6 @@ typedef struct pglist_data {
>         unsigned long first_deferred_pfn;
>  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -       struct deferred_split deferred_split_queue;
> -#endif
> -
>  #ifdef CONFIG_NUMA_BALANCING
>         /* start time in ms of current promote rate limit period */
>         unsigned int nbp_rl_start;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 3fc02913b63e..e90d08db219d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -14,6 +14,7 @@
>  #include <linux/mmu_notifier.h>
>  #include <linux/rmap.h>
>  #include <linux/swap.h>
> +#include <linux/list_lru.h>
>  #include <linux/shrinker.h>
>  #include <linux/mm_inline.h>
>  #include <linux/swapops.h>
> @@ -67,6 +68,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
>         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
>         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
>
> +static struct lock_class_key deferred_split_key;
> +struct list_lru deferred_split_lru;
>  static struct shrinker *deferred_split_shrinker;
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>                                           struct shrink_control *sc);
> @@ -919,6 +922,13 @@ static int __init thp_shrinker_init(void)
>         if (!deferred_split_shrinker)
>                 return -ENOMEM;
>
> +       if (list_lru_init_memcg_key(&deferred_split_lru,
> +                                   deferred_split_shrinker,
> +                                   &deferred_split_key)) {
> +               shrinker_free(deferred_split_shrinker);
> +               return -ENOMEM;
> +       }
> +
>         deferred_split_shrinker->count_objects = deferred_split_count;
>         deferred_split_shrinker->scan_objects = deferred_split_scan;
>         shrinker_register(deferred_split_shrinker);
> @@ -939,6 +949,7 @@ static int __init thp_shrinker_init(void)
>
>         huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
>         if (!huge_zero_folio_shrinker) {
> +               list_lru_destroy(&deferred_split_lru);
>                 shrinker_free(deferred_split_shrinker);
>                 return -ENOMEM;
>         }
> @@ -953,6 +964,7 @@ static int __init thp_shrinker_init(void)
>  static void __init thp_shrinker_exit(void)
>  {
>         shrinker_free(huge_zero_folio_shrinker);
> +       list_lru_destroy(&deferred_split_lru);
>         shrinker_free(deferred_split_shrinker);
>  }
>
> @@ -1133,119 +1145,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>         return pmd;
>  }
>
> -static struct deferred_split *split_queue_node(int nid)
> -{
> -       struct pglist_data *pgdata = NODE_DATA(nid);
> -
> -       return &pgdata->deferred_split_queue;
> -}
> -
> -#ifdef CONFIG_MEMCG
> -static inline
> -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> -                                          struct deferred_split *queue)
> -{
> -       if (mem_cgroup_disabled())
> -               return NULL;
> -       if (split_queue_node(folio_nid(folio)) == queue)
> -               return NULL;
> -       return container_of(queue, struct mem_cgroup, deferred_split_queue);
> -}
> -
> -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
> -{
> -       return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
> -}
> -#else
> -static inline
> -struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
> -                                          struct deferred_split *queue)
> -{
> -       return NULL;
> -}
> -
> -static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
> -{
> -       return split_queue_node(nid);
> -}
> -#endif
> -
> -static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
> -{
> -       struct deferred_split *queue;
> -
> -retry:
> -       queue = memcg_split_queue(nid, memcg);
> -       spin_lock(&queue->split_queue_lock);
> -       /*
> -        * There is a period between setting memcg to dying and reparenting
> -        * deferred split queue, and during this period the THPs in the deferred
> -        * split queue will be hidden from the shrinker side.
> -        */
> -       if (unlikely(memcg_is_dying(memcg))) {
> -               spin_unlock(&queue->split_queue_lock);
> -               memcg = parent_mem_cgroup(memcg);
> -               goto retry;
> -       }
> -
> -       return queue;
> -}
> -
> -static struct deferred_split *
> -split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
> -{
> -       struct deferred_split *queue;
> -
> -retry:
> -       queue = memcg_split_queue(nid, memcg);
> -       spin_lock_irqsave(&queue->split_queue_lock, *flags);
> -       if (unlikely(memcg_is_dying(memcg))) {
> -               spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
> -               memcg = parent_mem_cgroup(memcg);
> -               goto retry;
> -       }
> -
> -       return queue;
> -}
> -
> -static struct deferred_split *folio_split_queue_lock(struct folio *folio)
> -{
> -       struct deferred_split *queue;
> -
> -       rcu_read_lock();
> -       queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
> -       /*
> -        * The memcg destruction path is acquiring the split queue lock for
> -        * reparenting. Once you have it locked, it's safe to drop the rcu lock.
> -        */
> -       rcu_read_unlock();
> -
> -       return queue;
> -}
> -
> -static struct deferred_split *
> -folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
> -{
> -       struct deferred_split *queue;
> -
> -       rcu_read_lock();
> -       queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
> -       rcu_read_unlock();
> -
> -       return queue;
> -}
> -
> -static inline void split_queue_unlock(struct deferred_split *queue)
> -{
> -       spin_unlock(&queue->split_queue_lock);
> -}
> -
> -static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
> -                                                unsigned long flags)
> -{
> -       spin_unlock_irqrestore(&queue->split_queue_lock, flags);
> -}
> -
>  static inline bool is_transparent_hugepage(const struct folio *folio)
>  {
>         if (!folio_test_large(folio))
> @@ -1346,6 +1245,14 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
>                 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>                 return NULL;
>         }
> +
> +       if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +               folio_put(folio);
> +               count_vm_event(THP_FAULT_FALLBACK);
> +               count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +               return NULL;
> +       }
> +
>         folio_throttle_swaprate(folio, gfp);
>
>         /*
> @@ -3854,34 +3761,34 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>         struct folio *end_folio = folio_next(folio);
>         struct folio *new_folio, *next;
>         int old_order = folio_order(folio);
> +       struct list_lru_one *l;
> +       bool dequeue_deferred;
>         int ret = 0;
> -       struct deferred_split *ds_queue;
>
>         VM_WARN_ON_ONCE(!mapping && end);
>         /* Prevent deferred_split_scan() touching ->_refcount */
> -       ds_queue = folio_split_queue_lock(folio);
> +       dequeue_deferred = folio_test_anon(folio) && old_order > 1;
> +       if (dequeue_deferred) {
> +               rcu_read_lock();
> +               l = list_lru_lock(&deferred_split_lru,
> +                                 folio_nid(folio), folio_memcg(folio));
> +       }
>         if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
>                 struct swap_cluster_info *ci = NULL;
>                 struct lruvec *lruvec;
>
> -               if (old_order > 1) {
> -                       if (!list_empty(&folio->_deferred_list)) {
> -                               ds_queue->split_queue_len--;
> -                               /*
> -                                * Reinitialize page_deferred_list after removing the
> -                                * page from the split_queue, otherwise a subsequent
> -                                * split will see list corruption when checking the
> -                                * page_deferred_list.
> -                                */
> -                               list_del_init(&folio->_deferred_list);
> -                       }
> +               if (dequeue_deferred) {
> +                       __list_lru_del(&deferred_split_lru, l,
> +                                      &folio->_deferred_list, folio_nid(folio));
>                         if (folio_test_partially_mapped(folio)) {
>                                 folio_clear_partially_mapped(folio);
>                                 mod_mthp_stat(old_order,
>                                         MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>                         }
> +                       list_lru_unlock(l);
> +                       rcu_read_unlock();
>                 }
> -               split_queue_unlock(ds_queue);
> +
>                 if (mapping) {
>                         int nr = folio_nr_pages(folio);
>
> @@ -3982,7 +3889,10 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>                 if (ci)
>                         swap_cluster_unlock(ci);
>         } else {
> -               split_queue_unlock(ds_queue);
> +               if (dequeue_deferred) {
> +                       list_lru_unlock(l);
> +                       rcu_read_unlock();
> +               }
>                 return -EAGAIN;
>         }
>
> @@ -4349,33 +4259,35 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
>   * queueing THP splits, and that list is (racily observed to be) non-empty.
>   *
>   * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
> - * zero: because even when split_queue_lock is held, a non-empty _deferred_list
> - * might be in use on deferred_split_scan()'s unlocked on-stack list.
> + * zero: because even when the list_lru lock is held, a non-empty
> + * _deferred_list might be in use on deferred_split_scan()'s unlocked
> + * on-stack list.
>   *
> - * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
> - * therefore important to unqueue deferred split before changing folio memcg.
> + * The list_lru sublist is determined by folio's memcg: it is therefore
> + * important to unqueue deferred split before changing folio memcg.
>   */
>  bool __folio_unqueue_deferred_split(struct folio *folio)
>  {
> -       struct deferred_split *ds_queue;
> +       struct list_lru_one *l;
> +       int nid = folio_nid(folio);
>         unsigned long flags;
>         bool unqueued = false;
>
>         WARN_ON_ONCE(folio_ref_count(folio));
>         WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
>
> -       ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> -       if (!list_empty(&folio->_deferred_list)) {
> -               ds_queue->split_queue_len--;
> +       rcu_read_lock();
> +       l = list_lru_lock_irqsave(&deferred_split_lru, nid, folio_memcg(folio), &flags);
> +       if (__list_lru_del(&deferred_split_lru, l, &folio->_deferred_list, nid)) {
>                 if (folio_test_partially_mapped(folio)) {
>                         folio_clear_partially_mapped(folio);
>                         mod_mthp_stat(folio_order(folio),
>                                       MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
>                 }
> -               list_del_init(&folio->_deferred_list);
>                 unqueued = true;
>         }
> -       split_queue_unlock_irqrestore(ds_queue, flags);
> +       list_lru_unlock_irqrestore(l, &flags);
> +       rcu_read_unlock();
>
>         return unqueued;        /* useful for debug warnings */
>  }
> @@ -4383,7 +4295,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
>  /* partially_mapped=false won't clear PG_partially_mapped folio flag */
>  void deferred_split_folio(struct folio *folio, bool partially_mapped)
>  {
> -       struct deferred_split *ds_queue;
> +       struct list_lru_one *l;
> +       int nid;
> +       struct mem_cgroup *memcg;
>         unsigned long flags;
>
>         /*
> @@ -4406,7 +4320,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>         if (folio_test_swapcache(folio))
>                 return;
>
> -       ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
> +       nid = folio_nid(folio);
> +
> +       rcu_read_lock();
> +       memcg = folio_memcg(folio);
> +       l = list_lru_lock_irqsave(&deferred_split_lru, nid, memcg, &flags);
>         if (partially_mapped) {
>                 if (!folio_test_partially_mapped(folio)) {
>                         folio_set_partially_mapped(folio);
> @@ -4414,36 +4332,20 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
>                                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>                         count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>                         mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
> -
>                 }
>         } else {
>                 /* partially mapped folios cannot become non-partially mapped */
>                 VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
>         }
> -       if (list_empty(&folio->_deferred_list)) {
> -               struct mem_cgroup *memcg;
> -
> -               memcg = folio_split_queue_memcg(folio, ds_queue);
> -               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> -               ds_queue->split_queue_len++;
> -               if (memcg)
> -                       set_shrinker_bit(memcg, folio_nid(folio),
> -                                        shrinker_id(deferred_split_shrinker));
> -       }
> -       split_queue_unlock_irqrestore(ds_queue, flags);
> +       __list_lru_add(&deferred_split_lru, l, &folio->_deferred_list, nid, memcg);
> +       list_lru_unlock_irqrestore(l, &flags);
> +       rcu_read_unlock();
>  }
>
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>                 struct shrink_control *sc)
>  {
> -       struct pglist_data *pgdata = NODE_DATA(sc->nid);
> -       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> -
> -#ifdef CONFIG_MEMCG
> -       if (sc->memcg)
> -               ds_queue = &sc->memcg->deferred_split_queue;
> -#endif
> -       return READ_ONCE(ds_queue->split_queue_len);
> +       return list_lru_shrink_count(&deferred_split_lru, sc);
>  }
>
>  static bool thp_underused(struct folio *folio)
> @@ -4473,45 +4375,47 @@ static bool thp_underused(struct folio *folio)
>         return false;
>  }
>
> +static enum lru_status deferred_split_isolate(struct list_head *item,
> +                                             struct list_lru_one *lru,
> +                                             void *cb_arg)
> +{
> +       struct folio *folio = container_of(item, struct folio, _deferred_list);
> +       struct list_head *freeable = cb_arg;
> +
> +       if (folio_try_get(folio)) {
> +               list_lru_isolate_move(lru, item, freeable);
> +               return LRU_REMOVED;
> +       }
> +
> +       /* We lost race with folio_put() */
> +       list_lru_isolate(lru, item);
> +       if (folio_test_partially_mapped(folio)) {
> +               folio_clear_partially_mapped(folio);
> +               mod_mthp_stat(folio_order(folio),
> +                             MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> +       }
> +       return LRU_REMOVED;
> +}
> +
>  static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 struct shrink_control *sc)
>  {
> -       struct deferred_split *ds_queue;
> -       unsigned long flags;
> +       LIST_HEAD(dispose);
>         struct folio *folio, *next;
> -       int split = 0, i;
> -       struct folio_batch fbatch;
> +       int split = 0;
> +       unsigned long isolated;
>
> -       folio_batch_init(&fbatch);
> +       isolated = list_lru_shrink_walk_irq(&deferred_split_lru, sc,
> +                                           deferred_split_isolate, &dispose);
>
> -retry:
> -       ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
> -       /* Take pin on all head pages to avoid freeing them under us */
> -       list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
> -                                                       _deferred_list) {
> -               if (folio_try_get(folio)) {
> -                       folio_batch_add(&fbatch, folio);
> -               } else if (folio_test_partially_mapped(folio)) {
> -                       /* We lost race with folio_put() */
> -                       folio_clear_partially_mapped(folio);
> -                       mod_mthp_stat(folio_order(folio),
> -                                     MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> -               }
> -               list_del_init(&folio->_deferred_list);
> -               ds_queue->split_queue_len--;
> -               if (!--sc->nr_to_scan)
> -                       break;
> -               if (!folio_batch_space(&fbatch))
> -                       break;
> -       }
> -       split_queue_unlock_irqrestore(ds_queue, flags);
> -
> -       for (i = 0; i < folio_batch_count(&fbatch); i++) {
> +       list_for_each_entry_safe(folio, next, &dispose, _deferred_list) {
>                 bool did_split = false;
>                 bool underused = false;
> -               struct deferred_split *fqueue;
> +               struct list_lru_one *l;
> +               unsigned long flags;
> +
> +               list_del_init(&folio->_deferred_list);
>
> -               folio = fbatch.folios[i];
>                 if (!folio_test_partially_mapped(folio)) {
>                         /*
>                          * See try_to_map_unused_to_zeropage(): we cannot
> @@ -4534,64 +4438,32 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>                 }
>                 folio_unlock(folio);
>  next:
> -               if (did_split || !folio_test_partially_mapped(folio))
> -                       continue;
>                 /*
>                  * Only add back to the queue if folio is partially mapped.
>                  * If thp_underused returns false, or if split_folio fails
>                  * in the case it was underused, then consider it used and
>                  * don't add it back to split_queue.
>                  */
> -               fqueue = folio_split_queue_lock_irqsave(folio, &flags);
> -               if (list_empty(&folio->_deferred_list)) {
> -                       list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
> -                       fqueue->split_queue_len++;
> +               if (!did_split && folio_test_partially_mapped(folio)) {
> +                       rcu_read_lock();
> +                       l = list_lru_lock_irqsave(&deferred_split_lru,
> +                                                 folio_nid(folio),
> +                                                 folio_memcg(folio),
> +                                                 &flags);
> +                       __list_lru_add(&deferred_split_lru, l,
> +                                      &folio->_deferred_list,
> +                                      folio_nid(folio), folio_memcg(folio));
> +                       list_lru_unlock_irqrestore(l, &flags);
> +                       rcu_read_unlock();
>                 }
> -               split_queue_unlock_irqrestore(fqueue, flags);
> -       }
> -       folios_put(&fbatch);
> -
> -       if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
> -               cond_resched();
> -               goto retry;
> +               folio_put(folio);
>         }
>
> -       /*
> -        * Stop shrinker if we didn't split any page, but the queue is empty.
> -        * This can happen if pages were freed under us.
> -        */
> -       if (!split && list_empty(&ds_queue->split_queue))
> +       if (!split && !isolated)
>                 return SHRINK_STOP;
>         return split;
>  }
>
> -#ifdef CONFIG_MEMCG
> -void reparent_deferred_split_queue(struct mem_cgroup *memcg)
> -{
> -       struct mem_cgroup *parent = parent_mem_cgroup(memcg);
> -       struct deferred_split *ds_queue = &memcg->deferred_split_queue;
> -       struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
> -       int nid;
> -
> -       spin_lock_irq(&ds_queue->split_queue_lock);
> -       spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
> -
> -       if (!ds_queue->split_queue_len)
> -               goto unlock;
> -
> -       list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
> -       parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
> -       ds_queue->split_queue_len = 0;
> -
> -       for_each_node(nid)
> -               set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
> -
> -unlock:
> -       spin_unlock(&parent_ds_queue->split_queue_lock);
> -       spin_unlock_irq(&ds_queue->split_queue_lock);
> -}
> -#endif
> -
>  #ifdef CONFIG_DEBUG_FS
>  static void split_huge_pages_all(void)
>  {
> diff --git a/mm/internal.h b/mm/internal.h
> index f98f4746ac41..d8c737338df5 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -863,7 +863,7 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio)
>         /*
>          * At this point, there is no one trying to add the folio to
>          * deferred_list. If folio is not in deferred_list, it's safe
> -        * to check without acquiring the split_queue_lock.
> +        * to check without acquiring the list_lru lock.
>          */
>         if (data_race(list_empty(&folio->_deferred_list)))
>                 return false;
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 4b0e59c7c0e6..b2ac28ddd480 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1081,6 +1081,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>         }
>
>         count_vm_event(THP_COLLAPSE_ALLOC);
> +
>         if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
>                 folio_put(folio);
>                 *foliop = NULL;
> @@ -1089,6 +1090,12 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>
>         count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
>
> +       if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +               folio_put(folio);
> +               *foliop = NULL;
> +               return SCAN_CGROUP_CHARGE_FAIL;
> +       }
> +
>         *foliop = folio;
>         return SCAN_SUCCEED;
>  }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a47fb68dd65f..f381cb6bdff1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4015,11 +4015,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
>         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
>                 memcg->cgwb_frn[i].done =
>                         __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
> -#endif
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -       spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> -       INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> -       memcg->deferred_split_queue.split_queue_len = 0;
>  #endif
>         lru_gen_init_memcg(memcg);
>         return memcg;
> @@ -4167,11 +4162,10 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>         zswap_memcg_offline_cleanup(memcg);
>
>         memcg_offline_kmem(memcg);
> -       reparent_deferred_split_queue(memcg);
>         /*
> -        * The reparenting of objcg must be after the reparenting of the
> -        * list_lru and deferred_split_queue above, which ensures that they will
> -        * not mistakenly get the parent list_lru and deferred_split_queue.
> +        * The reparenting of objcg must be after the reparenting of
> +        * the list_lru in memcg_offline_kmem(), which ensures that
> +        * they will not mistakenly get the parent list_lru.
>          */
>         memcg_reparent_objcgs(memcg);
>         reparent_shrinker_deferred(memcg);
> diff --git a/mm/memory.c b/mm/memory.c
> index 219b9bf6cae0..e68ceb4aa624 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4651,13 +4651,19 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>         while (orders) {
>                 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>                 folio = vma_alloc_folio(gfp, order, vma, addr);
> -               if (folio) {
> -                       if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
> -                                                           gfp, entry))
> -                               return folio;
> +               if (!folio)
> +                       goto next;
> +               if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) {
>                         count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
>                         folio_put(folio);
> +                       goto next;
>                 }
> +               if (folio_memcg_list_lru_alloc(folio, &deferred_split_lru, gfp)) {
> +                       folio_put(folio);
> +                       goto fallback;
> +               }

Hi Johannes,

I haven't checked every detail yet, but I have one question here, which
might be trivial: would it be better to fall back to the next order
instead of falling back to order 0 directly? Suppose this is a 2M
allocation and a 1M fallback is allowed: releasing that folio and falling
back to 1M will free 1M of memory, which would be enough for the list_lru
metadata to be allocated.


  parent reply	other threads:[~2026-03-27  7:51 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-18 19:53 [PATCH v3 0/7] mm: switch THP " Johannes Weiner
2026-03-18 19:53 ` [PATCH v3 1/7] mm: list_lru: lock_list_lru_of_memcg() cannot return NULL if !skip_empty Johannes Weiner
2026-03-18 20:12   ` Shakeel Butt
2026-03-24 11:30   ` Lorenzo Stoakes (Oracle)
2026-03-18 19:53 ` [PATCH v3 2/7] mm: list_lru: deduplicate unlock_list_lru() Johannes Weiner
2026-03-24 11:32   ` Lorenzo Stoakes (Oracle)
2026-03-18 19:53 ` [PATCH v3 3/7] mm: list_lru: move list dead check to lock_list_lru_of_memcg() Johannes Weiner
2026-03-18 20:20   ` Shakeel Butt
2026-03-24 11:34   ` Lorenzo Stoakes (Oracle)
2026-03-18 19:53 ` [PATCH v3 4/7] mm: list_lru: deduplicate lock_list_lru() Johannes Weiner
2026-03-18 20:22   ` Shakeel Butt
2026-03-24 11:36   ` Lorenzo Stoakes (Oracle)
2026-03-18 19:53 ` [PATCH v3 5/7] mm: list_lru: introduce caller locking for additions and deletions Johannes Weiner
2026-03-18 20:51   ` Shakeel Butt
2026-03-20 16:18     ` Johannes Weiner
2026-03-24 11:55   ` Lorenzo Stoakes (Oracle)
2026-03-18 19:53 ` [PATCH v3 6/7] mm: list_lru: introduce folio_memcg_list_lru_alloc() Johannes Weiner
2026-03-18 20:52   ` Shakeel Butt
2026-03-18 21:01   ` Shakeel Butt
2026-03-24 12:01   ` Lorenzo Stoakes (Oracle)
2026-03-30 16:54     ` Johannes Weiner
2026-04-01 14:43       ` Lorenzo Stoakes (Oracle)
2026-03-18 19:53 ` [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru Johannes Weiner
2026-03-18 20:26   ` David Hildenbrand (Arm)
2026-03-18 23:18   ` Shakeel Butt
2026-03-24 13:48   ` Lorenzo Stoakes (Oracle)
2026-03-30 16:40     ` Johannes Weiner
2026-04-01 17:33       ` Lorenzo Stoakes (Oracle)
2026-04-06 21:37         ` Johannes Weiner
2026-04-07  9:55           ` Lorenzo Stoakes (Oracle)
2026-03-27  7:51   ` Kairui Song [this message]
2026-03-30 16:51     ` Johannes Weiner
2026-03-30 16:37   ` [PATCH v3 7/7] mm: switch deferred split shrinker to list_lru - [s390] panic in __memcg_list_lru_alloc Mikhail Zaslonko
2026-03-30 19:03     ` Andrew Morton
2026-03-30 20:41     ` Johannes Weiner
2026-03-30 20:56       ` Johannes Weiner
2026-03-30 22:46         ` Vasily Gorbik
2026-03-31  8:04         ` Mikhail Zaslonko
2026-03-18 21:00 ` [PATCH v3 0/7] mm: switch THP shrinker to list_lru Lorenzo Stoakes (Oracle)
2026-03-18 22:31   ` Johannes Weiner
2026-03-19  8:47     ` Lorenzo Stoakes (Oracle)
2026-03-19  8:52       ` David Hildenbrand (Arm)
2026-03-19 11:45         ` Lorenzo Stoakes (Oracle)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAMgjq7BA8BHMEwK-QVpH+gDgu9fHEKxs4p1A3CeY9p7G98xhPg@mail.gmail.com \
    --to=ryncsn@gmail.com \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=david@fromorbit.com \
    --cc=david@kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=kas@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=usama.arif@linux.dev \
    --cc=yosry.ahmed@linux.dev \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox