On Wed, Sep 27, 2023 at 5:57 PM Nhat Pham wrote:
> Currently, hugetlb memory usage is not accounted for in the memory
> controller, which could lead to memory overprotection for cgroups with
> hugetlb-backed memory. This has been observed in our production system.
>
> This patch rectifies this issue by charging the memcg when the hugetlb
> folio is allocated, and uncharging when the folio is freed (analogous to
> the hugetlb controller).
>
> Signed-off-by: Nhat Pham
> ---
>  Documentation/admin-guide/cgroup-v2.rst |  9 ++++++
>  fs/hugetlbfs/inode.c                    |  2 +-
>  include/linux/cgroup-defs.h             |  5 +++
>  include/linux/hugetlb.h                 |  6 ++--
>  include/linux/memcontrol.h              |  8 +++++
>  kernel/cgroup/cgroup.c                  | 15 ++++++++-
>  mm/hugetlb.c                            | 23 ++++++++++----
>  mm/memcontrol.c                         | 41 +++++++++++++++++++++++++
>  8 files changed, 99 insertions(+), 10 deletions(-)
>
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 622a7f28db1f..e6267b8cbd1d 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -210,6 +210,15 @@ cgroup v2 currently supports the following mount options.
>            relying on the original semantics (e.g. specifying bogusly
>            high 'bypass' protection values at higher tree levels).
>
> +  memory_hugetlb_accounting
> +        Count hugetlb memory usage towards the cgroup's overall
> +        memory usage for the memory controller. This is a new behavior
> +        that could regress existing setups, so it must be explicitly
> +        opted in with this mount option. Note that hugetlb pages
> +        allocated while this option is not selected will not be
> +        tracked by the memory controller (even if cgroup v2 is
> +        remounted later on).
> +
>
>  Organizing Processes and Threads
>  --------------------------------
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 60fce26ff937..034967319955 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -902,7 +902,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
>                  * to keep reservation accounting consistent.
>                  */
>                 hugetlb_set_vma_policy(&pseudo_vma, inode, index);
> -               folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0);
> +               folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0, true);
>                 hugetlb_drop_vma_policy(&pseudo_vma);
>                 if (IS_ERR(folio)) {
>                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index f1b3151ac30b..8641f4320c98 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -115,6 +115,11 @@ enum {
>          * Enable recursive subtree protection
>          */
>         CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),
> +
> +       /*
> +        * Enable hugetlb accounting for the memory controller.
> +        */
> +       CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
>  };
>
>  /* cftype->flags */
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index a30686e649f7..9b73db1605a2 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -713,7 +713,8 @@ struct huge_bootmem_page {
>
>  int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
>  struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> -                               unsigned long addr, int avoid_reserve);
> +                               unsigned long addr, int avoid_reserve,
> +                               bool restore_reserve_on_memcg_failure);
>  struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
>                                 nodemask_t *nmask, gfp_t gfp_mask);
>  struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
> @@ -1016,7 +1017,8 @@ static inline int isolate_or_dissolve_huge_page(struct page *page,
>
>  static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>                                                 unsigned long addr,
> -                                               int avoid_reserve)
> +                                               int avoid_reserve,
> +                                               bool restore_reserve_on_memcg_failure)
>  {
>         return NULL;
>  }
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index e0cfab58ab71..8094679c99dd 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -677,6 +677,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
>         return __mem_cgroup_charge(folio, mm, gfp);
>  }
>
> +int mem_cgroup_hugetlb_charge_folio(struct folio *folio, gfp_t gfp);
> +
>  int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
>                                    gfp_t gfp, swp_entry_t entry);
>  void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
> @@ -1251,6 +1253,12 @@ static inline int mem_cgroup_charge(struct folio *folio,
>         return 0;
>  }
>
> +static inline int mem_cgroup_hugetlb_charge_folio(struct folio *folio,
> +                                                  gfp_t gfp)
> +{
> +       return 0;
> +}
> +
>  static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
>                         struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
>  {
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 1fb7f562289d..f11488b18ceb 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1902,6 +1902,7 @@ enum cgroup2_param {
>         Opt_favordynmods,
>         Opt_memory_localevents,
>         Opt_memory_recursiveprot,
> +       Opt_memory_hugetlb_accounting,
>         nr__cgroup2_params
>  };
>
> @@ -1910,6 +1911,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
>         fsparam_flag("favordynmods",            Opt_favordynmods),
>         fsparam_flag("memory_localevents",      Opt_memory_localevents),
>         fsparam_flag("memory_recursiveprot",    Opt_memory_recursiveprot),
> +       fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
>         {}
>  };
>
> @@ -1936,6 +1938,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
>         case Opt_memory_recursiveprot:
>                 ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
>                 return 0;
> +       case Opt_memory_hugetlb_accounting:
> +               ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
> +               return 0;
>         }
>         return -EINVAL;
>  }
> @@ -1960,6 +1965,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
>                         cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
>                 else
>                         cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
> +
> +               if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
> +                       cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
> +               else
> +                       cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
>         }
>  }
>
> @@ -1973,6 +1983,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
>         seq_puts(seq, ",memory_localevents");
>         if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
>                 seq_puts(seq, ",memory_recursiveprot");
> +       if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
> +               seq_puts(seq, ",memory_hugetlb_accounting");
>         return 0;
>  }
>
> @@ -7050,7 +7062,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
>                         "nsdelegate\n"
>                         "favordynmods\n"
>                         "memory_localevents\n"
> -                       "memory_recursiveprot\n");
> +                       "memory_recursiveprot\n"
> +                       "memory_hugetlb_accounting\n");
>  }
>  static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index de220e3ff8be..ff88ea4df11a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1902,6 +1902,7 @@ void free_huge_folio(struct folio *folio)
>                                          pages_per_huge_page(h), folio);
>         hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
>                                            pages_per_huge_page(h), folio);
> +       mem_cgroup_uncharge(folio);
>         if (restore_reserve)
>                 h->resv_huge_pages++;
>
> @@ -3004,7 +3005,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
>  }
>
>  struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> -                                 unsigned long addr, int avoid_reserve)
> +                                 unsigned long addr, int avoid_reserve,
> +                                 bool restore_reserve_on_memcg_failure)
>  {
>         struct hugepage_subpool *spool = subpool_vma(vma);
>         struct hstate *h = hstate_vma(vma);
> @@ -3119,6 +3121,15 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>                 hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
>                                 pages_per_huge_page(h), folio);
>         }
> +
> +       /* undo allocation if memory controller disallows it. */
> +       if (mem_cgroup_hugetlb_charge_folio(folio, GFP_KERNEL)) {
> +               if (restore_reserve_on_memcg_failure)
> +                       restore_reserve_on_error(h, vma, addr, folio);
> +               folio_put(folio);
> +               return ERR_PTR(-ENOMEM);
> +       }
> +
>         return folio;
>
>  out_uncharge_cgroup:
> @@ -5179,7 +5190,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>                                 spin_unlock(src_ptl);
>                                 spin_unlock(dst_ptl);
>                                 /* Do not use reserve as it's private owned */
> -                               new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
> +                               new_folio = alloc_hugetlb_folio(dst_vma, addr, 1, false);
>                                 if (IS_ERR(new_folio)) {
>                                         folio_put(pte_folio);
>                                         ret = PTR_ERR(new_folio);
> @@ -5656,7 +5667,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
>          * be acquired again before returning to the caller, as expected.
>          */
>         spin_unlock(ptl);
> -       new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
> +       new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve, true);
>
>         if (IS_ERR(new_folio)) {
>                 /*
> @@ -5930,7 +5941,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
>                                                         VM_UFFD_MISSING);
>                 }
>
> -               folio = alloc_hugetlb_folio(vma, haddr, 0);
> +               folio = alloc_hugetlb_folio(vma, haddr, 0, true);
>                 if (IS_ERR(folio)) {
>                         /*
>                          * Returning error will result in faulting task being
> @@ -6352,7 +6363,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
>                         goto out;
>                 }
>
> -               folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
> +               folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0, true);
>                 if (IS_ERR(folio)) {
>                         ret = -ENOMEM;
>                         goto out;
> @@ -6394,7 +6405,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
>                         goto out;
>                 }
>
> -               folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
> +               folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0, false);
>                 if (IS_ERR(folio)) {
>                         folio_put(*foliop);
>                         ret = -ENOMEM;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d1a322a75172..d5dfc9b36acb 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -7050,6 +7050,47 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
>         return ret;
>  }
>
> +static struct mem_cgroup *get_mem_cgroup_from_current(void)
> +{
> +       struct mem_cgroup *memcg;
> +
> +again:
> +       rcu_read_lock();
> +       memcg = mem_cgroup_from_task(current);
> +       if (!css_tryget(&memcg->css)) {
> +               rcu_read_unlock();
> +               goto again;
> +       }
> +       rcu_read_unlock();
> +       return memcg;
> +}
> +
> +/**
> + * mem_cgroup_hugetlb_charge_folio - Charge a newly allocated hugetlb folio.
> + * @folio: folio to charge.
> + * @gfp: reclaim mode
> + *
> + * This function charges an allocated hugetlb folio to the memcg of the
> + * current task.
> + *
> + * Returns 0 on success. Otherwise, an error code is returned.
> + */
> +int mem_cgroup_hugetlb_charge_folio(struct folio *folio, gfp_t gfp)
> +{
> +       struct mem_cgroup *memcg;
> +       int ret;
> +
> +       if (mem_cgroup_disabled() ||
> +           !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
> +               return 0;
> +
> +       memcg = get_mem_cgroup_from_current();
> +       ret = charge_memcg(folio, memcg, gfp);
> +       mem_cgroup_put(memcg);
> +
> +       return ret;
> +}
> +
>  /**
>   * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
>   * @folio: folio to charge.
> --
> 2.34.1
>
>

With the mount option added, I'm fine with this. There are reasons to want
this and reasons not to want it, so everybody's happy!

Out of curiosity: is anyone aware of any code that may behave badly when it
encounters folio_memcg(hugetlb_folio) != NULL without expecting it?

- Frank
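
For anyone who wants to try the new behavior, here is a minimal userspace
sketch of enabling the option via mount(2). The mount point and the choice
of a fresh mount rather than a remount are assumptions for illustration;
the shell equivalent would be
"mount -t cgroup2 -o memory_hugetlb_accounting none /sys/fs/cgroup":

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Pass the new option as fs-specific mount data. */
		if (mount("none", "/sys/fs/cgroup", "cgroup2", 0,
			  "memory_hugetlb_accounting")) {
			perror("mount cgroup2");
			return 1;
		}
		return 0;
	}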
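
And a rough sketch of checking the accounting from userspace: fault in one
hugetlb page and read the cgroup's memory.current, which should then include
the hugetlb usage. The cgroup path, the 2 MiB hugepage size, and the
assumptions that hugetlb pages were already reserved (e.g. via
/proc/sys/vm/nr_hugepages) and that the process already runs inside
/sys/fs/cgroup/test are illustrative, not part of the patch:

	#define _GNU_SOURCE	/* MAP_HUGETLB/MAP_ANONYMOUS on some libcs */
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 2UL << 20;	/* one 2 MiB hugetlb page */
		char buf[64];
		FILE *f;

		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap(MAP_HUGETLB)");
			return 1;
		}
		memset(p, 0, len);	/* fault the page in; the memcg charge happens here */

		f = fopen("/sys/fs/cgroup/test/memory.current", "r");
		if (f && fgets(buf, sizeof(buf), f))
			printf("memory.current: %s", buf);
		if (f)
			fclose(f);

		munmap(p, len);
		return 0;
	}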