On Wed, Sep 27, 2023 at 5:57 PM Nhat Pham wrote:
> Currently, hugetlb memory usage is not accounted for in the memory
> controller, which could lead to memory overprotection for cgroups with
> hugetlb-backed memory. This has been observed in our production system.
>
> This patch rectifies this issue by charging the memcg when the hugetlb
> folio is allocated, and uncharging when the folio is freed (analogous to
> the hugetlb controller).
>
> Signed-off-by: Nhat Pham
> ---
>  Documentation/admin-guide/cgroup-v2.rst |  9 ++++++
>  fs/hugetlbfs/inode.c                    |  2 +-
>  include/linux/cgroup-defs.h             |  5 +++
>  include/linux/hugetlb.h                 |  6 ++--
>  include/linux/memcontrol.h              |  8 +++++
>  kernel/cgroup/cgroup.c                  | 15 ++++++++-
>  mm/hugetlb.c                            | 23 ++++++++++----
>  mm/memcontrol.c                         | 41 +++++++++++++++++++++++++
>  8 files changed, 99 insertions(+), 10 deletions(-)
>
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 622a7f28db1f..e6267b8cbd1d 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -210,6 +210,15 @@ cgroup v2 currently supports the following mount options.
>            relying on the original semantics (e.g. specifying bogusly
>            high 'bypass' protection values at higher tree levels).
>
> +  memory_hugetlb_accounting
> +        Count hugetlb memory usage towards the cgroup's overall
> +        memory usage for the memory controller. This is a new behavior
> +        that could regress existing setups, so it must be explicitly
> +        opted in with this mount option. Note that hugetlb pages
> +        allocated while this option is not selected will not be
> +        tracked by the memory controller (even if cgroup v2 is
> +        remounted later on).
> +
>
>  Organizing Processes and Threads
>  --------------------------------
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 60fce26ff937..034967319955 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -902,7 +902,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
>                  * to keep reservation accounting consistent.
>                  */
>                 hugetlb_set_vma_policy(&pseudo_vma, inode, index);
> -               folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0);
> +               folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0, true);
>                 hugetlb_drop_vma_policy(&pseudo_vma);
>                 if (IS_ERR(folio)) {
>                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index f1b3151ac30b..8641f4320c98 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -115,6 +115,11 @@ enum {
>          * Enable recursive subtree protection
>          */
>         CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),
> +
> +       /*
> +        * Enable hugetlb accounting for the memory controller.
> +        */
> +       CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
>  };
>
>  /* cftype->flags */
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index a30686e649f7..9b73db1605a2 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -713,7 +713,8 @@ struct huge_bootmem_page {
>
>  int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
>  struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> -                               unsigned long addr, int avoid_reserve);
> +                               unsigned long addr, int avoid_reserve,
> +                               bool restore_reserve_on_memcg_failure);
>  struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
>                                 nodemask_t *nmask, gfp_t gfp_mask);
>  struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
> @@ -1016,7 +1017,8 @@ static inline int isolate_or_dissolve_huge_page(struct page *page,
>
>  static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>                                                 unsigned long addr,
> -                                               int avoid_reserve)
> +                                               int avoid_reserve,
> +                                               bool restore_reserve_on_memcg_failure)
>  {
>         return NULL;
>  }
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index e0cfab58ab71..8094679c99dd 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -677,6 +677,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
>         return __mem_cgroup_charge(folio, mm, gfp);
>  }
>
> +int mem_cgroup_hugetlb_charge_folio(struct folio *folio, gfp_t gfp);
> +
>  int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
>                                    gfp_t gfp, swp_entry_t entry);
>  void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
> @@ -1251,6 +1253,12 @@ static inline int mem_cgroup_charge(struct folio *folio,
>         return 0;
>  }
>
> +static inline int mem_cgroup_hugetlb_charge_folio(struct folio *folio,
> +                                                  gfp_t gfp)
> +{
> +       return 0;
> +}
> +
>  static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
>                         struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
>  {
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 1fb7f562289d..f11488b18ceb 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1902,6 +1902,7 @@ enum cgroup2_param {
>         Opt_favordynmods,
>         Opt_memory_localevents,
>         Opt_memory_recursiveprot,
> +       Opt_memory_hugetlb_accounting,
>         nr__cgroup2_params
>  };
>
> @@ -1910,6 +1911,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
>         fsparam_flag("favordynmods",            Opt_favordynmods),
>         fsparam_flag("memory_localevents",      Opt_memory_localevents),
>         fsparam_flag("memory_recursiveprot",    Opt_memory_recursiveprot),
> +       fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
>         {}
>  };
>
> @@ -1936,6 +1938,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
>         case Opt_memory_recursiveprot:
>                 ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
>                 return 0;
> +       case Opt_memory_hugetlb_accounting:
> +               ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
> +               return 0;
>         }
>         return -EINVAL;
>  }
> @@ -1960,6 +1965,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
>                         cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
>                 else
>                         cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
> +
> +               if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
> +                       cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
> +               else
> +                       cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
>         }
>  }
>
> @@ -1973,6 +1983,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
>         seq_puts(seq, ",memory_localevents");
>         if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
>                 seq_puts(seq, ",memory_recursiveprot");
> +       if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
> +               seq_puts(seq, ",memory_hugetlb_accounting");
>         return 0;
>  }
>
> @@ -7050,7 +7062,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
>                         "nsdelegate\n"
>                         "favordynmods\n"
>                         "memory_localevents\n"
> -                       "memory_recursiveprot\n");
> +                       "memory_recursiveprot\n"
> +                       "memory_hugetlb_accounting\n");
>  }
>  static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index de220e3ff8be..ff88ea4df11a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1902,6 +1902,7 @@ void free_huge_folio(struct folio *folio)
>                                          pages_per_huge_page(h), folio);
>         hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
>                                            pages_per_huge_page(h), folio);
> +       mem_cgroup_uncharge(folio);
>         if (restore_reserve)
>                 h->resv_huge_pages++;
>
> @@ -3004,7 +3005,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
>  }
>
>  struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> -                                 unsigned long addr, int avoid_reserve)
> +                                 unsigned long addr, int avoid_reserve,
> +                                 bool restore_reserve_on_memcg_failure)
>  {
>         struct hugepage_subpool *spool = subpool_vma(vma);
>         struct hstate *h = hstate_vma(vma);
> @@ -3119,6 +3121,15 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>                 hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
>                                 pages_per_huge_page(h), folio);
>         }
> +
> +       /* undo allocation if memory controller disallows it. */
> +       if (mem_cgroup_hugetlb_charge_folio(folio, GFP_KERNEL)) {
> +               if (restore_reserve_on_memcg_failure)
> +                       restore_reserve_on_error(h, vma, addr, folio);
> +               folio_put(folio);
> +               return ERR_PTR(-ENOMEM);
> +       }
> +
>         return folio;
>
>  out_uncharge_cgroup:
> @@ -5179,7 +5190,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>                                 spin_unlock(src_ptl);
>                                 spin_unlock(dst_ptl);
>                                 /* Do not use reserve as it's private owned */
> -                               new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
> +                               new_folio = alloc_hugetlb_folio(dst_vma, addr, 1, false);
>                                 if (IS_ERR(new_folio)) {
>                                         folio_put(pte_folio);
>                                         ret = PTR_ERR(new_folio);
> @@ -5656,7 +5667,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
>          * be acquired again before returning to the caller, as expected.
>          */
>         spin_unlock(ptl);
> -       new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
> +       new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve, true);
>
>         if (IS_ERR(new_folio)) {
>                 /*
> @@ -5930,7 +5941,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
>                                                         VM_UFFD_MISSING);
>                 }
>
> -               folio = alloc_hugetlb_folio(vma, haddr, 0);
> +               folio = alloc_hugetlb_folio(vma, haddr, 0, true);
>                 if (IS_ERR(folio)) {
>                         /*
>                          * Returning error will result in faulting task being
> @@ -6352,7 +6363,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
>                         goto out;
>                 }
>
> -               folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
> +               folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0, true);
>                 if (IS_ERR(folio)) {
>                         ret = -ENOMEM;
>                         goto out;
> @@ -6394,7 +6405,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
>                         goto out;
>                 }
>
> -               folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
> +               folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0, false);
>                 if (IS_ERR(folio)) {
>                         folio_put(*foliop);
>                         ret = -ENOMEM;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d1a322a75172..d5dfc9b36acb 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -7050,6 +7050,47 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
>         return ret;
>  }
>
> +static struct mem_cgroup *get_mem_cgroup_from_current(void)
> +{
> +       struct mem_cgroup *memcg;
> +
> +again:
> +       rcu_read_lock();
> +       memcg = mem_cgroup_from_task(current);
> +       if (!css_tryget(&memcg->css)) {
> +               rcu_read_unlock();
> +               goto again;
> +       }
> +       rcu_read_unlock();
> +       return memcg;
> +}
> +
> +/**
> + * mem_cgroup_hugetlb_charge_folio - Charge a newly allocated hugetlb folio.
> + * @folio: folio to charge.
> + * @gfp: reclaim mode
> + *
> + * This function charges an allocated hugetlb folio to the memcg of the
> + * current task.
> + *
> + * Returns 0 on success. Otherwise, an error code is returned.
> + */
> +int mem_cgroup_hugetlb_charge_folio(struct folio *folio, gfp_t gfp)
> +{
> +       struct mem_cgroup *memcg;
> +       int ret;
> +
> +       if (mem_cgroup_disabled() ||
> +           !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
> +               return 0;
> +
> +       memcg = get_mem_cgroup_from_current();
> +       ret = charge_memcg(folio, memcg, gfp);
> +       mem_cgroup_put(memcg);
> +
> +       return ret;
> +}
> +
>  /**
>   * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
>   * @folio: folio to charge.
> --
> 2.34.1
>
>

With the mount option added, I'm fine with this. There are reasons to want
this and reasons not to want it, so everybody's happy!

Out of curiosity: is anyone aware of any code that may behave badly when it
encounters folio_memcg(hugetlb_folio) != NULL without expecting it?

- Frank
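
For anyone who wants to try the new behavior, here is a minimal userspace
sketch of enabling the option via mount(2). The mount point and the choice
of a fresh mount rather than a remount are assumptions for illustration;
the shell equivalent would be
"mount -t cgroup2 -o memory_hugetlb_accounting none /sys/fs/cgroup":

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Pass the new option as fs-specific mount data. */
		if (mount("none", "/sys/fs/cgroup", "cgroup2", 0,
			  "memory_hugetlb_accounting")) {
			perror("mount cgroup2");
			return 1;
		}
		return 0;
	}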
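
And a rough sketch of checking the accounting from userspace: fault in one
hugetlb page and read the cgroup's memory.current, which should then include
the hugetlb usage. The cgroup path, the 2 MiB hugepage size, and the
assumptions that hugetlb pages were already reserved (e.g. via
/proc/sys/vm/nr_hugepages) and that the process already runs inside
/sys/fs/cgroup/test are illustrative, not part of the patch:

	#define _GNU_SOURCE	/* MAP_HUGETLB/MAP_ANONYMOUS on some libcs */
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 2UL << 20;	/* one 2 MiB hugetlb page */
		char buf[64];
		FILE *f;

		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap(MAP_HUGETLB)");
			return 1;
		}
		memset(p, 0, len);	/* fault the page in; the memcg charge happens here */

		f = fopen("/sys/fs/cgroup/test/memory.current", "r");
		if (f && fgets(buf, sizeof(buf), f))
			printf("memory.current: %s", buf);
		if (f)
			fclose(f);

		munmap(p, len);
		return 0;
	}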