linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Dan Williams <dan.j.williams@intel.com>
To: Rakie Kim <rakie.kim@sk.com>, Dan Williams <dan.j.williams@intel.com>
Cc: <gourry@gourry.net>, <linux-mm@kvack.org>,
	<linux-kernel@vger.kernel.org>, <linux-cxl@vger.kernel.org>,
	<joshua.hahnjy@gmail.com>, <ying.huang@linux.alibaba.com>,
	<david@redhat.com>, <Jonathan.Cameron@huawei.com>,
	<osalvador@suse.de>, <kernel_team@skhynix.com>,
	<honggyu.kim@sk.com>, <yunjeong.mun@sk.com>, <rakie.kim@sk.com>,
	<akpm@linux-foundation.org>
Subject: Re: [PATCH v7 2/3] mm/mempolicy: Prepare weighted interleave sysfs for memory hotplug
Date: Fri, 11 Apr 2025 15:24:01 -0700	[thread overview]
Message-ID: <67f99681d3e60_720529435@dwillia2-xfh.jf.intel.com.notmuch> (raw)
In-Reply-To: <20250411072135.588-1-rakie.kim@sk.com>

Rakie Kim wrote:
> On Tue, 8 Apr 2025 20:54:48 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> > Dan Williams wrote:
> > > >  
> > > > +struct sysfs_wi_group {
> > > > +	struct kobject wi_kobj;
> > > > +	struct iw_node_attr *nattrs[];
> > > > +};
> > > > +
> > > > +static struct sysfs_wi_group *wi_group;
> > > > +
> > > >  static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
> > > >  			 char *buf)
> > > >  {
> > > > @@ -3461,27 +3468,24 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> > > >  	return count;
> > > >  }
> > > >  
> > > > -static struct iw_node_attr **node_attrs;
> > > > -
> > > > -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> > > > -				  struct kobject *parent)
> > > > +static void sysfs_wi_node_delete(int nid)
> > > >  {
> > > > -	if (!node_attr)
> > > > +	if (!wi_group->nattrs[nid])
> > > >  		return;
> > > > -	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> > > > -	kfree(node_attr->kobj_attr.attr.name);
> > > > -	kfree(node_attr);
> > > > +
> > > > +	sysfs_remove_file(&wi_group->wi_kobj,
> > > > +			  &wi_group->nattrs[nid]->kobj_attr.attr);
> > > 
> > > This still looks broken to me, but I think this is more a problem that
> > > was present in the original code.
> > > 
> > > At this point @wi_group's reference count is zero because
> > > sysfs_wi_release() has been called. However, it can only be zero if it has
> > > properly transitioned through kobject_del() and final kobject_put(). It
> > > follows that kobject_del() arranges for kobj->sd to be NULL. That means
> > > that this *should* be hitting the WARN() in kernfs_remove_by_name_ns()
> > > for the !parent case.
> > > 
> > > So, either you are not triggering that path, or testing that path, but
> > > > sysfs_remove_file() of the child attributes should be happening *before*
> > > sysfs_wi_release().
> > > 
> > > Did I miss something?
> > 
> > I think the missing change is that sysfs_wi_node_add() failures need to
> > be done with a sysfs_wi_node_delete() of the added attrs *before* the
> > kobject_del() of @wi_group.
> 
> Hi Dan,
> 
> Thank you for pointing out this issue.
> 
> As you suggested, I believe the most appropriate way to handle this is
> to incorporate your feedback into Patch 1 
> (mm/mempolicy: Fix memory leaks in weighted interleave sysfs).
> 
> To ensure that sysfs_remove_file() is called before kobject_del(), I
> have restructured the code as follows:
> 
> <Previously>
> static void sysfs_wi_release(struct kobject *wi_kobj)
> {
> 	int nid;
> 
> 	for (nid = 0; nid < nr_node_ids; nid++)
> 		sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
> 		-> ERROR: sysfs_remove_file called here
> 	kfree(node_attrs);
> 	kfree(wi_kobj);
> }
> 
> <Now>
> static void sysfs_wi_node_delete_all(struct kobject *wi_kobj)
> {
> 	int nid;
> 
> 	for (nid = 0; nid < nr_node_ids; nid++)
> 		sysfs_wi_node_delete(node_attrs[nid], wi_kobj);

At this point the nodes were live, which means userspace could have
triggered an iw_table update. So I would expect that, after all node
files have been deleted, this function frees the iw_table.

> 		-> sysfs_remove_file called here

Call iw_table_free() after the loop, where that is something like below
(untested!):

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b28a1e6ae096..88538f23c7d4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3430,6 +3430,28 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
 	return sysfs_emit(buf, "%d\n", weight);
 }
 
+static void iw_table_install(u8 *new, struct iw_node_attr *node_attr, u8 weight)
+{
+	u8 *old;
+
+	mutex_lock(&iw_table_lock);
+	old = rcu_dereference_protected(iw_table,
+					lockdep_is_held(&iw_table_lock));
+	if (old && new)
+		memcpy(new, old, nr_node_ids);
+	if (new)
+		new[node_attr->nid] = weight;
+	rcu_assign_pointer(iw_table, new);
+	mutex_unlock(&iw_table_lock);
+	synchronize_rcu();
+	kfree(old);
+}
+
+static void iw_table_free(void)
+{
+	iw_table_install(NULL, NULL, 0);
+}
+
 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
 			  const char *buf, size_t count)
 {
@@ -3447,17 +3469,8 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
 	new = kzalloc(nr_node_ids, GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
+	iw_table_install(new, node_attr, weight);
 
-	mutex_lock(&iw_table_lock);
-	old = rcu_dereference_protected(iw_table,
-					lockdep_is_held(&iw_table_lock));
-	if (old)
-		memcpy(new, old, nr_node_ids);
-	new[node_attr->nid] = weight;
-	rcu_assign_pointer(iw_table, new);
-	mutex_unlock(&iw_table_lock);
-	synchronize_rcu();
-	kfree(old);
 	return count;
 }
 
@@ -3550,15 +3563,6 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
 
 static void mempolicy_kobj_release(struct kobject *kobj)
 {
-	u8 *old;
-
-	mutex_lock(&iw_table_lock);
-	old = rcu_dereference_protected(iw_table,
-					lockdep_is_held(&iw_table_lock));
-	rcu_assign_pointer(iw_table, NULL);
-	mutex_unlock(&iw_table_lock);
-	synchronize_rcu();
-	kfree(old);
 	kfree(node_attrs);
 	kfree(kobj);
 }

> }
> 
> static void sysfs_wi_release(struct kobject *wi_kobj)
> {
> 	kfree(node_attrs);
> 	kfree(wi_kobj);
> }
> 
> In addition, I call sysfs_wi_node_delete_all() before kobject_del()
> during error handling:
> 
> +err_cleanup_kobj:
> +	sysfs_wi_node_delete_all(wi_kobj);
> 	kobject_del(wi_kobj);
> 
> I believe this resolves the issue you raised.

Yes, along with the iw_table_free() change because while it is not a
leak, it is awkward that mempolicy_kobj_release arranges to keep
iw_table allocated long past the time the node attributes have been
deleted and shut down in sysfs.

> That said, I have a follow-up question. With this structure, when the
> system is shutting down, sysfs_remove_file() will not be called. Based
> on my review of other kernel subsystems, it seems that sysfs_remove_file()
> is only called during module_exit() in driver code, and not in other
> built-in subsystems.

Correct.

> Is this an acceptable practice? If you happen to know the expected
> behavior in such cases, I would appreciate your insights.

Yes, there are plenty of examples of sysfs infrastructure that gets set
up, but never torn down for the life of the kernel. The goal here is to
make the error unwind path correct and make the code clean for potentially
deleting mempolicy_kobj infrastructure in the future, but it is
otherwise ok if the only path that calls kobject_del() for an object is
the error unwind path.

> 
> Below is the full content of the updated Patch 1.
> @@ -3463,8 +3463,8 @@ static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
>  
>  static struct iw_node_attr **node_attrs;
>  
> -static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> -                                 struct kobject *parent)
> +static void sysfs_wi_node_delete(struct iw_node_attr *node_attr,
> +                                struct kobject *parent)
>  {
>         if (!node_attr)
>                 return;
> @@ -3473,13 +3473,16 @@ static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
>         kfree(node_attr);
>  }
>  
> -static void sysfs_wi_release(struct kobject *wi_kobj)
> +static void sysfs_wi_node_delete_all(struct kobject *wi_kobj)
>  {
> -       int i;
> +       int nid;
>  
> -       for (i = 0; i < nr_node_ids; i++)
> -               sysfs_wi_node_release(node_attrs[i], wi_kobj);
> +       for (nid = 0; nid < nr_node_ids; nid++)
> +               sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
> +}
>  
> +static void sysfs_wi_release(struct kobject *wi_kobj)
> +{
>         kfree(node_attrs);
>         kfree(wi_kobj);
>  }
> @@ -3547,13 +3550,14 @@ static int add_weighted_interleave_group(struct kobject *root_kobj)
>                 err = add_weight_node(nid, wi_kobj);
>                 if (err) {
>                         pr_err("failed to add sysfs [node%d]\n", nid);
> -                       goto err_del_kobj;
> +                       goto err_cleanup_kobj;
>                 }
>         }
>  
>         return 0;
>  
> -err_del_kobj:
> +err_cleanup_kobj:
> +       sysfs_wi_node_delete_all(wi_kobj);
>         kobject_del(wi_kobj);
>  err_put_kobj:
>         kobject_put(wi_kobj);
> 
> Thank you again for your helpful feedback.

Hey, thanks for the patience to get this all fixed up properly.


  reply	other threads:[~2025-04-11 22:24 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-08  7:32 [PATCH v7 0/3] Enhance sysfs handling for memory hotplug in weighted interleave Rakie Kim
2025-04-08  7:32 ` [PATCH v7 1/3] mm/mempolicy: Fix memory leaks in weighted interleave sysfs Rakie Kim
2025-04-08 13:45   ` Joshua Hahn
2025-04-15 15:41   ` Jonathan Cameron
2025-04-08  7:32 ` [PATCH v7 2/3] mm/mempolicy: Prepare weighted interleave sysfs for memory hotplug Rakie Kim
2025-04-08 13:49   ` Joshua Hahn
2025-04-09  3:43   ` Dan Williams
2025-04-09  3:54     ` Dan Williams
2025-04-09  5:56       ` Rakie Kim
2025-04-09 18:51         ` Dan Williams
2025-04-10  7:53           ` Rakie Kim
2025-04-10  8:06             ` Rakie Kim
2025-04-11  3:11               ` Andrew Morton
2025-04-11  7:21       ` Rakie Kim
2025-04-11 22:24         ` Dan Williams [this message]
2025-04-08  7:32 ` [PATCH v7 3/3] mm/mempolicy: Support memory hotplug in weighted interleave Rakie Kim
2025-04-08 13:52   ` Joshua Hahn
2025-04-08 14:45   ` Gregory Price
2025-04-09  9:05   ` David Hildenbrand
2025-04-09 11:39     ` Honggyu Kim
2025-04-09 11:52       ` David Hildenbrand
2025-04-10  7:53         ` Rakie Kim
2025-04-10 13:25         ` Honggyu Kim
2025-04-10 13:41           ` David Hildenbrand
2025-04-15 16:00   ` Jonathan Cameron
2025-04-16  4:04     ` Honggyu Kim
2025-04-16  7:37       ` Honggyu Kim
2025-04-16  7:49       ` Rakie Kim

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=67f99681d3e60_720529435@dwillia2-xfh.jf.intel.com.notmuch \
    --to=dan.j.williams@intel.com \
    --cc=Jonathan.Cameron@huawei.com \
    --cc=akpm@linux-foundation.org \
    --cc=david@redhat.com \
    --cc=gourry@gourry.net \
    --cc=honggyu.kim@sk.com \
    --cc=joshua.hahnjy@gmail.com \
    --cc=kernel_team@skhynix.com \
    --cc=linux-cxl@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=osalvador@suse.de \
    --cc=rakie.kim@sk.com \
    --cc=ying.huang@linux.alibaba.com \
    --cc=yunjeong.mun@sk.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox