commit dc32c8d4875fdd27b43d5c295853860f4d647055 Author: Bijan Tabatabai mm/damon/core: skip needless update of damon_attrs in damon_commit_ctx() This patch adds an optimization to avoid calling damon_set_attrs() when attributes haven't changed, preventing unnecessary resets of timing fields that can delay aggregation and ops update operations.
Link: https://lkml.kernel.org/r/20250807001924.76275-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20250806234254.10572-1-bijan311@gmail.com
> diff --git a/mm/damon/core.c b/mm/damon/core.c
> index 106ee8b0f2d5..52ecc3a4426f 100644
> --- a/mm/damon/core.c
> +++ b/mm/damon/core.c
> @@ -570,6 +570,23 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
> kfree(ctx);
> }
>
> +static bool damon_attrs_equals(const struct damon_attrs *attrs1,
> + const struct damon_attrs *attrs2)
> +{
> + const struct damon_intervals_goal *ig1 = &attrs1->intervals_goal;
> + const struct damon_intervals_goal *ig2 = &attrs2->intervals_goal;
> +
> + return attrs1->sample_interval == attrs2->sample_interval &&
> + attrs1->aggr_interval == attrs2->aggr_interval &&
> + attrs1->ops_update_interval == attrs2->ops_update_interval &&
> + attrs1->min_nr_regions == attrs2->min_nr_regions &&
> + attrs1->max_nr_regions == attrs2->max_nr_regions &&
> + ig1->access_bp == ig2->access_bp &&
> + ig1->aggrs == ig2->aggrs &&
> + ig1->min_sample_us == ig2->min_sample_us &&
> + ig1->max_sample_us == ig2->max_sample_us;
> +}
Does this comparison miss the aggr_samples field? The damon_attrs struct contains an aggr_samples field that is used by damon_set_attrs() and kdamond_tune_intervals() for timing calculations. If only aggr_samples differs between the two attrs structs, this function would return true when the attributes are actually different, causing damon_set_attrs() to be incorrectly skipped. --clm false positive, SJ explained there's a comment explaining this, but claude didn't quite parse it right.
> +
> static unsigned int damon_age_for_new_attrs(unsigned int age,
> struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
> {
> @@ -1222,9 +1239,11 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
> * 2. ops update should be done after pid handling is done (target
> * committing require putting pids).
> */
> - err = damon_set_attrs(dst, &src->attrs);
> - if (err)
> - return err;
> + if (!damon_attrs_equals(&dst->attrs, &src->attrs)) {
> + err = damon_set_attrs(dst, &src->attrs);
> + if (err)
> + return err;
> + }
> dst->ops = src->ops;
>
> return 0;
commit 7663e963a51122792811811c8119fd55c9ab254a Author: Jeff Layton nfsd: track original timestamps in nfs4_delegation This commit stores file timestamps in the nfs4_delegation structure at delegation grant time to support RFC 9754 requirements. It adds three timespec64 fields and modifies delegation logic to capture timestamps.
Link: https://lore.kernel.org/linux-nfs/47a4e40310e797f21b5137e847b06bb203d99e66.camel@kernel.org/ > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index 77eea2ad93cc..8737b721daf3 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -6157,7 +6157,8 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, > path.dentry = file_dentry(nf->nf_file); > > rc = vfs_getattr(&path, stat, > - (STATX_MODE | STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), > + STATX_MODE | STATX_SIZE | STATX_ATIME | > + STATX_MTIME | STATX_CTIME | STATX_CHANGE_COOKIE, > AT_STATX_SYNC_AS_STAT); > > nfsd_file_put(nf); > @@ -6274,10 +6275,14 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, > OPEN_DELEGATE_WRITE; > dp->dl_cb_fattr.ncf_cur_fsize = stat.size; > dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); > + dp->dl_atime = stat.atime; > + dp->dl_ctime = stat.ctime; > + dp->dl_mtime = stat.mtime; > trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); > } else { > - open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_READ_ATTRS_DELEG : > - OPEN_DELEGATE_READ; > + open->op_delegate_type = deleg_ts && nfs4_delegation_stat(dp, currentfh, &stat) ? > + OPEN_DELEGATE_READ_ATTRS_DELEG : OPEN_DELEGATE_READ; > + dp->dl_atime = stat.atime; When nfs4_delegation_stat() fails in the read delegation path, can this assignment use uninitialized data from the stat structure? The stat variable is declared on the stack and if vfs_getattr() fails within nfs4_delegation_stat(), the atime field would contain undefined values. > trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); > } > nfs4_put_stid(&dp->dl_stid); > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index 8adc2550129e..ce7c0d129ba3 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -224,6 +224,11 @@ struct nfs4_delegation { > > /* for CB_GETATTR */ > struct nfs4_cb_fattr dl_cb_fattr; > + > + /* For delegated timestamps */ > + struct timespec64 dl_atime; > + struct timespec64 dl_mtime; > + struct timespec64 dl_ctime; > }; > > static inline bool deleg_is_read(u32 dl_type) commit b93128f29733af5d427a335978a19884c2c230e2 Author: Trond Myklebust NFSv4.2: Serialise O_DIRECT i/o and fallocate() This patch adds a call to nfs_file_block_o_direct() before nfs_sync_inode() in nfs42_proc_fallocate() to ensure all O_DIRECT reads and writes complete before calling fallocate operations, preventing races with attribute updates. > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > index e2fea37c5348..1a169372ca16 100644 > --- a/fs/nfs/nfs42proc.c > +++ b/fs/nfs/nfs42proc.c > @@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, > exception.inode = inode; > exception.state = lock->open_context->state; > > + nfs_file_block_o_direct(NFS_I(inode)); Can nfs_file_block_o_direct() be called here without holding inode->i_rwsem? The function definition in fs/nfs/internal.h has this comment: /* Must be called with exclusively locked inode->i_rwsem */ static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) { if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { clear_bit(NFS_INO_ODIRECT, &nfsi->flags); inode_dio_wait(&nfsi->vfs_inode); } } All other callers in the NFS code acquire the lock first. For example, nfs_start_io_write() calls down_write_killable(&inode->i_rwsem) before calling nfs_file_block_o_direct(), and nfs_start_io_read() does the same. 
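For reference, the lock-then-block convention those callers follow, condensed into a minimal sketch (paraphrasing the nfs_start_io_write() pattern described above; this is not the actual fs/nfs/io.c code):

/* Sketch only: take i_rwsem exclusively first, then block O_DIRECT. */
static int sketch_start_io_write(struct inode *inode)
{
	int err = down_write_killable(&inode->i_rwsem);	/* exclusive i_rwsem */

	if (err)
		return err;

	nfs_file_block_o_direct(NFS_I(inode));	/* the comment's precondition now holds */
	return 0;
}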
The VFS fallocate path (vfs_fallocate()->nfs42_fallocate()->nfs42_proc_ fallocate()) only calls file_start_write() which acquires sb->s_writers but not inode->i_rwsem. > err = nfs_sync_inode(inode); > if (err) > goto out; commit d442670c0f63c46b7f348f68fb2002af597708f2 Author: Jonathan Curley NFSv4/flexfiles: Add data structure support for striped layouts Adds a new struct nfs4_ff_layout_ds_stripe that represents a data server stripe within a layout. A new dynamically allocated array of this type has been added to nfs4_ff_layout_mirror and per stripe configuration information has been moved from the mirror type to the stripe based on the RFC. > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c > index 2ad371faab98..debdc8952ab0 100644 > --- a/fs/nfs/flexfilelayout/flexfilelayout.c > +++ b/fs/nfs/flexfilelayout/flexfilelayout.c [ ... ] > @@ -452,10 +459,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, > goto out_err_free; > } > > - fls->mirror_array[i]->ds_count = ds_count; > + fls->mirror_array[i]->dss_count = dss_count; > + fls->mirror_array[i]->dss = > + kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe), > + gfp_flags); > > /* deviceid */ > - rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); > + rc = decode_deviceid(&stream, &fls->mirror_array[i]->dss[dss_id].devid); Can this dereference NULL in ff_layout_alloc_lseg()? The kcalloc() call above can return NULL under memory pressure, but there's no check before accessing fls->mirror_array[i]->dss[dss_id].devid in the decode_deviceid() call. All other memory allocations in this function check for NULL and jump to out_err_free on failure. > if (rc) > goto out_err_free; > [ ... ] commit 8c48e1c7520321cc87ff651e96093e2f412785fb Author: Stefan Metzmacher smb: client: don't wait for info->send_pending == 0 on error This commit addresses an issue where smbd_destroy() would wait indefinitely for send_pending to reach zero after ib_drain_qp() is called. The patch removes the problematic wait and modifies smbd_send() to exit early when the socket is disconnected. > diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c > index c628e91c328b..02d6db431fd4 100644 > --- a/fs/smb/client/smbdirect.c > +++ b/fs/smb/client/smbdirect.c > @@ -1337,10 +1337,6 @@ void smbd_destroy(struct TCP_Server_Info *server) > log_rdma_event(INFO, "cancelling idle timer\n"); > cancel_delayed_work_sync(&info->idle_timer_work); > > - log_rdma_event(INFO, "wait for all send posted to IB to finish\n"); > - wait_event(info->wait_send_pending, > - atomic_read(&info->send_pending) == 0); > - Can smbd_destroy() proceed safely without waiting for send operations to complete? The send_done() callback accesses the info structure after this point: In send_done(), when wc->status != IB_WC_SUCCESS (which happens after ib_drain_qp() above), the function calls smbd_disconnect_rdma_connection() but skips the atomic_dec_and_test(&info->send_pending) call. However, when the operation succeeds, send_done() does call: if (atomic_dec_and_test(&info->send_pending)) wake_up(&info->wait_send_pending); After ib_drain_qp() triggers completions with IB_WC_WR_FLUSH_ERR status, these send_done() callbacks can execute after smbd_destroy() has called kfree(info). When send_done() then tries to access info->send_pending, does this create a use-after-free condition? 
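To make the accounting easier to follow, here is the send_done() behaviour described above condensed into a sketch (simplified, with the connection passed in directly; not the literal smbdirect.c code):

/* Sketch of the completion accounting discussed above. */
static void sketch_send_done(struct smbd_connection *info, const struct ib_wc *wc)
{
	if (wc->status != IB_WC_SUCCESS) {
		/* Flushed/errored send: send_pending is not decremented on this path. */
		smbd_disconnect_rdma_connection(info);
		return;
	}

	if (atomic_dec_and_test(&info->send_pending))
		wake_up(&info->wait_send_pending);
}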
> /* It's not possible for upper layer to get to reassembly */ > log_rdma_event(INFO, "drain the reassembly queue\n"); > do { > @@ -1986,7 +1982,11 @@ int smbd_send(struct TCP_Server_Info *server, > */ > > wait_event(info->wait_send_pending, > - atomic_read(&info->send_pending) == 0); > + atomic_read(&info->send_pending) == 0 || > + sc->status != SMBDIRECT_SOCKET_CONNECTED); > + > + if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0) > + rc = -EAGAIN; > > return rc; > } commit 1f7168b28f667c0920fd83151ae35ce593950663 Author: André Almeida ovl: Ensure that all layers have the same encoding This patch adds encoding consistency validation to overlayfs when casefold is enabled. It introduces ovl_set_encoding() to copy encoding settings from the first layer and validates all layers have matching encoding. > diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c > index df85a76597e9..548ea9b1a48d 100644 > --- a/fs/overlayfs/super.c > +++ b/fs/overlayfs/super.c > @@ -991,6 +991,25 @@ static int ovl_get_data_fsid(struct ovl_fs *ofs) > return ofs->numfs; > } > > +/* > + * Set the ovl sb encoding as the same one used by the first layer > + */ > +static int ovl_set_encoding(struct super_block *sb, struct super_block *fs_sb) > +{ > + if (!sb_has_encoding(fs_sb)) > + return 0; > + > +#if IS_ENABLED(CONFIG_UNICODE) > + if (sb_has_strict_encoding(fs_sb)) { > + pr_err("strict encoding not supported\n"); > + return -EINVAL; > + } > + > + sb->s_encoding = fs_sb->s_encoding; > + sb->s_encoding_flags = fs_sb->s_encoding_flags; ^^^^ Will this leak the unicode_map on overlayfs unmount? Other filesystems like shmem and ext4 call utf8_unload(sb->s_encoding) in their put_super operations, but ovl_put_super()->ovl_free_fs() has no utf8_unload() call. The unicode_map appears to be reference counted via utf8_load()/utf8_unload() pairs in other parts of the kernel, but this code just copies the pointer without taking a proper reference. > +#endif > + return 0; > +} > > static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, > struct ovl_fs_context *ctx, struct ovl_layer *layers) commit 8004ea01cf6338298e0c6ab055bc3ec659ce381b Author: Babu Moger fs/resctrl: Introduce the interface to switch between monitor modes This patch introduces mbm_assign_mode resctrl file to switch between mbm_event and default monitoring modes. It also resets MBM event counters automatically when the mode changes. Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com > diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst > index f60f6a96cb6b..006d23af66e1 100644 > --- a/Documentation/filesystems/resctrl.rst > +++ b/Documentation/filesystems/resctrl.rst [ ... ] > diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c > index f388dbcdbdcd..50c24460d992 100644 > --- a/fs/resctrl/monitor.c > +++ b/fs/resctrl/monitor.c [ ... 
] > +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, > + size_t nbytes, loff_t off) > +{ > + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); > + struct rdt_mon_domain *d; > + int ret = 0; > + bool enable; > + > + /* Valid input requires a trailing newline */ > + if (nbytes == 0 || buf[nbytes - 1] != '\n') > + return -EINVAL; > + > + buf[nbytes - 1] = '\0'; > + > + cpus_read_lock(); > + mutex_lock(&rdtgroup_mutex); > + > + rdt_last_cmd_clear(); > + > + if (!strcmp(buf, "default")) { > + enable = 0; > + } else if (!strcmp(buf, "mbm_event")) { > + if (r->mon.mbm_cntr_assignable) { > + enable = 1; > + } else { > + ret = -EINVAL; > + rdt_last_cmd_puts("mbm_event mode is not supported\n"); > + goto out_unlock; > + } > + } else { > + ret = -EINVAL; > + rdt_last_cmd_puts("Unsupported assign mode\n"); > + goto out_unlock; > + } > + > + if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) { > + ret = resctrl_arch_mbm_cntr_assign_set(r, enable); > + if (ret) > + goto out_unlock; > + > + /* Update the visibility of BMEC related files */ > + resctrl_bmec_files_show(r, NULL, !enable); > + > + /* > + * Initialize the default memory transaction values for > + * total and local events. > + */ > + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) > + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; > + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) > + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & > + (READS_TO_LOCAL_MEM | > + READS_TO_LOCAL_S_MEM | > + NON_TEMP_WRITE_TO_LOCAL_MEM); > + /* Enable auto assignment when switching to "mbm_event" mode */ > + if (enable) > + r->mon.mbm_assign_on_mkdir = true; > + /* > + * Reset all the non-achitectural RMID state and assignable counters. > + */ > + list_for_each_entry(d, &r->mon_domains, hdr.list) { > + mbm_cntr_free_all(r, d); > + resctrl_reset_rmid_all(r, d); > + } > + } > + > +out_unlock: > + mutex_lock(&rdtgroup_mutex); ^^^^ Can this cause a deadlock? The function already acquired rdtgroup_mutex at the beginning, so attempting to lock it again here would block indefinitely. Should this be mutex_unlock(&rdtgroup_mutex) instead? > + cpus_read_unlock(); > + > + return ret ?: nbytes; > +} [ ... ] commit aab2c5088cdb26e80d51ffbe72d24ab23fa1533e Author: Babu Moger fs/resctrl: Add the functionality to unassign MBM events This patch adds counter cleanup functionality for MBM events by implementing rdtgroup_unassign_cntrs() and supporting functions to free assigned counters when resource groups are deleted. Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com > diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h > index 762705d7eb8d..c6b66d4a6a37 100644 > --- a/fs/resctrl/internal.h > +++ b/fs/resctrl/internal.h > @@ -398,6 +398,8 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_fil > > void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); > > +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); > + The rdtgroup_unassign_cntrs() function is declared but where should it be called? Looking at rdtgroup_assign_cntrs() which is called during group creation, shouldn't rdtgroup_unassign_cntrs() be called during group deletion in rdtgroup_rmdir_ctrl() and rdtgroup_rmdir_mon()? 
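A sketch of the kind of callsite the question is asking about (hypothetical placement and simplified signature; this hunk is not part of the patch):

/* Hypothetical rmdir-side teardown mirroring rdtgroup_assign_cntrs() at creation time. */
static void sketch_rmdir_teardown(struct rdtgroup *rdtgrp)
{
	/* Release any MBM counters assigned to this group before it is deleted. */
	rdtgroup_unassign_cntrs(rdtgrp);

	/* ... the existing rdtgroup_rmdir_mon()/rdtgroup_rmdir_ctrl() cleanup would continue here ... */
}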
> diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c > index 106e9bdb8a9d..2ed29ae831a4 100644 > --- a/fs/resctrl/monitor.c > +++ b/fs/resctrl/monitor.c > @@ -405,6 +405,14 @@ static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, > return -ENOSPC; > } > > +/* > + * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. > + */ > +static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) > +{ > + memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); > +} > + [ ... ] > @@ -1043,6 +1051,64 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) > &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); > } > > +/* > + * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration > + * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. > + */ > +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, > + struct rdtgroup *rdtgrp, struct mon_evt *mevt) > +{ > + int cntr_id; > + > + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); > + > + /* If there is no cntr_id assigned, nothing to do */ > + if (cntr_id < 0) > + return; > + > + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false); > + > + mbm_cntr_free(d, cntr_id); > +} > + > +/* > + * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with > + * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign > + * the counters from all the domains if @d is NULL else unassign from @d. > + */ > +static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, > + struct mon_evt *mevt) > +{ > + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); > + > + if (!d) { > + list_for_each_entry(d, &r->mon_domains, hdr.list) > + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); > + } else { > + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); > + } > +} > + > +/* > + * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events. > + * Called when a group is deleted. > + */ > +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) > +{ > + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); > + > + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) > + return; > + > + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) > + rdtgroup_unassign_cntr_event(NULL, rdtgrp, > + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); > + > + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) > + rdtgroup_unassign_cntr_event(NULL, rdtgrp, > + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); > +} > + > int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, > struct seq_file *s, void *v) > { The comment states "Called when a group is deleted" but the function has no callers. Won't this create a counter leak when groups are deleted since counters assigned during group creation via rdtgroup_assign_cntrs() ->rdtgroup_assign_cntr_event()->rdtgroup_alloc_assign_cntr() won't be cleaned up during deletion? commit 2d8bd8049e89efe42a5397de4effd899e8dd2249 Author: Pankaj Raghav mm: add persistent huge zero folio This patch introduces a config option CONFIG_PERSISTENT_HUGE_ZERO_FOLIO that changes huge zero folio lifecycle from dynamic allocation/deallocation to persistent allocation during boot. When enabled, the shrinker is disabled and the folio is never freed. 
Link: https://lkml.kernel.org/r/20250811084113.647267-4-kernel@pankajraghav.com > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index 7748489fde1b..bd547857c6c1 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -495,6 +495,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) > struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); > void mm_put_huge_zero_folio(struct mm_struct *mm); > > +static inline struct folio *get_persistent_huge_zero_folio(void) > +{ > + if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) > + return NULL; > + > + if (unlikely(!huge_zero_folio)) > + return NULL; > + > + return huge_zero_folio; > +} > + > static inline bool thp_migration_supported(void) > { > return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); [ ... ] > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 2801ce9bbde9..b8bb078a1a34 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -248,6 +248,9 @@ static void put_huge_zero_folio(void) > > struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) > { > + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) > + return huge_zero_folio; > + > if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) > return READ_ONCE(huge_zero_folio); > > @@ -262,6 +265,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) > > void mm_put_huge_zero_folio(struct mm_struct *mm) > { > + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) > + return; > + > if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) > put_huge_zero_folio(); > } [ ... ] > @@ -849,16 +855,34 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) > > static int __init thp_shrinker_init(void) > { > - huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); > - if (!huge_zero_folio_shrinker) > - return -ENOMEM; > - > deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | > SHRINKER_MEMCG_AWARE | > SHRINKER_NONSLAB, > "thp-deferred_split"); > - if (!deferred_split_shrinker) { > - shrinker_free(huge_zero_folio_shrinker); > + if (!deferred_split_shrinker) > + return -ENOMEM; > + > + deferred_split_shrinker->count_objects = deferred_split_count; > + deferred_split_shrinker->scan_objects = deferred_split_scan; > + shrinker_register(deferred_split_shrinker); > + > + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) { > + /* > + * Bump the reference of the huge_zero_folio and do not > + * initialize the shrinker. > + * > + * huge_zero_folio will always be NULL on failure. We assume > + * that get_huge_zero_folio() will most likely not fail as > + * thp_shrinker_init() is invoked early on during boot. > + */ > + if (!get_huge_zero_folio()) > + pr_warn("Allocating persistent huge zero folio failed\n"); > + return 0; > + } Should thp_shrinker_init() return an error code when get_huge_zero_folio() fails in persistent mode? Currently it returns 0 (success) even when the allocation fails, which means hugepage_init() will believe initialization succeeded. This leaves the deferred_split_shrinker allocated but the persistent huge zero folio unavailable. If the system is supposed to continue working without the persistent huge zero folio, the current behavior might be intentional. However, callers like hugepage_init() have no way to know that persistent allocation failed. 
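Note that the failure is at least visible at the call sites: the get_persistent_huge_zero_folio() helper quoted earlier returns NULL when the config is off or the boot-time allocation did not happen. A minimal sketch of how a hypothetical caller has to cope (illustrative only, not from the patch):

/* Hypothetical caller: must tolerate a NULL persistent huge zero folio. */
static bool sketch_map_huge_zero(void)
{
	struct folio *folio = get_persistent_huge_zero_folio();

	if (!folio) {
		/* Boot-time allocation failed (or CONFIG_PERSISTENT_HUGE_ZERO_FOLIO is off). */
		return false;
	}

	/* ... install the huge zero folio in the page table as usual ... */
	return true;
}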
> + > + huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); > + if (!huge_zero_folio_shrinker) { > + shrinker_free(deferred_split_shrinker); > return -ENOMEM; > } > > @@ -866,10 +890,6 @@ static int __init thp_shrinker_init(void) > huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan; > shrinker_register(huge_zero_folio_shrinker); > > - deferred_split_shrinker->count_objects = deferred_split_count; > - deferred_split_shrinker->scan_objects = deferred_split_scan; > - shrinker_register(deferred_split_shrinker); > - > return 0; > } commit 12e423ba4eaed7b1561b677d32e6599f932d03db Author: Lorenzo Stoakes mm: convert core mm to mm_flags_*() accessors This patch converts existing users of direct mm_struct->flags bit operations to use the new mm_flags_*() accessor functions as part of migrating mm->flags to a bitmap field. The commit states that no functional changes are intended. Link: https://lkml.kernel.org/r/1eb2266f4408798a55bda00cb04545a3203aa572.1755012943.git.lorenzo.stoakes@oracle.com > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index 14d424830fa8..84b7eebe0d68 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -327,7 +327,7 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, > * example, s390 kvm. > */ > return (vm_flags & VM_NOHUGEPAGE) || > - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags); > + mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); > } > > static inline bool thp_disabled_by_hw(void) [ ... ] > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > index 25923cfec9c6..17650f0b516e 100644 > --- a/mm/oom_kill.c > +++ b/mm/oom_kill.c [ ... ] > @@ -1251,7 +1251,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) > * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure > * possible change in exit_mmap is seen > */ > - if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) > + if (mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm)) > ret = -EAGAIN; > mmap_read_unlock(mm); Does this condition invert the logic? The original code checked if MMF_OOM_SKIP was NOT set, but the new code checks if it IS set. This appears to change when process_mrelease()->__oom_reap_task_mm() gets called and when -EAGAIN is returned to userspace. commit 7c80f3c72467b180a9d7b3852c74b7540a5a6ce9 Author: Mike Rapoport (Microsoft) kho: add support for preserving vmalloc allocations A vmalloc allocation is preserved using binary structure similar to global KHO memory tracker. It's a linked list of pages where each page is an array of physical address of pages in vmalloc area. Link: https://lkml.kernel.org/r/20250921054458.4043761-4-rppt@kernel.org > diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h > index cec663b39861..1b3c41ab7a0b 100644 > --- a/include/linux/kexec_handover.h > +++ b/include/linux/kexec_handover.h [ ... ] > diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c > index d30c2a99f3ea..57336e510f3b 100644 > --- a/kernel/kexec_handover.c > +++ b/kernel/kexec_handover.c [ ... 
] > +void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) > +{ > + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); > + unsigned int align, order, shift, vm_flags; > + unsigned long total_pages, contig_pages; > + unsigned long addr, size; > + struct vm_struct *area; > + struct page **pages; > + unsigned int idx = 0; > + int err; > + > + vm_flags = kho_flags_to_vmalloc(preservation->flags); > + if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS) > + return NULL; > + > + total_pages = preservation->total_pages; > + pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL); > + if (!pages) > + return NULL; > + order = preservation->order; > + contig_pages = (1 << order); > + shift = PAGE_SHIFT + order; > + align = 1 << shift; > + > + while (chunk) { > + struct page *page; > + > + for (int i = 0; chunk->phys[i]; i++) { > + phys_addr_t phys = chunk->phys[i]; > + > + if (idx + contig_pages > total_pages) > + goto err_free_pages_array; > + > + page = kho_restore_pages(phys, contig_pages); > + if (!page) > + goto err_free_pages_array; > + > + for (int j = 0; j < contig_pages; j++) > + pages[idx++] = page; ^^^^^ Can this page array handling work correctly for higher-order pages? When contig_pages > 1, the loop stores the same compound page head pointer multiple times instead of individual page pointers (page + j). The vmap_pages_range() function expects individual page pointers for each PAGE_SIZE page, but this provides duplicate pointers to the compound head. > + > + phys += contig_pages * PAGE_SIZE; > + } > + > + page = kho_restore_pages(virt_to_phys(chunk), 1); > + if (!page) > + goto err_free_pages_array; > + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); > + __free_page(page); > + } > + > + if (idx != total_pages) > + goto err_free_pages_array; > + > + area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift, > + vm_flags, VMALLOC_START, VMALLOC_END, > + NUMA_NO_NODE, GFP_KERNEL, > + __builtin_return_address(0)); > + if (!area) > + goto err_free_pages_array; > + > + addr = (unsigned long)area->addr; > + size = get_vm_area_size(area); > + err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift); > + if (err) > + goto err_free_vm_area; > + > + area->nr_pages = total_pages; > + area->pages = pages; > + > + return area->addr; > + > +err_free_vm_area: > + free_vm_area(area); > +err_free_pages_array: > + kvfree(pages); > + return NULL; > +} commit 2191b3471df53d87a46179b0c302c77c9ccfec11 Author: Stefan Metzmacher smb: smbdirect: introduce smbdirect_socket.send_io.credits.{count,wait_queue} This patch adds new credit management fields to the smbdirect_socket structure that will be shared between client and server soon. The patch initializes these fields in smbdirect_socket_init(). > diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h > index 1df43b7a87f4..7c2147a0244a 100644 > --- a/fs/smb/common/smbdirect/smbdirect_socket.h > +++ b/fs/smb/common/smbdirect/smbdirect_socket.h > @@ -78,6 +78,14 @@ struct smbdirect_socket { > mempool_t *pool; > } mem; > > + /* > + * The credit state for the send side > + */ > + struct { > + atomic_t count; > + wait_queue_head_t wait_queue; > + } credits; > + Are these new fields properly initialized when smbdirect_socket structures are created? Looking at _smbd_get_connection() in fs/smb/client/smbdirect.c, the smbd_connection structure (which contains smbdirect_socket as its first member) is allocated with kzalloc() but smbdirect_socket_init() is never called. 
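One way the initialisation could be guaranteed, shown purely as an illustration (the "socket" member name and the callsite are assumptions, not something this patch changes):

/* Hypothetical setup path: initialise the shared socket state right after allocation. */
static struct smbd_connection *sketch_alloc_connection(void)
{
	struct smbd_connection *info = kzalloc(sizeof(*info), GFP_KERNEL);

	if (!info)
		return NULL;

	/* kzalloc() zeroes the struct, but wait queue heads still need init_waitqueue_head(). */
	smbdirect_socket_init(&info->socket);

	return info;
}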
> /* > * The state about posted/pending sends > */ > @@ -163,6 +171,9 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) > > init_waitqueue_head(&sc->status_wait); > > + atomic_set(&sc->send_io.credits.count, 0); > + init_waitqueue_head(&sc->send_io.credits.wait_queue); > + The initialization code looks correct, but can this function be reached? A grep through the fs/smb/ directory shows smbdirect_socket_init() is defined but never called. The actual initialization in _smbd_get_connection() only calls init_waitqueue_head() for some specific wait queues but not for the new credits.wait_queue. Without init_waitqueue_head() being called, any subsequent wait_event() or wake_up() operations on sc->send_io.credits.wait_queue will access uninitialized spinlocks and list heads, leading to kernel crashes. > atomic_set(&sc->send_io.pending.count, 0); > init_waitqueue_head(&sc->send_io.pending.dec_wait_queue); > init_waitqueue_head(&sc->send_io.pending.zero_wait_queue); commit 371d3ab51ce44c8a7c0f9b835b74b7340a922059 Author: Stefan Metzmacher smb: smbdirect: introduce smbdirect_socket.status_wait This patch adds a wait_queue_head_t status_wait field to the smbdirect_socket struct for future use by server and client components. > diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h > index 8484c6f09317..a69b9649b88e 100644 > --- a/fs/smb/common/smbdirect/smbdirect_socket.h > +++ b/fs/smb/common/smbdirect/smbdirect_socket.h > @@ -41,6 +41,7 @@ const char *smbdirect_socket_status_string(enum smbdirect_socket_status status) > > struct smbdirect_socket { > enum smbdirect_socket_status status; > + wait_queue_head_t status_wait; > > /* RDMA related */ > struct { Is the new status_wait field being properly initialized? Looking at _smbd_get_connection() in fs/smb/client/smbdirect.c, the smbdirect_socket struct is allocated via kzalloc() which zeros the memory, but wait queue heads require explicit initialization via init_waitqueue_head() before use. The existing info->status_wait is correctly initialized, but I don't see where sc->status_wait gets initialized. Can uninitialized wait queue operations cause memory corruption when this field is eventually used? commit 603b4416232524dafde8e2cf859788dae786dea1 Author: KP Singh bpf: Update the bpf_prog_calc_tag to use SHA256 This patch changes bpf_prog_calc_tag() to use SHA256 instead of SHA1 and modifies the bpf_prog structure to store the full SHA256 digest. The digest and tag fields are placed in a union to maintain compatibility. Link: https://lore.kernel.org/r/20250914215141.15144-2-kpsingh@kernel.org > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 41f776071ff5..d75902074bd1 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -31,6 +31,7 @@ > #include > #include > #include > +#include > > struct bpf_verifier_env; > struct bpf_verifier_log; > @@ -1717,7 +1718,10 @@ struct bpf_prog { > enum bpf_attach_type expected_attach_type; /* For some prog types */ > u32 len; /* Number of filter blocks */ > u32 jited_len; /* Size of jited insns in bytes */ > - u8 tag[BPF_TAG_SIZE]; > + union { > + u8 digest[SHA256_DIGEST_SIZE]; > + u8 tag[BPF_TAG_SIZE]; > + }; Can this union create memory access issues? The digest array is 32 bytes (SHA256_DIGEST_SIZE) while tag is 8 bytes (BPF_TAG_SIZE). Code accessing prog->tag after bpf_prog_calc_tag() writes to prog->digest might read incorrect data from the union. 
-clm false positive, but not sure I want to teach claude to like unions > struct bpf_prog_stats __percpu *stats; > int __percpu *active; > unsigned int (*bpf_func)(const void *ctx, [ ... ] > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 1cda2589d4b3..9b64674df16b 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -39,6 +39,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -296,7 +297,6 @@ void __bpf_prog_free(struct bpf_prog *fp) > int bpf_prog_calc_tag(struct bpf_prog *fp) > { > size_t size = bpf_prog_insn_size(fp); > - u8 digest[SHA1_DIGEST_SIZE]; > struct bpf_insn *dst; > bool was_ld_map; > u32 i; > @@ -327,8 +327,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) > was_ld_map = false; > } > } > - sha1((const u8 *)dst, size, digest); > - memcpy(fp->tag, digest, sizeof(fp->tag)); > + sha256((u8 *)dst, size, fp->digest); Are prog->tag users broken by this change? The function now writes to fp->digest but existing code accesses fp->tag. For example: - perf_event_bpf_event() calls memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE) - bpf_prog_get_info_by_fd() calls memcpy(info.tag, prog->tag, sizeof(prog->tag)) - bpf_prog_get_info_by_fd() also calls copy_to_user(user_prog_tags[0], prog->tag, BPF_TAG_SIZE) Since fp->digest gets written but fp->tag gets read, will these functions access uninitialized memory in the tag field? > vfree(dst); > return 0; > }
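On the union layout specifically: because digest and tag share storage, a read of prog->tag after sha256() fills prog->digest sees the first BPF_TAG_SIZE bytes of the new digest, not uninitialized memory; what changes is the tag's value (a truncated SHA-256 rather than a truncated SHA-1). A self-contained user-space illustration of that aliasing (demo code, not kernel code):

#include <stdio.h>
#include <string.h>

#define BPF_TAG_SIZE		8
#define SHA256_DIGEST_SIZE	32

struct example_prog {
	union {
		unsigned char digest[SHA256_DIGEST_SIZE];
		unsigned char tag[BPF_TAG_SIZE];
	};
};

int main(void)
{
	struct example_prog prog;
	unsigned char fake_digest[SHA256_DIGEST_SIZE];
	int i;

	/* Stand-in for sha256(): fill the digest with known bytes. */
	for (i = 0; i < SHA256_DIGEST_SIZE; i++)
		fake_digest[i] = (unsigned char)i;
	memcpy(prog.digest, fake_digest, sizeof(prog.digest));

	/* prog.tag aliases the first BPF_TAG_SIZE bytes of prog.digest. */
	for (i = 0; i < BPF_TAG_SIZE; i++)
		printf("%02x", prog.tag[i]);
	printf("\n");	/* prints 0001020304050607 */

	return 0;
}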