commit dc32c8d4875fdd27b43d5c295853860f4d647055 Author: Bijan Tabatabai mm/damon/core: skip needless update of damon_attrs in damon_commit_ctx() This patch adds an optimization to avoid calling damon_set_attrs() when attributes haven't changed, preventing unnecessary resets of timing fields that can delay aggregation and ops update operations.
Link: https://lkml.kernel.org/r/20250807001924.76275-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20250806234254.10572-1-bijan311@gmail.com
> diff --git a/mm/damon/core.c b/mm/damon/core.c
> index 106ee8b0f2d5..52ecc3a4426f 100644
> --- a/mm/damon/core.c
> +++ b/mm/damon/core.c
> @@ -570,6 +570,23 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
> kfree(ctx);
> }
>
> +static bool damon_attrs_equals(const struct damon_attrs *attrs1,
> + const struct damon_attrs *attrs2)
> +{
> + const struct damon_intervals_goal *ig1 = &attrs1->intervals_goal;
> + const struct damon_intervals_goal *ig2 = &attrs2->intervals_goal;
> +
> + return attrs1->sample_interval == attrs2->sample_interval &&
> + attrs1->aggr_interval == attrs2->aggr_interval &&
> + attrs1->ops_update_interval == attrs2->ops_update_interval &&
> + attrs1->min_nr_regions == attrs2->min_nr_regions &&
> + attrs1->max_nr_regions == attrs2->max_nr_regions &&
> + ig1->access_bp == ig2->access_bp &&
> + ig1->aggrs == ig2->aggrs &&
> + ig1->min_sample_us == ig2->min_sample_us &&
> + ig1->max_sample_us == ig2->max_sample_us;
> +}
Does this comparison miss the aggr_samples field? The damon_attrs struct contains an aggr_samples field that is used by damon_set_attrs() and kdamond_tune_intervals() for timing calculations. If only aggr_samples differs between the two attrs structs, this function would return true when the attributes are actually different, causing damon_set_attrs() to be incorrectly skipped. --clm false positive, SJ explained there's a comment explaining this, but claude didn't quite parse it right.
> +
> static unsigned int damon_age_for_new_attrs(unsigned int age,
> struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
> {
> @@ -1222,9 +1239,11 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
> * 2. ops update should be done after pid handling is done (target
> * committing require putting pids).
> */
> - err = damon_set_attrs(dst, &src->attrs);
> - if (err)
> - return err;
> + if (!damon_attrs_equals(&dst->attrs, &src->attrs)) {
> + err = damon_set_attrs(dst, &src->attrs);
> + if (err)
> + return err;
> + }
> dst->ops = src->ops;
>
> return 0;
commit 7663e963a51122792811811c8119fd55c9ab254a Author: Jeff Layton nfsd: track original timestamps in nfs4_delegation This commit stores file timestamps in the nfs4_delegation structure at delegation grant time to support RFC 9754 requirements. It adds three timespec64 fields and modifies delegation logic to capture timestamps.
Link: https://lore.kernel.org/linux-nfs/47a4e40310e797f21b5137e847b06bb203d99e66.camel@kernel.org/ > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index 77eea2ad93cc..8737b721daf3 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -6157,7 +6157,8 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, > path.dentry = file_dentry(nf->nf_file); > > rc = vfs_getattr(&path, stat, > - (STATX_MODE | STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), > + STATX_MODE | STATX_SIZE | STATX_ATIME | > + STATX_MTIME | STATX_CTIME | STATX_CHANGE_COOKIE, > AT_STATX_SYNC_AS_STAT); > > nfsd_file_put(nf); > @@ -6274,10 +6275,14 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, > OPEN_DELEGATE_WRITE; > dp->dl_cb_fattr.ncf_cur_fsize = stat.size; > dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); > + dp->dl_atime = stat.atime; > + dp->dl_ctime = stat.ctime; > + dp->dl_mtime = stat.mtime; > trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); > } else { > - open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_READ_ATTRS_DELEG : > - OPEN_DELEGATE_READ; > + open->op_delegate_type = deleg_ts && nfs4_delegation_stat(dp, currentfh, &stat) ? > + OPEN_DELEGATE_READ_ATTRS_DELEG : OPEN_DELEGATE_READ; > + dp->dl_atime = stat.atime; When nfs4_delegation_stat() fails in the read delegation path, can this assignment use uninitialized data from the stat structure? The stat variable is declared on the stack and if vfs_getattr() fails within nfs4_delegation_stat(), the atime field would contain undefined values. > trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); > } > nfs4_put_stid(&dp->dl_stid); > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index 8adc2550129e..ce7c0d129ba3 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -224,6 +224,11 @@ struct nfs4_delegation { > > /* for CB_GETATTR */ > struct nfs4_cb_fattr dl_cb_fattr; > + > + /* For delegated timestamps */ > + struct timespec64 dl_atime; > + struct timespec64 dl_mtime; > + struct timespec64 dl_ctime; > }; > > static inline bool deleg_is_read(u32 dl_type) commit b93128f29733af5d427a335978a19884c2c230e2 Author: Trond Myklebust NFSv4.2: Serialise O_DIRECT i/o and fallocate() This patch adds a call to nfs_file_block_o_direct() before nfs_sync_inode() in nfs42_proc_fallocate() to ensure all O_DIRECT reads and writes complete before calling fallocate operations, preventing races with attribute updates. > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > index e2fea37c5348..1a169372ca16 100644 > --- a/fs/nfs/nfs42proc.c > +++ b/fs/nfs/nfs42proc.c > @@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, > exception.inode = inode; > exception.state = lock->open_context->state; > > + nfs_file_block_o_direct(NFS_I(inode)); Can nfs_file_block_o_direct() be called here without holding inode->i_rwsem? The function definition in fs/nfs/internal.h has this comment: /* Must be called with exclusively locked inode->i_rwsem */ static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) { if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { clear_bit(NFS_INO_ODIRECT, &nfsi->flags); inode_dio_wait(&nfsi->vfs_inode); } } All other callers in the NFS code acquire the lock first. For example, nfs_start_io_write() calls down_write_killable(&inode->i_rwsem) before calling nfs_file_block_o_direct(), and nfs_start_io_read() does the same. 
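For reference, the lock-then-block convention those callers follow, condensed into a minimal sketch (paraphrasing the nfs_start_io_write() pattern described above; this is not the actual fs/nfs/io.c code):

/* Sketch only: take i_rwsem exclusively first, then block O_DIRECT. */
static int sketch_start_io_write(struct inode *inode)
{
	int err = down_write_killable(&inode->i_rwsem);	/* exclusive i_rwsem */

	if (err)
		return err;

	nfs_file_block_o_direct(NFS_I(inode));	/* the comment's precondition now holds */
	return 0;
}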
The VFS fallocate path (vfs_fallocate()->nfs42_fallocate()->nfs42_proc_ fallocate()) only calls file_start_write() which acquires sb->s_writers but not inode->i_rwsem. > err = nfs_sync_inode(inode); > if (err) > goto out; commit d442670c0f63c46b7f348f68fb2002af597708f2 Author: Jonathan Curley NFSv4/flexfiles: Add data structure support for striped layouts Adds a new struct nfs4_ff_layout_ds_stripe that represents a data server stripe within a layout. A new dynamically allocated array of this type has been added to nfs4_ff_layout_mirror and per stripe configuration information has been moved from the mirror type to the stripe based on the RFC. > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c > index 2ad371faab98..debdc8952ab0 100644 > --- a/fs/nfs/flexfilelayout/flexfilelayout.c > +++ b/fs/nfs/flexfilelayout/flexfilelayout.c [ ... ] > @@ -452,10 +459,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, > goto out_err_free; > } > > - fls->mirror_array[i]->ds_count = ds_count; > + fls->mirror_array[i]->dss_count = dss_count; > + fls->mirror_array[i]->dss = > + kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe), > + gfp_flags); > > /* deviceid */ > - rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); > + rc = decode_deviceid(&stream, &fls->mirror_array[i]->dss[dss_id].devid); Can this dereference NULL in ff_layout_alloc_lseg()? The kcalloc() call above can return NULL under memory pressure, but there's no check before accessing fls->mirror_array[i]->dss[dss_id].devid in the decode_deviceid() call. All other memory allocations in this function check for NULL and jump to out_err_free on failure. > if (rc) > goto out_err_free; > [ ... ] commit 8c48e1c7520321cc87ff651e96093e2f412785fb Author: Stefan Metzmacher smb: client: don't wait for info->send_pending == 0 on error This commit addresses an issue where smbd_destroy() would wait indefinitely for send_pending to reach zero after ib_drain_qp() is called. The patch removes the problematic wait and modifies smbd_send() to exit early when the socket is disconnected. > diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c > index c628e91c328b..02d6db431fd4 100644 > --- a/fs/smb/client/smbdirect.c > +++ b/fs/smb/client/smbdirect.c > @@ -1337,10 +1337,6 @@ void smbd_destroy(struct TCP_Server_Info *server) > log_rdma_event(INFO, "cancelling idle timer\n"); > cancel_delayed_work_sync(&info->idle_timer_work); > > - log_rdma_event(INFO, "wait for all send posted to IB to finish\n"); > - wait_event(info->wait_send_pending, > - atomic_read(&info->send_pending) == 0); > - Can smbd_destroy() proceed safely without waiting for send operations to complete? The send_done() callback accesses the info structure after this point: In send_done(), when wc->status != IB_WC_SUCCESS (which happens after ib_drain_qp() above), the function calls smbd_disconnect_rdma_connection() but skips the atomic_dec_and_test(&info->send_pending) call. However, when the operation succeeds, send_done() does call: if (atomic_dec_and_test(&info->send_pending)) wake_up(&info->wait_send_pending); After ib_drain_qp() triggers completions with IB_WC_WR_FLUSH_ERR status, these send_done() callbacks can execute after smbd_destroy() has called kfree(info). When send_done() then tries to access info->send_pending, does this create a use-after-free condition? 
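To make the accounting easier to follow, here is the send_done() behaviour described above condensed into a sketch (simplified, with the connection passed in directly; not the literal smbdirect.c code):

/* Sketch of the completion accounting discussed above. */
static void sketch_send_done(struct smbd_connection *info, const struct ib_wc *wc)
{
	if (wc->status != IB_WC_SUCCESS) {
		/* Flushed/errored send: send_pending is not decremented on this path. */
		smbd_disconnect_rdma_connection(info);
		return;
	}

	if (atomic_dec_and_test(&info->send_pending))
		wake_up(&info->wait_send_pending);
}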
> /* It's not possible for upper layer to get to reassembly */ > log_rdma_event(INFO, "drain the reassembly queue\n"); > do { > @@ -1986,7 +1982,11 @@ int smbd_send(struct TCP_Server_Info *server, > */ > > wait_event(info->wait_send_pending, > - atomic_read(&info->send_pending) == 0); > + atomic_read(&info->send_pending) == 0 || > + sc->status != SMBDIRECT_SOCKET_CONNECTED); > + > + if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0) > + rc = -EAGAIN; > > return rc; > } commit 1f7168b28f667c0920fd83151ae35ce593950663 Author: André Almeida ovl: Ensure that all layers have the same encoding This patch adds encoding consistency validation to overlayfs when casefold is enabled. It introduces ovl_set_encoding() to copy encoding settings from the first layer and validates all layers have matching encoding. > diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c > index df85a76597e9..548ea9b1a48d 100644 > --- a/fs/overlayfs/super.c > +++ b/fs/overlayfs/super.c > @@ -991,6 +991,25 @@ static int ovl_get_data_fsid(struct ovl_fs *ofs) > return ofs->numfs; > } > > +/* > + * Set the ovl sb encoding as the same one used by the first layer > + */ > +static int ovl_set_encoding(struct super_block *sb, struct super_block *fs_sb) > +{ > + if (!sb_has_encoding(fs_sb)) > + return 0; > + > +#if IS_ENABLED(CONFIG_UNICODE) > + if (sb_has_strict_encoding(fs_sb)) { > + pr_err("strict encoding not supported\n"); > + return -EINVAL; > + } > + > + sb->s_encoding = fs_sb->s_encoding; > + sb->s_encoding_flags = fs_sb->s_encoding_flags; ^^^^ Will this leak the unicode_map on overlayfs unmount? Other filesystems like shmem and ext4 call utf8_unload(sb->s_encoding) in their put_super operations, but ovl_put_super()->ovl_free_fs() has no utf8_unload() call. The unicode_map appears to be reference counted via utf8_load()/utf8_unload() pairs in other parts of the kernel, but this code just copies the pointer without taking a proper reference. > +#endif > + return 0; > +} > > static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, > struct ovl_fs_context *ctx, struct ovl_layer *layers) commit 8004ea01cf6338298e0c6ab055bc3ec659ce381b Author: Babu Moger fs/resctrl: Introduce the interface to switch between monitor modes This patch introduces mbm_assign_mode resctrl file to switch between mbm_event and default monitoring modes. It also resets MBM event counters automatically when the mode changes. Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com > diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst > index f60f6a96cb6b..006d23af66e1 100644 > --- a/Documentation/filesystems/resctrl.rst > +++ b/Documentation/filesystems/resctrl.rst [ ... ] > diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c > index f388dbcdbdcd..50c24460d992 100644 > --- a/fs/resctrl/monitor.c > +++ b/fs/resctrl/monitor.c [ ... 
] > +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, > + size_t nbytes, loff_t off) > +{ > + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); > + struct rdt_mon_domain *d; > + int ret = 0; > + bool enable; > + > + /* Valid input requires a trailing newline */ > + if (nbytes == 0 || buf[nbytes - 1] != '\n') > + return -EINVAL; > + > + buf[nbytes - 1] = '\0'; > + > + cpus_read_lock(); > + mutex_lock(&rdtgroup_mutex); > + > + rdt_last_cmd_clear(); > + > + if (!strcmp(buf, "default")) { > + enable = 0; > + } else if (!strcmp(buf, "mbm_event")) { > + if (r->mon.mbm_cntr_assignable) { > + enable = 1; > + } else { > + ret = -EINVAL; > + rdt_last_cmd_puts("mbm_event mode is not supported\n"); > + goto out_unlock; > + } > + } else { > + ret = -EINVAL; > + rdt_last_cmd_puts("Unsupported assign mode\n"); > + goto out_unlock; > + } > + > + if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) { > + ret = resctrl_arch_mbm_cntr_assign_set(r, enable); > + if (ret) > + goto out_unlock; > + > + /* Update the visibility of BMEC related files */ > + resctrl_bmec_files_show(r, NULL, !enable); > + > + /* > + * Initialize the default memory transaction values for > + * total and local events. > + */ > + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) > + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; > + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) > + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & > + (READS_TO_LOCAL_MEM | > + READS_TO_LOCAL_S_MEM | > + NON_TEMP_WRITE_TO_LOCAL_MEM); > + /* Enable auto assignment when switching to "mbm_event" mode */ > + if (enable) > + r->mon.mbm_assign_on_mkdir = true; > + /* > + * Reset all the non-achitectural RMID state and assignable counters. > + */ > + list_for_each_entry(d, &r->mon_domains, hdr.list) { > + mbm_cntr_free_all(r, d); > + resctrl_reset_rmid_all(r, d); > + } > + } > + > +out_unlock: > + mutex_lock(&rdtgroup_mutex); ^^^^ Can this cause a deadlock? The function already acquired rdtgroup_mutex at the beginning, so attempting to lock it again here would block indefinitely. Should this be mutex_unlock(&rdtgroup_mutex) instead? > + cpus_read_unlock(); > + > + return ret ?: nbytes; > +} [ ... ] commit aab2c5088cdb26e80d51ffbe72d24ab23fa1533e Author: Babu Moger fs/resctrl: Add the functionality to unassign MBM events This patch adds counter cleanup functionality for MBM events by implementing rdtgroup_unassign_cntrs() and supporting functions to free assigned counters when resource groups are deleted. Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com > diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h > index 762705d7eb8d..c6b66d4a6a37 100644 > --- a/fs/resctrl/internal.h > +++ b/fs/resctrl/internal.h > @@ -398,6 +398,8 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_fil > > void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); > > +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); > + The rdtgroup_unassign_cntrs() function is declared but where should it be called? Looking at rdtgroup_assign_cntrs() which is called during group creation, shouldn't rdtgroup_unassign_cntrs() be called during group deletion in rdtgroup_rmdir_ctrl() and rdtgroup_rmdir_mon()? 
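A sketch of the kind of callsite the question is asking about (hypothetical placement and simplified signature; this hunk is not part of the patch):

/* Hypothetical rmdir-side teardown mirroring rdtgroup_assign_cntrs() at creation time. */
static void sketch_rmdir_teardown(struct rdtgroup *rdtgrp)
{
	/* Release any MBM counters assigned to this group before it is deleted. */
	rdtgroup_unassign_cntrs(rdtgrp);

	/* ... the existing rdtgroup_rmdir_mon()/rdtgroup_rmdir_ctrl() cleanup would continue here ... */
}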
> diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c > index 106e9bdb8a9d..2ed29ae831a4 100644 > --- a/fs/resctrl/monitor.c > +++ b/fs/resctrl/monitor.c > @@ -405,6 +405,14 @@ static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, > return -ENOSPC; > } > > +/* > + * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. > + */ > +static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) > +{ > + memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); > +} > + [ ... ] > @@ -1043,6 +1051,64 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) > &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); > } > > +/* > + * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration > + * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. > + */ > +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, > + struct rdtgroup *rdtgrp, struct mon_evt *mevt) > +{ > + int cntr_id; > + > + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); > + > + /* If there is no cntr_id assigned, nothing to do */ > + if (cntr_id < 0) > + return; > + > + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false); > + > + mbm_cntr_free(d, cntr_id); > +} > + > +/* > + * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with > + * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign > + * the counters from all the domains if @d is NULL else unassign from @d. > + */ > +static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, > + struct mon_evt *mevt) > +{ > + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); > + > + if (!d) { > + list_for_each_entry(d, &r->mon_domains, hdr.list) > + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); > + } else { > + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); > + } > +} > + > +/* > + * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events. > + * Called when a group is deleted. > + */ > +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) > +{ > + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); > + > + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) > + return; > + > + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) > + rdtgroup_unassign_cntr_event(NULL, rdtgrp, > + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); > + > + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) > + rdtgroup_unassign_cntr_event(NULL, rdtgrp, > + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); > +} > + > int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, > struct seq_file *s, void *v) > { The comment states "Called when a group is deleted" but the function has no callers. Won't this create a counter leak when groups are deleted since counters assigned during group creation via rdtgroup_assign_cntrs() ->rdtgroup_assign_cntr_event()->rdtgroup_alloc_assign_cntr() won't be cleaned up during deletion? commit 2d8bd8049e89efe42a5397de4effd899e8dd2249 Author: Pankaj Raghav mm: add persistent huge zero folio This patch introduces a config option CONFIG_PERSISTENT_HUGE_ZERO_FOLIO that changes huge zero folio lifecycle from dynamic allocation/deallocation to persistent allocation during boot. When enabled, the shrinker is disabled and the folio is never freed. 
Link: https://lkml.kernel.org/r/20250811084113.647267-4-kernel@pankajraghav.com > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index 7748489fde1b..bd547857c6c1 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -495,6 +495,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) > struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); > void mm_put_huge_zero_folio(struct mm_struct *mm); > > +static inline struct folio *get_persistent_huge_zero_folio(void) > +{ > + if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) > + return NULL; > + > + if (unlikely(!huge_zero_folio)) > + return NULL; > + > + return huge_zero_folio; > +} > + > static inline bool thp_migration_supported(void) > { > return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); [ ... ] > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 2801ce9bbde9..b8bb078a1a34 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -248,6 +248,9 @@ static void put_huge_zero_folio(void) > > struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) > { > + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) > + return huge_zero_folio; > + > if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) > return READ_ONCE(huge_zero_folio); > > @@ -262,6 +265,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) > > void mm_put_huge_zero_folio(struct mm_struct *mm) > { > + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) > + return; > + > if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) > put_huge_zero_folio(); > } [ ... ] > @@ -849,16 +855,34 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) > > static int __init thp_shrinker_init(void) > { > - huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); > - if (!huge_zero_folio_shrinker) > - return -ENOMEM; > - > deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | > SHRINKER_MEMCG_AWARE | > SHRINKER_NONSLAB, > "thp-deferred_split"); > - if (!deferred_split_shrinker) { > - shrinker_free(huge_zero_folio_shrinker); > + if (!deferred_split_shrinker) > + return -ENOMEM; > + > + deferred_split_shrinker->count_objects = deferred_split_count; > + deferred_split_shrinker->scan_objects = deferred_split_scan; > + shrinker_register(deferred_split_shrinker); > + > + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) { > + /* > + * Bump the reference of the huge_zero_folio and do not > + * initialize the shrinker. > + * > + * huge_zero_folio will always be NULL on failure. We assume > + * that get_huge_zero_folio() will most likely not fail as > + * thp_shrinker_init() is invoked early on during boot. > + */ > + if (!get_huge_zero_folio()) > + pr_warn("Allocating persistent huge zero folio failed\n"); > + return 0; > + } Should thp_shrinker_init() return an error code when get_huge_zero_folio() fails in persistent mode? Currently it returns 0 (success) even when the allocation fails, which means hugepage_init() will believe initialization succeeded. This leaves the deferred_split_shrinker allocated but the persistent huge zero folio unavailable. If the system is supposed to continue working without the persistent huge zero folio, the current behavior might be intentional. However, callers like hugepage_init() have no way to know that persistent allocation failed. 
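Note that the failure is at least visible at the call sites: the get_persistent_huge_zero_folio() helper quoted earlier returns NULL when the config is off or the boot-time allocation did not happen. A minimal sketch of how a hypothetical caller has to cope (illustrative only, not from the patch):

/* Hypothetical caller: must tolerate a NULL persistent huge zero folio. */
static bool sketch_map_huge_zero(void)
{
	struct folio *folio = get_persistent_huge_zero_folio();

	if (!folio) {
		/* Boot-time allocation failed (or CONFIG_PERSISTENT_HUGE_ZERO_FOLIO is off). */
		return false;
	}

	/* ... install the huge zero folio in the page table as usual ... */
	return true;
}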
> + > + huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); > + if (!huge_zero_folio_shrinker) { > + shrinker_free(deferred_split_shrinker); > return -ENOMEM; > } > > @@ -866,10 +890,6 @@ static int __init thp_shrinker_init(void) > huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan; > shrinker_register(huge_zero_folio_shrinker); > > - deferred_split_shrinker->count_objects = deferred_split_count; > - deferred_split_shrinker->scan_objects = deferred_split_scan; > - shrinker_register(deferred_split_shrinker); > - > return 0; > } commit 12e423ba4eaed7b1561b677d32e6599f932d03db Author: Lorenzo Stoakes mm: convert core mm to mm_flags_*() accessors This patch converts existing users of direct mm_struct->flags bit operations to use the new mm_flags_*() accessor functions as part of migrating mm->flags to a bitmap field. The commit states that no functional changes are intended. Link: https://lkml.kernel.org/r/1eb2266f4408798a55bda00cb04545a3203aa572.1755012943.git.lorenzo.stoakes@oracle.com > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index 14d424830fa8..84b7eebe0d68 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -327,7 +327,7 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, > * example, s390 kvm. > */ > return (vm_flags & VM_NOHUGEPAGE) || > - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags); > + mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); > } > > static inline bool thp_disabled_by_hw(void) [ ... ] > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > index 25923cfec9c6..17650f0b516e 100644 > --- a/mm/oom_kill.c > +++ b/mm/oom_kill.c [ ... ] > @@ -1251,7 +1251,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) > * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure > * possible change in exit_mmap is seen > */ > - if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) > + if (mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm)) > ret = -EAGAIN; > mmap_read_unlock(mm); Does this condition invert the logic? The original code checked if MMF_OOM_SKIP was NOT set, but the new code checks if it IS set. This appears to change when process_mrelease()->__oom_reap_task_mm() gets called and when -EAGAIN is returned to userspace. commit 7c80f3c72467b180a9d7b3852c74b7540a5a6ce9 Author: Mike Rapoport (Microsoft) kho: add support for preserving vmalloc allocations A vmalloc allocation is preserved using binary structure similar to global KHO memory tracker. It's a linked list of pages where each page is an array of physical address of pages in vmalloc area. Link: https://lkml.kernel.org/r/20250921054458.4043761-4-rppt@kernel.org > diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h > index cec663b39861..1b3c41ab7a0b 100644 > --- a/include/linux/kexec_handover.h > +++ b/include/linux/kexec_handover.h [ ... ] > diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c > index d30c2a99f3ea..57336e510f3b 100644 > --- a/kernel/kexec_handover.c > +++ b/kernel/kexec_handover.c [ ... 
] > +void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) > +{ > + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); > + unsigned int align, order, shift, vm_flags; > + unsigned long total_pages, contig_pages; > + unsigned long addr, size; > + struct vm_struct *area; > + struct page **pages; > + unsigned int idx = 0; > + int err; > + > + vm_flags = kho_flags_to_vmalloc(preservation->flags); > + if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS) > + return NULL; > + > + total_pages = preservation->total_pages; > + pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL); > + if (!pages) > + return NULL; > + order = preservation->order; > + contig_pages = (1 << order); > + shift = PAGE_SHIFT + order; > + align = 1 << shift; > + > + while (chunk) { > + struct page *page; > + > + for (int i = 0; chunk->phys[i]; i++) { > + phys_addr_t phys = chunk->phys[i]; > + > + if (idx + contig_pages > total_pages) > + goto err_free_pages_array; > + > + page = kho_restore_pages(phys, contig_pages); > + if (!page) > + goto err_free_pages_array; > + > + for (int j = 0; j < contig_pages; j++) > + pages[idx++] = page; ^^^^^ Can this page array handling work correctly for higher-order pages? When contig_pages > 1, the loop stores the same compound page head pointer multiple times instead of individual page pointers (page + j). The vmap_pages_range() function expects individual page pointers for each PAGE_SIZE page, but this provides duplicate pointers to the compound head. > + > + phys += contig_pages * PAGE_SIZE; > + } > + > + page = kho_restore_pages(virt_to_phys(chunk), 1); > + if (!page) > + goto err_free_pages_array; > + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); > + __free_page(page); > + } > + > + if (idx != total_pages) > + goto err_free_pages_array; > + > + area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift, > + vm_flags, VMALLOC_START, VMALLOC_END, > + NUMA_NO_NODE, GFP_KERNEL, > + __builtin_return_address(0)); > + if (!area) > + goto err_free_pages_array; > + > + addr = (unsigned long)area->addr; > + size = get_vm_area_size(area); > + err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift); > + if (err) > + goto err_free_vm_area; > + > + area->nr_pages = total_pages; > + area->pages = pages; > + > + return area->addr; > + > +err_free_vm_area: > + free_vm_area(area); > +err_free_pages_array: > + kvfree(pages); > + return NULL; > +} commit 2191b3471df53d87a46179b0c302c77c9ccfec11 Author: Stefan Metzmacher smb: smbdirect: introduce smbdirect_socket.send_io.credits.{count,wait_queue} This patch adds new credit management fields to the smbdirect_socket structure that will be shared between client and server soon. The patch initializes these fields in smbdirect_socket_init(). > diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h > index 1df43b7a87f4..7c2147a0244a 100644 > --- a/fs/smb/common/smbdirect/smbdirect_socket.h > +++ b/fs/smb/common/smbdirect/smbdirect_socket.h > @@ -78,6 +78,14 @@ struct smbdirect_socket { > mempool_t *pool; > } mem; > > + /* > + * The credit state for the send side > + */ > + struct { > + atomic_t count; > + wait_queue_head_t wait_queue; > + } credits; > + Are these new fields properly initialized when smbdirect_socket structures are created? Looking at _smbd_get_connection() in fs/smb/client/smbdirect.c, the smbd_connection structure (which contains smbdirect_socket as its first member) is allocated with kzalloc() but smbdirect_socket_init() is never called. 
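One way the initialisation could be guaranteed, shown purely as an illustration (the "socket" member name and the callsite are assumptions, not something this patch changes):

/* Hypothetical setup path: initialise the shared socket state right after allocation. */
static struct smbd_connection *sketch_alloc_connection(void)
{
	struct smbd_connection *info = kzalloc(sizeof(*info), GFP_KERNEL);

	if (!info)
		return NULL;

	/* kzalloc() zeroes the struct, but wait queue heads still need init_waitqueue_head(). */
	smbdirect_socket_init(&info->socket);

	return info;
}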
> /* > * The state about posted/pending sends > */ > @@ -163,6 +171,9 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) > > init_waitqueue_head(&sc->status_wait); > > + atomic_set(&sc->send_io.credits.count, 0); > + init_waitqueue_head(&sc->send_io.credits.wait_queue); > + The initialization code looks correct, but can this function be reached? A grep through the fs/smb/ directory shows smbdirect_socket_init() is defined but never called. The actual initialization in _smbd_get_connection() only calls init_waitqueue_head() for some specific wait queues but not for the new credits.wait_queue. Without init_waitqueue_head() being called, any subsequent wait_event() or wake_up() operations on sc->send_io.credits.wait_queue will access uninitialized spinlocks and list heads, leading to kernel crashes. > atomic_set(&sc->send_io.pending.count, 0); > init_waitqueue_head(&sc->send_io.pending.dec_wait_queue); > init_waitqueue_head(&sc->send_io.pending.zero_wait_queue); commit 371d3ab51ce44c8a7c0f9b835b74b7340a922059 Author: Stefan Metzmacher smb: smbdirect: introduce smbdirect_socket.status_wait This patch adds a wait_queue_head_t status_wait field to the smbdirect_socket struct for future use by server and client components. > diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h > index 8484c6f09317..a69b9649b88e 100644 > --- a/fs/smb/common/smbdirect/smbdirect_socket.h > +++ b/fs/smb/common/smbdirect/smbdirect_socket.h > @@ -41,6 +41,7 @@ const char *smbdirect_socket_status_string(enum smbdirect_socket_status status) > > struct smbdirect_socket { > enum smbdirect_socket_status status; > + wait_queue_head_t status_wait; > > /* RDMA related */ > struct { Is the new status_wait field being properly initialized? Looking at _smbd_get_connection() in fs/smb/client/smbdirect.c, the smbdirect_socket struct is allocated via kzalloc() which zeros the memory, but wait queue heads require explicit initialization via init_waitqueue_head() before use. The existing info->status_wait is correctly initialized, but I don't see where sc->status_wait gets initialized. Can uninitialized wait queue operations cause memory corruption when this field is eventually used? commit 603b4416232524dafde8e2cf859788dae786dea1 Author: KP Singh bpf: Update the bpf_prog_calc_tag to use SHA256 This patch changes bpf_prog_calc_tag() to use SHA256 instead of SHA1 and modifies the bpf_prog structure to store the full SHA256 digest. The digest and tag fields are placed in a union to maintain compatibility. Link: https://lore.kernel.org/r/20250914215141.15144-2-kpsingh@kernel.org > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 41f776071ff5..d75902074bd1 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -31,6 +31,7 @@ > #include > #include > #include > +#include > > struct bpf_verifier_env; > struct bpf_verifier_log; > @@ -1717,7 +1718,10 @@ struct bpf_prog { > enum bpf_attach_type expected_attach_type; /* For some prog types */ > u32 len; /* Number of filter blocks */ > u32 jited_len; /* Size of jited insns in bytes */ > - u8 tag[BPF_TAG_SIZE]; > + union { > + u8 digest[SHA256_DIGEST_SIZE]; > + u8 tag[BPF_TAG_SIZE]; > + }; Can this union create memory access issues? The digest array is 32 bytes (SHA256_DIGEST_SIZE) while tag is 8 bytes (BPF_TAG_SIZE). Code accessing prog->tag after bpf_prog_calc_tag() writes to prog->digest might read incorrect data from the union. 
-clm false positive, but not sure I want to teach claude to like unions > struct bpf_prog_stats __percpu *stats; > int __percpu *active; > unsigned int (*bpf_func)(const void *ctx, [ ... ] > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 1cda2589d4b3..9b64674df16b 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -39,6 +39,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -296,7 +297,6 @@ void __bpf_prog_free(struct bpf_prog *fp) > int bpf_prog_calc_tag(struct bpf_prog *fp) > { > size_t size = bpf_prog_insn_size(fp); > - u8 digest[SHA1_DIGEST_SIZE]; > struct bpf_insn *dst; > bool was_ld_map; > u32 i; > @@ -327,8 +327,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) > was_ld_map = false; > } > } > - sha1((const u8 *)dst, size, digest); > - memcpy(fp->tag, digest, sizeof(fp->tag)); > + sha256((u8 *)dst, size, fp->digest); Are prog->tag users broken by this change? The function now writes to fp->digest but existing code accesses fp->tag. For example: - perf_event_bpf_event() calls memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE) - bpf_prog_get_info_by_fd() calls memcpy(info.tag, prog->tag, sizeof(prog->tag)) - bpf_prog_get_info_by_fd() also calls copy_to_user(user_prog_tags[0], prog->tag, BPF_TAG_SIZE) Since fp->digest gets written but fp->tag gets read, will these functions access uninitialized memory in the tag field? > vfree(dst); > return 0; > }
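On the union layout specifically: because digest and tag share storage, a read of prog->tag after sha256() fills prog->digest sees the first BPF_TAG_SIZE bytes of the new digest, not uninitialized memory; what changes is the tag's value (a truncated SHA-256 rather than a truncated SHA-1). A self-contained user-space illustration of that aliasing (demo code, not kernel code):

#include <stdio.h>
#include <string.h>

#define BPF_TAG_SIZE		8
#define SHA256_DIGEST_SIZE	32

struct example_prog {
	union {
		unsigned char digest[SHA256_DIGEST_SIZE];
		unsigned char tag[BPF_TAG_SIZE];
	};
};

int main(void)
{
	struct example_prog prog;
	unsigned char fake_digest[SHA256_DIGEST_SIZE];
	int i;

	/* Stand-in for sha256(): fill the digest with known bytes. */
	for (i = 0; i < SHA256_DIGEST_SIZE; i++)
		fake_digest[i] = (unsigned char)i;
	memcpy(prog.digest, fake_digest, sizeof(prog.digest));

	/* prog.tag aliases the first BPF_TAG_SIZE bytes of prog.digest. */
	for (i = 0; i < BPF_TAG_SIZE; i++)
		printf("%02x", prog.tag[i]);
	printf("\n");	/* prints 0001020304050607 */

	return 0;
}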