From: "Garg, Shivank" <shivankg@amd.com>
To: seanjc@google.com, david@redhat.com, vbabka@suse.cz,
willy@infradead.org, akpm@linux-foundation.org, shuah@kernel.org,
pbonzini@redhat.com, brauner@kernel.org, viro@zeniv.linux.org.uk
Cc: ackerleytng@google.com, paul@paul-moore.com, jmorris@namei.org,
serge@hallyn.com, pvorel@suse.cz, bfoster@redhat.com,
tabba@google.com, vannapurve@google.com, chao.gao@intel.com,
bharata@amd.com, nikunj@amd.com, michael.day@amd.com,
shdhiman@amd.com, yan.y.zhao@intel.com, Neeraj.Upadhyay@amd.com,
thomas.lendacky@amd.com, michael.roth@amd.com, aik@amd.com,
jgg@nvidia.com, kalyazin@amazon.com, peterx@redhat.com,
jack@suse.cz, rppt@kernel.org, hch@infradead.org,
cgzones@googlemail.com, ira.weiny@intel.com, rientjes@google.com,
roypat@amazon.co.uk, ziy@nvidia.com, matthew.brost@intel.com,
joshua.hahnjy@gmail.com, rakie.kim@sk.com, byungchul@sk.com,
gourry@gourry.net, kent.overstreet@linux.dev,
ying.huang@linux.alibaba.com, apopple@nvidia.com,
chao.p.peng@intel.com, amit@infradead.org, ddutile@redhat.com,
dan.j.williams@intel.com, ashish.kalra@amd.com, gshan@redhat.com,
jgowans@amazon.com, pankaj.gupta@amd.com, papaluri@amd.com,
yuzhao@google.com, suzuki.poulose@arm.com,
quic_eberman@quicinc.com, aneeshkumar.kizhakeveetil@arm.com,
linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
linux-kernel@vger.kernel.org,
linux-security-module@vger.kernel.org, kvm@vger.kernel.org,
linux-kselftest@vger.kernel.org, linux-coco@lists.linux.dev
Subject: Re: [PATCH RFC V10 5/7] KVM: guest_memfd: Add slab-allocated inode cache
Date: Wed, 13 Aug 2025 11:40:50 +0530 [thread overview]
Message-ID: <e7f7703d-fe76-4ab2-bef4-8d4c54da03ad@amd.com> (raw)
In-Reply-To: <20250811090605.16057-11-shivankg@amd.com>
On 8/11/2025 2:36 PM, Shivank Garg wrote:
> Add dedicated inode structure (kvm_gmem_inode_info) and slab-allocated
> inode cache for guest memory backing, similar to how shmem handles inodes.
>
> This adds the necessary allocation/destruction functions and prepares
> for upcoming guest_memfd NUMA policy support changes.
>
> Signed-off-by: Shivank Garg <shivankg@amd.com>
> ---
> virt/kvm/guest_memfd.c | 69 ++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 67 insertions(+), 2 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 0e93323fc839..d9c23401e770 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -17,6 +17,15 @@ struct kvm_gmem {
> struct list_head entry;
> };
>
> +struct kvm_gmem_inode_info {
> + struct inode vfs_inode;
> +};
> +
> +static inline struct kvm_gmem_inode_info *KVM_GMEM_I(struct inode *inode)
> +{
> + return container_of(inode, struct kvm_gmem_inode_info, vfs_inode);
> +}
> +
> /**
> * folio_file_pfn - like folio_file_page, but return a pfn.
> * @folio: The folio which contains this index.
> @@ -389,13 +398,46 @@ static struct file_operations kvm_gmem_fops = {
> .fallocate = kvm_gmem_fallocate,
> };
>
> +static struct kmem_cache *kvm_gmem_inode_cachep;
> +
> +static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
> +{
> + struct kvm_gmem_inode_info *info;
> +
> + info = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
> + if (!info)
> + return NULL;
> +
> + return &info->vfs_inode;
> +}
> +
> +static void kvm_gmem_destroy_inode(struct inode *inode)
> +{
> +}
> +
> +static void kvm_gmem_free_inode(struct inode *inode)
> +{
> + kmem_cache_free(kvm_gmem_inode_cachep, KVM_GMEM_I(inode));
> +}
> +
> +static const struct super_operations kvm_gmem_super_operations = {
> + .statfs = simple_statfs,
> + .alloc_inode = kvm_gmem_alloc_inode,
> + .destroy_inode = kvm_gmem_destroy_inode,
> + .free_inode = kvm_gmem_free_inode,
> +};
> +
> static int kvm_gmem_init_fs_context(struct fs_context *fc)
> {
> + struct pseudo_fs_context *ctx;
> +
> if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
> return -ENOMEM;
>
> fc->s_iflags |= SB_I_NOEXEC;
> fc->s_iflags |= SB_I_NODEV;
> + ctx = fc->fs_private;
> + ctx->ops = &kvm_gmem_super_operations;
>
> return 0;
> }
> @@ -417,17 +459,40 @@ static int kvm_gmem_init_mount(void)
> return 0;
> }
>
> +static void kvm_gmem_init_inode(void *foo)
> +{
> + struct kvm_gmem_inode_info *info = foo;
> +
> + inode_init_once(&info->vfs_inode);
> +}
> +
> int kvm_gmem_init(struct module *module)
> {
> - kvm_gmem_fops.owner = module;
> + int ret;
> + struct kmem_cache_args args = {
> + .align = 0,
> + .ctor = kvm_gmem_init_inode,
> + };
>
> - return kvm_gmem_init_mount();
> + kvm_gmem_fops.owner = module;
> + kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
> + sizeof(struct kvm_gmem_inode_info),
> + &args, SLAB_ACCOUNT);
> + if (!kvm_gmem_inode_cachep)
> + return -ENOMEM;
> + ret = kvm_gmem_init_mount();
> + if (ret) {
> + kmem_cache_destroy(kvm_gmem_inode_cachep);
> + return ret;
> + }
> + return 0;
> }
>
> void kvm_gmem_exit(void)
> {
> kern_unmount(kvm_gmem_mnt);
> kvm_gmem_mnt = NULL;
> + kmem_cache_destroy(kvm_gmem_inode_cachep);
> }
While testing my code, I discovered a bug that occurs when unloading the kvm_amd module
after a guest_memfd-backed VM has run.
dmesg logs:
[ 610.075763] =============================================================================
[ 610.083933] BUG kvm_gmem_inode_cache (Not tainted): Objects remaining on __kmem_cache_shutdown()
[ 610.092711] -----------------------------------------------------------------------------
[ 610.102368] Object 0x000000008ee52a58 @offset=19200
[ 610.107247] Slab 0x000000004b1b088c objects=51 used=1 fp=0x000000007c55fc00 flags=0x57ffffc0000240(workingset|head|node=1|zone=2|lastcpupid=0x1fffff)
[ 610.120733] Disabling lock debugging due to kernel taint
[ 610.120741] ------------[ cut here ]------------
[ 610.120742] WARNING: CPU: 7 PID: 7554 at mm/slub.c:1171 __kmem_cache_shutdown+0x264/0x370
[ 610.120751] Modules linked in: xt_set ip_set xt_addrtype xfrm_user xfrm_algo xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_reject_ipv4 nft_compat nff_defrag_ipv4 nf_tables overlay bridge stp llc cfg80211 rfkill binfmt_misc ipmi_ssif amd_atl intel_rapl_msr wmi_bmof intel_rapl_common amd64_edac edac_mce_amdmem_helper drm_kms_helper i2c_piix4 ptdma i2c_smbus k10temp wmi acpi_power_meter ipmi_si acpi_ipmi ipmi_devintf ipmi_msghandler sg dm_multipath fuse drm dm_mo56 async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 sd_mod kvm_amd(-) ahci libahci kvm nvme tg3 libata ccp irqbypass nvme_c
[ 610.120831] CPU: 7 UID: 0 PID: 7554 Comm: rmmod Kdump: loaded Tainted: G B 6.16.0+ #10 PREEMPT(none)
[ 610.120835] Tainted: [B]=BAD_PAGE
[ 610.120836] Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.16.2 07/09/2024
[ 610.120838] RIP: 0010:__kmem_cache_shutdown+0x264/0x370
[ 610.120841] Code: 89 f1 4c 89 f6 4d 8b 46 20 48 c7 c7 08 08 ec 87 81 e2 ff 7f 00 00 e8 fb a7 d7 ff be 01 00 00 00 bf 05 00 00 00 e8 dc e9 cd ff <0f> 0b 48 fe ff ff
[ 610.120843] RSP: 0018:ffffcd6962963cb8 EFLAGS: 00010046
[ 610.120846] RAX: 0000000000000000 RBX: ffff89fde07d21c0 RCX: 0000000000000027
[ 610.120848] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff89fcbe5dbe80
[ 610.120850] RBP: ffff89fde07d21c0 R08: 0000000000000000 R09: 0000000000000003
[ 610.120851] R10: ffffcd6962963b58 R11: ffffffff889db908 R12: ffff89fdcccd7f80
[ 610.120852] R13: ffff89fdcccd0000 R14: fffff96802333400 R15: ffff89fdd6ab6c00
[ 610.120854] FS: 00007f066eaab080(0000) GS:ffff89fd3516f000(0000) knlGS:0000000000000000
[ 610.120856] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 610.120857] CR2: 00007ffefd577828 CR3: 0000000220406004 CR4: 0000000000770ef0
[ 610.120859] PKRU: 55555554
[ 610.120860] Call Trace:
[ 610.120862] <TASK>
[ 610.120866] kmem_cache_destroy+0x3a/0x150
[ 610.120872] kvm_exit+0x7b/0xa0 [kvm]
[ 610.120919] svm_exit+0x5/0x10 [kvm_amd]
[ 610.120926] __do_sys_delete_module.isra.0+0x18b/0x2e0
[ 610.120933] ? srso_alias_return_thunk+0x5/0xfbef5
[ 610.120937] ? syscall_trace_enter+0xfa/0x1a0
[ 610.120941] do_syscall_64+0x7b/0x2c0
[ 610.120947] ? srso_alias_return_thunk+0x5/0xfbef5
[ 610.120950] ? __handle_mm_fault+0x2aa/0x670
[ 610.120954] ? iterate_dir+0x11e/0x230
[ 610.120960] ? srso_alias_return_thunk+0x5/0xfbef5
[ 610.120963] ? count_memcg_events+0xb2/0x160
[ 610.120967] ? srso_alias_return_thunk+0x5/0xfbef5
[ 610.120969] ? handle_mm_fault+0xb2/0x2f0
[ 610.120972] ? srso_alias_return_thunk+0x5/0xfbef5
[ 610.120975] ? do_user_addr_fault+0x16f/0x6f0
[ 610.120981] ? srso_alias_return_thunk+0x5/0xfbef5
[ 610.120984] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 610.120986] RIP: 0033:0x7f066e12ac9b
[ 610.120989] Code: 73 01 c3 48 8b 0d 7d 81 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 b0 00 00 00 0f 05 <48> 3d 01 89 01 48
[ 610.120990] RSP: 002b:00007ffc629f1878 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
[ 610.120993] RAX: ffffffffffffffda RBX: 00005630e80256f0 RCX: 00007f066e12ac9b
[ 610.120994] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 00005630e8025758
[ 610.120996] RBP: 00007ffc629f18a0 R08: 1999999999999999 R09: 0000000000000000
[ 610.120997] R10: 00007f066e1b1fc0 R11: 0000000000000206 R12: 0000000000000000
[ 610.120999] R13: 00007ffc629f1af0 R14: 00005630e80256f0 R15: 0000000000000000
[ 610.121003] </TASK>
[ 610.121004] ---[ end trace 0000000000000000 ]---
[ 610.121017] ------------[ cut here ]------------
There is a race condition here:
kern_unmount() -> mntput() -> cleanup_mnt() -> deactivate_super() -> deactivate_locked_super() -> fs->kill_sb() (guest_memfd kill_sb) -> generic_shutdown_super() -> evict_inodes() -> destroy_inode() -> call_rcu()
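destroy_inode() defers the actual ->free_inode() call to an RCU callback, so the inode object can still be allocated in the slab when kmem_cache_destroy() runs. I need to wait for pending RCU callbacks to finish before calling kmem_cache_destroy().
To fix this, I added rcu_barrier(), as dax_fs_exit() does.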
@@ -561,6 +566,7 @@ void kvm_gmem_exit(void)
{
kern_unmount(kvm_gmem_mnt);
kvm_gmem_mnt = NULL;
+ rcu_barrier();
kmem_cache_destroy(kvm_gmem_inode_cachep);
}
I'll incorporate this fix into the next version.
Thanks,
Shivank
next prev parent reply other threads:[~2025-08-13 6:11 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-08-11 9:05 [PATCH RFC V10 0/7] Add NUMA mempolicy support for KVM guest-memfd Shivank Garg
2025-08-11 9:06 ` [PATCH RFC V10 1/7] mm/filemap: Add NUMA mempolicy support to filemap_alloc_folio() Shivank Garg
2025-08-11 9:06 ` [PATCH RFC V10 2/7] mm/filemap: Extend __filemap_get_folio() to support NUMA memory policies Shivank Garg
2025-08-11 9:06 ` [PATCH RFC V10 3/7] mm/mempolicy: Export memory policy symbols Shivank Garg
2025-08-11 9:06 ` [PATCH RFC V10 4/7] KVM: guest_memfd: Use guest mem inodes instead of anonymous inodes Shivank Garg
2025-08-11 15:33 ` David Hildenbrand
2025-08-11 21:23 ` Ackerley Tng
2025-08-13 5:37 ` Garg, Shivank
2025-08-11 9:06 ` [PATCH RFC V10 5/7] KVM: guest_memfd: Add slab-allocated inode cache Shivank Garg
2025-08-13 6:10 ` Garg, Shivank [this message]
2025-08-11 9:06 ` [PATCH RFC V10 6/7] KVM: guest_memfd: Enforce NUMA mempolicy using shared policy Shivank Garg
2025-08-11 9:06 ` [PATCH RFC V10 7/7] KVM: guest_memfd: selftests: Add tests for mmap and NUMA policy support Shivank Garg
2025-08-11 14:34 ` [PATCH RFC V10 0/7] Add NUMA mempolicy support for KVM guest-memfd Sean Christopherson
2025-08-11 14:41 ` David Hildenbrand
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=e7f7703d-fe76-4ab2-bef4-8d4c54da03ad@amd.com \
--to=shivankg@amd.com \
--cc=Neeraj.Upadhyay@amd.com \
--cc=ackerleytng@google.com \
--cc=aik@amd.com \
--cc=akpm@linux-foundation.org \
--cc=amit@infradead.org \
--cc=aneeshkumar.kizhakeveetil@arm.com \
--cc=apopple@nvidia.com \
--cc=ashish.kalra@amd.com \
--cc=bfoster@redhat.com \
--cc=bharata@amd.com \
--cc=brauner@kernel.org \
--cc=byungchul@sk.com \
--cc=cgzones@googlemail.com \
--cc=chao.gao@intel.com \
--cc=chao.p.peng@intel.com \
--cc=dan.j.williams@intel.com \
--cc=david@redhat.com \
--cc=ddutile@redhat.com \
--cc=gourry@gourry.net \
--cc=gshan@redhat.com \
--cc=hch@infradead.org \
--cc=ira.weiny@intel.com \
--cc=jack@suse.cz \
--cc=jgg@nvidia.com \
--cc=jgowans@amazon.com \
--cc=jmorris@namei.org \
--cc=joshua.hahnjy@gmail.com \
--cc=kalyazin@amazon.com \
--cc=kent.overstreet@linux.dev \
--cc=kvm@vger.kernel.org \
--cc=linux-coco@lists.linux.dev \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-security-module@vger.kernel.org \
--cc=matthew.brost@intel.com \
--cc=michael.day@amd.com \
--cc=michael.roth@amd.com \
--cc=nikunj@amd.com \
--cc=pankaj.gupta@amd.com \
--cc=papaluri@amd.com \
--cc=paul@paul-moore.com \
--cc=pbonzini@redhat.com \
--cc=peterx@redhat.com \
--cc=pvorel@suse.cz \
--cc=quic_eberman@quicinc.com \
--cc=rakie.kim@sk.com \
--cc=rientjes@google.com \
--cc=roypat@amazon.co.uk \
--cc=rppt@kernel.org \
--cc=seanjc@google.com \
--cc=serge@hallyn.com \
--cc=shdhiman@amd.com \
--cc=shuah@kernel.org \
--cc=suzuki.poulose@arm.com \
--cc=tabba@google.com \
--cc=thomas.lendacky@amd.com \
--cc=vannapurve@google.com \
--cc=vbabka@suse.cz \
--cc=viro@zeniv.linux.org.uk \
--cc=willy@infradead.org \
--cc=yan.y.zhao@intel.com \
--cc=ying.huang@linux.alibaba.com \
--cc=yuzhao@google.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox