From: Chen Haixiang <chenhaixiang3@huawei.com>
To: <linux-mm@kvack.org>, <akpm@linux-foundation.org>, <hughd@google.com>
Cc: <louhongxiang@huawei.com>, <wangbin224@huawei.com>,
<liuyuntao10@huawei.com>, <chenhaixiang3@huawei.com>
Subject: [PATCH] mm/shmem: add no_split option to avoid splitting tmpfs hugepage PMD on COW
Date: Wed, 10 Jan 2024 17:20:28 +0800
Message-ID: <20240110092028.1777-1-chenhaixiang3@huawei.com>
Transparent hugepages in tmpfs improve TLB efficiency by reducing
TLB misses. However, a write fault on a private (Copy-On-Write)
mapping of such a hugepage currently splits the PMD and performs
the copy at PTE granularity, losing the huge mapping. In some
scenarios it is desirable to keep the PMD mapping intact.

Introduce a shmem_huge_fault handler that performs the COW at PMD
granularity by copying the whole hugepage into a freshly allocated
one, together with a tmpfs mount parameter, no_split, to enable or
disable this behavior.
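For example (mount point and THP settings are illustrative):

  # mount -t tmpfs -o huge=always,no_split=1 tmpfs /mnt/tmpfs

With no_split=1, a write to a private mapping of a PMD-mapped tmpfs
hugepage is served by copying to a new hugepage rather than by a PMD
split, so the ShmemPmdMapped counter in /proc/meminfo should stay
constant across such writes instead of dropping.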
Signed-off-by: Chen Haixiang <chenhaixiang3@huawei.com>
---
include/linux/mm.h | 1 +
include/linux/shmem_fs.h | 1 +
mm/memory.c | 7 ++++
mm/shmem.c | 102 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 111 insertions(+)
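Not part of the patch: a minimal userspace sketch of the fault
sequence this patch intercepts. It assumes /mnt/tmpfs is mounted with
huge=always,no_split=1 and that shmem_get_unmapped_area returns a
PMD-aligned mapping:

  #include <fcntl.h>
  #include <sys/mman.h>
  #include <unistd.h>

  #define SZ (2UL << 20) /* one PMD-sized hugepage on x86_64 */

  int main(void)
  {
          int fd = open("/mnt/tmpfs/file", O_RDWR | O_CREAT, 0600);
          char *p;
          volatile char c;

          if (fd < 0 || ftruncate(fd, SZ))
                  return 1;
          p = mmap(NULL, SZ, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
          if (p == MAP_FAILED)
                  return 1;
          c = p[0];     /* read fault: hugepage becomes PMD-mapped read-only */
          p[0] = c + 1; /* write fault: COW, taken over by shmem_huge_fault */
          munmap(p, SZ);
          close(fd);
          return 0;
  }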
diff --git a/include/linux/mm.h b/include/linux/mm.h
index da5219b48d52..eb44574965d6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -573,6 +573,7 @@ struct vm_operations_struct {
unsigned long end, unsigned long newflags);
vm_fault_t (*fault)(struct vm_fault *vmf);
vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
+ vm_fault_t (*shmem_huge_fault)(struct vm_fault *vmf, pmd_t orig_pmd);
vm_fault_t (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
unsigned long (*pagesize)(struct vm_area_struct * area);
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 2caa6b86106a..4484f2f33afe 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -73,6 +73,7 @@ struct shmem_sb_info {
struct list_head shrinklist; /* List of shinkable inodes */
unsigned long shrinklist_len; /* Length of shrinklist */
struct shmem_quota_limits qlimits; /* Default quota limits */
+ unsigned int no_split; /* Do not split ShmemPmdMapped hugepages in tmpfs */
};
static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
diff --git a/mm/memory.c b/mm/memory.c
index 5c757fba8858..7d27a6b5e69f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4942,6 +4942,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
}
+ if (vmf->vma->vm_ops->shmem_huge_fault) {
+ vm_fault_t ret = vmf->vma->vm_ops->shmem_huge_fault(vmf, vmf->orig_pmd);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
+
split:
/* COW or write-notify handled on pte level: split pmd. */
__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
diff --git a/mm/shmem.c b/mm/shmem.c
index 0d1ce70bce38..8211211f7405 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -118,6 +118,7 @@ struct shmem_options {
umode_t mode;
bool full_inums;
int huge;
+ unsigned int no_split;
int seen;
bool noswap;
unsigned short quota_types;
@@ -128,6 +129,7 @@ struct shmem_options {
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_NOSWAP 16
#define SHMEM_SEEN_QUOTA 32
+#define SHMEM_SEEN_NO_SPLIT 64
};
#ifdef CONFIG_TMPFS
@@ -2238,6 +2240,94 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
return ret;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static vm_fault_t shmem_huge_fault(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+        vm_fault_t ret = VM_FAULT_FALLBACK;
+        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+        struct folio *old_folio, *new_folio;
+        pmd_t entry;
+        gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_COMP;
+        struct vm_area_struct *vma = vmf->vma;
+        struct inode *inode = file_inode(vma->vm_file);
+        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+        if (!sbinfo->no_split)
+                return VM_FAULT_FALLBACK;
+
+        /* Only a write fault on a private mapping needs a PMD-level COW. */
+        if (!(vmf->flags & FAULT_FLAG_WRITE) || (vma->vm_flags & VM_SHARED))
+                return VM_FAULT_FALLBACK;
+
+        new_folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
+        if (!new_folio) {
+                count_vm_event(THP_FAULT_FALLBACK);
+                return VM_FAULT_FALLBACK;
+        }
+        /* Charge the new hugepage before it becomes visible. */
+        if (mem_cgroup_charge(new_folio, vma->vm_mm, gfp)) {
+                folio_put(new_folio);
+                count_vm_event(THP_FAULT_FALLBACK);
+                return VM_FAULT_FALLBACK;
+        }
+        __folio_set_locked(new_folio);
+        __folio_set_swapbacked(new_folio);
+
+        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+        if (pmd_none(*vmf->pmd)) {
+                ret = VM_FAULT_FALLBACK;
+                goto out;
+        }
+        if (!pmd_same(*vmf->pmd, orig_pmd)) {
+                /* Raced with another fault; it has been handled. */
+                ret = 0;
+                goto out;
+        }
+        old_folio = page_folio(pmd_page(*vmf->pmd));
+        folio_get(old_folio);
+        spin_unlock(vmf->ptl);
+
+        /*
+         * Copy with the page table lock dropped: copying a PMD-sized
+         * folio may reschedule between subpages.
+         */
+        copy_user_large_folio(new_folio, old_folio, haddr, vma);
+        __folio_mark_uptodate(new_folio);
+
+        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+        if (!pmd_same(*vmf->pmd, orig_pmd)) {
+                folio_put(old_folio);
+                ret = 0;
+                goto out;
+        }
+        /* Still PMD-mapped; drop our temporary reference. */
+        folio_put(old_folio);
+
+        page_remove_rmap(&old_folio->page, vma, true);
+        pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
+
+        flush_icache_pages(vma, &new_folio->page, HPAGE_PMD_NR);
+        entry = mk_huge_pmd(&new_folio->page, vma->vm_page_prot);
+        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+
+        page_add_file_rmap(&new_folio->page, vma, true);
+        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+        update_mmu_cache_pmd(vma, haddr, vmf->pmd);
+        count_vm_event(THP_FILE_MAPPED);
+        spin_unlock(vmf->ptl);
+
+        folio_unlock(new_folio);
+        /* Drop the reference the old PMD mapping held. */
+        folio_put(old_folio);
+        return 0;
+
+out:
+        spin_unlock(vmf->ptl);
+        folio_unlock(new_folio);
+        folio_put(new_folio);
+        return ret;
+}
+#endif
+
unsigned long shmem_get_unmapped_area(struct file *file,
unsigned long uaddr, unsigned long len,
unsigned long pgoff, unsigned long flags)
@@ -3869,6 +3944,7 @@ enum shmem_param {
Opt_usrquota_inode_hardlimit,
Opt_grpquota_block_hardlimit,
Opt_grpquota_inode_hardlimit,
+ Opt_no_split,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3900,6 +3976,7 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
#endif
+ fsparam_u32 ("no_split", Opt_no_split),
{}
};
@@ -4065,6 +4142,10 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
"Group quota inode hardlimit too large.");
ctx->qlimits.grpquota_ihardlimit = size;
break;
+ case Opt_no_split:
+ ctx->no_split = result.uint_32;
+ ctx->seen |= SHMEM_SEEN_NO_SPLIT;
+ break;
}
return 0;
@@ -4261,6 +4342,8 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
if (sbinfo->huge)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
+ if (sbinfo->no_split)
+ seq_printf(seq, ",no_split=%u", sbinfo->no_split);
#endif
mpol = shmem_get_sbmpol(sbinfo);
shmem_show_mpol(seq, mpol);
@@ -4315,6 +4398,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
if (!(ctx->seen & SHMEM_SEEN_INUMS))
ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
sbinfo->noswap = ctx->noswap;
+ sbinfo->no_split = ctx->no_split;
} else {
sb->s_flags |= SB_NOUSER;
}
@@ -4568,6 +4652,9 @@ static const struct super_operations shmem_ops = {
static const struct vm_operations_struct shmem_vm_ops = {
.fault = shmem_fault,
.map_pages = filemap_map_pages,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ .shmem_huge_fault = shmem_huge_fault,
+#endif
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
--
2.33.0
Thread overview: 5+ messages
2024-01-10 9:20 Chen Haixiang [this message]
2024-01-10 12:00 ` David Hildenbrand
2024-01-10 12:44 ` Matthew Wilcox
2024-01-11 0:03 ` kernel test robot
2024-01-11 2:10 ` kernel test robot