linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Huang Shijie <huangsj@hygon.cn>
To: <akpm@linux-foundation.org>, <viro@zeniv.linux.org.uk>,
	<brauner@kernel.org>
Cc: <linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>,
	<linux-arm-kernel@lists.infradead.org>,
	<linux-fsdevel@vger.kernel.org>, <muchun.song@linux.dev>,
	<osalvador@suse.de>, <linux-trace-kernel@vger.kernel.org>,
	<linux-perf-users@vger.kernel.org>,
	<linux-parisc@vger.kernel.org>, <nvdimm@lists.linux.dev>,
	<zhongyuan@hygon.cn>, <fangbaoshun@hygon.cn>,
	<yingzhiwei@hygon.cn>, Huang Shijie <huangsj@hygon.cn>
Subject: [PATCH 3/3] mm: split the file's i_mmap tree for NUMA
Date: Mon, 13 Apr 2026 14:20:42 +0800	[thread overview]
Message-ID: <20260413062042.804-4-huangsj@hygon.cn> (raw)
In-Reply-To: <20260413062042.804-1-huangsj@hygon.cn>

  In a NUMA system, there may be many NUMA nodes and many CPUs.
For example, a Hygon server has 12 NUMA nodes and 384 CPUs.
In the UnixBench tests, there is a test "execl" which tests
the execve system call.

  When we test our server with "./Run -c 384 execl",
the test result is not good enough. The i_mmap lock is contended heavily on
"libc.so" and "ld.so". For example, the i_mmap tree for "libc.so" can have
over 6000 VMAs, and the VMAs can be on different NUMA nodes.
The insert/remove operations do not run quickly enough.

 In order to reduce contention on the i_mmap lock, this patch does the
following:
   1.) Split the single i_mmap tree into several sibling trees:
       Each NUMA node has a tree.
   2.) Introduce a new field "tree_idx" for vm_area_struct to save the
       sibling tree index for this VMA.
   3.) Introduce a new field "vma_count" for address_space.
       The new mapping_mapped() will use it.
   4.) Rewrite the vma_interval_tree_foreach() for NUMA.

 After this patch, the VMA insert/remove operations work faster,
and we get a 77% performance improvement (average of 10 runs)
with the above test.

Signed-off-by: Huang Shijie <huangsj@hygon.cn>
---
 fs/inode.c               | 55 +++++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h       | 35 +++++++++++++++++++++++++
 include/linux/mm.h       | 32 +++++++++++++++++++++++
 include/linux/mm_types.h |  1 +
 mm/mmap.c                |  3 ++-
 mm/nommu.c               |  6 +++--
 mm/vma.c                 | 34 +++++++++++++++++++------
 mm/vma_init.c            |  1 +
 8 files changed, 155 insertions(+), 12 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index cc12b68e021b..3067cb2558da 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -215,6 +215,56 @@ static int no_open(struct inode *inode, struct file *file)
 	return -ENXIO;
 }
 
+#ifdef CONFIG_NUMA
+static void free_mapping_i_mmap(struct address_space *mapping)
+{
+	int i;
+
+	if (!mapping->i_mmap)
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		kfree(mapping->i_mmap[i]);
+
+	kfree(mapping->i_mmap);
+	mapping->i_mmap = NULL;
+}
+
+static int init_mapping_i_mmap(struct address_space *mapping)
+{
+	struct rb_root_cached *root;
+	int i;
+
+	/* The extra one is used as terminator in vma_interval_tree_foreach() */
+	mapping->i_mmap = kzalloc(sizeof(root) * (nr_node_ids + 1), GFP_KERNEL);
+	if (!mapping->i_mmap)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		root = kzalloc_node(sizeof(*root), GFP_KERNEL, i);
+		if (!root)
+			goto no_mem;
+
+		*root = RB_ROOT_CACHED;
+		mapping->i_mmap[i] = root;
+	}
+	return 0;
+
+no_mem:
+	free_mapping_i_mmap(mapping);
+	return -ENOMEM;
+}
+#else
+static int init_mapping_i_mmap(struct address_space *mapping)
+{
+	mapping->i_mmap = RB_ROOT_CACHED;
+	return 0;
+}
+static void free_mapping_i_mmap(struct address_space *mapping)
+{
+}
+#endif
+
 /**
  * inode_init_always_gfp - perform inode structure initialisation
  * @sb: superblock inode belongs to
@@ -307,6 +357,9 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp
 	if (unlikely(security_inode_alloc(inode, gfp)))
 		return -ENOMEM;
 
+	if (init_mapping_i_mmap(mapping))
+		return -ENOMEM;
+
 	this_cpu_inc(nr_inodes);
 
 	return 0;
@@ -383,6 +436,7 @@ void __destroy_inode(struct inode *inode)
 	if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
 		posix_acl_release(inode->i_default_acl);
 #endif
+	free_mapping_i_mmap(&inode->i_data);
 	this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
@@ -486,7 +540,6 @@ static void __address_space_init_once(struct address_space *mapping)
 	init_rwsem(&mapping->i_mmap_rwsem);
 	INIT_LIST_HEAD(&mapping->i_private_list);
 	spin_lock_init(&mapping->i_private_lock);
-	mapping->i_mmap = RB_ROOT_CACHED;
 }
 
 void address_space_init_once(struct address_space *mapping)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a6a99e044265..34064c1cbd10 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -477,7 +477,12 @@ struct address_space {
 	/* number of thp, only for non-shmem files */
 	atomic_t		nr_thps;
 #endif
+#ifdef CONFIG_NUMA
+	struct rb_root_cached	**i_mmap;
+	unsigned long		vma_count;
+#else
 	struct rb_root_cached	i_mmap;
+#endif
 	unsigned long		nrpages;
 	pgoff_t			writeback_index;
 	const struct address_space_operations *a_ops;
@@ -547,6 +552,27 @@ static inline void i_mmap_assert_write_locked(struct address_space *mapping)
 	lockdep_assert_held_write(&mapping->i_mmap_rwsem);
 }
 
+#ifdef CONFIG_NUMA
+static inline int mapping_mapped(const struct address_space *mapping)
+{
+	return	READ_ONCE(mapping->vma_count);
+}
+
+static inline void inc_mapping_vma(struct address_space *mapping)
+{
+	mapping->vma_count++;
+}
+
+static inline void dec_mapping_vma(struct address_space *mapping)
+{
+	mapping->vma_count--;
+}
+
+static inline struct rb_root_cached *get_i_mmap_root(struct address_space *mapping)
+{
+	return (struct rb_root_cached *)mapping->i_mmap;
+}
+#else
 /*
  * Might pages of this file be mapped into userspace?
  */
@@ -555,10 +581,19 @@ static inline int mapping_mapped(const struct address_space *mapping)
 	return	!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root);
 }
 
+static inline void inc_mapping_vma(struct address_space *mapping)
+{
+}
+
+static inline void dec_mapping_vma(struct address_space *mapping)
+{
+}
+
 static inline struct rb_root_cached *get_i_mmap_root(struct address_space *mapping)
 {
 	return &mapping->i_mmap;
 }
+#endif
 
 /*
  * Might pages of this file have been modified in userspace?
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 15cb1da43eb2..c7f26eb34322 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -913,6 +913,9 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_ops = &vma_dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma_lock_init(vma, false);
+#ifdef CONFIG_NUMA
+	vma->tree_idx = numa_node_id();
+#endif
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
@@ -3783,6 +3786,8 @@ extern atomic_long_t mmap_pages_allocated;
 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
 
 /* interval_tree.c */
+struct rb_root_cached *get_rb_root(struct vm_area_struct *vma,
+					struct address_space *mapping);
 void vma_interval_tree_insert(struct vm_area_struct *node,
 			      struct rb_root_cached *root);
 void vma_interval_tree_insert_after(struct vm_area_struct *node,
@@ -3798,9 +3803,36 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
 				unsigned long start, unsigned long last);
 
 /* Please use get_i_mmap_root() to get the @root */
+#ifdef CONFIG_NUMA
+/* Find the first valid VMA in the sibling trees */
+static inline struct vm_area_struct *first_vma(struct rb_root_cached ***__r,
+				unsigned long start, unsigned long last)
+{
+	struct vm_area_struct *vma = NULL;
+	struct rb_root_cached **tree = *__r;
+
+	while (*tree) {
+		vma = vma_interval_tree_iter_first(*tree++, start, last);
+		if (vma)
+			break;
+	}
+
+	/* Save for the next loop */
+	*__r = tree;
+	return vma;
+}
+
+/* @_tmp is referenced to avoid unused variable warning. */
+#define vma_interval_tree_foreach(vma, root, start, last)		\
+	for (struct rb_root_cached **_r = (void *)(root),		\
+		**_tmp = (vma = first_vma(&_r, start, last)) ? _r : NULL;\
+	     ((_tmp && vma) || (vma = first_vma(&_r, start, last)));	\
+		vma = vma_interval_tree_iter_next(vma, start, last))
+#else
 #define vma_interval_tree_foreach(vma, root, start, last)		\
 	for (vma = vma_interval_tree_iter_first(root, start, last);	\
 	     vma; vma = vma_interval_tree_iter_next(vma, start, last))
+#endif
 
 void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
 				   struct rb_root_cached *root);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cc8ae722886..4982e20ce27c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -984,6 +984,7 @@ struct vm_area_struct {
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
+	int tree_idx;			/* The sibling tree index for the VMA */
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
diff --git a/mm/mmap.c b/mm/mmap.c
index 5b0671dff019..81a2f4932ca8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1832,8 +1832,9 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
 			vma_interval_tree_insert_after(tmp, mpnt,
-					get_i_mmap_root(mapping));
+					get_rb_root(mpnt, mapping));
 			flush_dcache_mmap_unlock(mapping);
+			inc_mapping_vma(mapping);
 			i_mmap_unlock_write(mapping);
 		}
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 2e64b6c4c539..6553cfcb6683 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -569,8 +569,9 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
 
 		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
-		vma_interval_tree_insert(vma, get_i_mmap_root(mapping));
+		vma_interval_tree_insert(vma, get_rb_root(vma, mapping));
 		flush_dcache_mmap_unlock(mapping);
+		inc_mapping_vma(mapping);
 		i_mmap_unlock_write(mapping);
 	}
 }
@@ -585,8 +586,9 @@ static void cleanup_vma_from_mm(struct vm_area_struct *vma)
 
 		i_mmap_lock_write(mapping);
 		flush_dcache_mmap_lock(mapping);
-		vma_interval_tree_remove(vma, get_i_mmap_root(mapping));
+		vma_interval_tree_remove(vma, get_rb_root(vma, mapping));
 		flush_dcache_mmap_unlock(mapping);
+		dec_mapping_vma(mapping);
 		i_mmap_unlock_write(mapping);
 	}
 }
diff --git a/mm/vma.c b/mm/vma.c
index 1768e4355a13..5aa3915d183b 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -224,6 +224,16 @@ static bool can_vma_merge_after(struct vma_merge_struct *vmg)
 	return false;
 }
 
+struct rb_root_cached *get_rb_root(struct vm_area_struct *vma,
+					struct address_space *mapping)
+{
+#ifdef CONFIG_NUMA
+	return mapping->i_mmap[vma->tree_idx];
+#else
+	return &mapping->i_mmap;
+#endif
+}
+
 static void __vma_link_file(struct vm_area_struct *vma,
 			    struct address_space *mapping)
 {
@@ -231,8 +241,9 @@ static void __vma_link_file(struct vm_area_struct *vma,
 		mapping_allow_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
-	vma_interval_tree_insert(vma, get_i_mmap_root(mapping));
+	vma_interval_tree_insert(vma, get_rb_root(vma, mapping));
 	flush_dcache_mmap_unlock(mapping);
+	inc_mapping_vma(mapping);
 }
 
 /*
@@ -245,8 +256,9 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		mapping_unmap_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
-	vma_interval_tree_remove(vma, get_i_mmap_root(mapping));
+	vma_interval_tree_remove(vma, get_rb_root(vma, mapping));
 	flush_dcache_mmap_unlock(mapping);
+	dec_mapping_vma(mapping);
 }
 
 /*
@@ -317,10 +329,13 @@ static void vma_prepare(struct vma_prepare *vp)
 	if (vp->file) {
 		flush_dcache_mmap_lock(vp->mapping);
 		vma_interval_tree_remove(vp->vma,
-					get_i_mmap_root(vp->mapping));
-		if (vp->adj_next)
+					get_rb_root(vp->vma, vp->mapping));
+		dec_mapping_vma(vp->mapping);
+		if (vp->adj_next) {
 			vma_interval_tree_remove(vp->adj_next,
-					get_i_mmap_root(vp->mapping));
+					get_rb_root(vp->adj_next, vp->mapping));
+			dec_mapping_vma(vp->mapping);
+		}
 	}
 
 }
@@ -337,11 +352,14 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
 			 struct mm_struct *mm)
 {
 	if (vp->file) {
-		if (vp->adj_next)
+		if (vp->adj_next) {
 			vma_interval_tree_insert(vp->adj_next,
-					get_i_mmap_root(vp->mapping));
+					get_rb_root(vp->adj_next, vp->mapping));
+			inc_mapping_vma(vp->mapping);
+		}
 		vma_interval_tree_insert(vp->vma,
-					get_i_mmap_root(vp->mapping));
+					get_rb_root(vp->vma, vp->mapping));
+		inc_mapping_vma(vp->mapping);
 		flush_dcache_mmap_unlock(vp->mapping);
 	}
 
diff --git a/mm/vma_init.c b/mm/vma_init.c
index 3c0b65950510..5735868b1ad4 100644
--- a/mm/vma_init.c
+++ b/mm/vma_init.c
@@ -71,6 +71,7 @@ static void vm_area_init_from(const struct vm_area_struct *src,
 #endif
 #ifdef CONFIG_NUMA
 	dest->vm_policy = src->vm_policy;
+	dest->tree_idx = src->tree_idx;
 #endif
 #ifdef __HAVE_PFNMAP_TRACKING
 	dest->pfnmap_track_ctx = NULL;
-- 
2.43.0




      parent reply	other threads:[~2026-04-13  6:21 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-13  6:20 [PATCH 0/3] " Huang Shijie
2026-04-13  6:20 ` [PATCH 1/3] mm: use mapping_mapped to simplify the code Huang Shijie
2026-04-13  6:20 ` [PATCH 2/3] mm: use get_i_mmap_root to access the file's i_mmap Huang Shijie
2026-04-13  6:20 ` Huang Shijie [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260413062042.804-4-huangsj@hygon.cn \
    --to=huangsj@hygon.cn \
    --cc=akpm@linux-foundation.org \
    --cc=brauner@kernel.org \
    --cc=fangbaoshun@hygon.cn \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-parisc@vger.kernel.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=nvdimm@lists.linux.dev \
    --cc=osalvador@suse.de \
    --cc=viro@zeniv.linux.org.uk \
    --cc=yingzhiwei@hygon.cn \
    --cc=zhongyuan@hygon.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox